tests/checkers/unittest_unicode/unittest_functions.py - third_party/github.com/pylint-dev/pylint - Git at Google

 # Licensed under the GPL: https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
 # For details: https://github.com/pylint-dev/pylint/blob/main/LICENSE
 # Copyright (c) https://github.com/pylint-dev/pylint/blob/main/CONTRIBUTORS.txt

 from __future__ import annotations

 import itertools
 from pathlib import Path

 import pytest

 import pylint.checkers.unicode

 # Lookup table for byte input: the UTF-8 encoding of each bad character is
 # mapped to the BAD_CHARS entry it was taken from.
 SEARCH_DICT_BYTE_UTF8 = {
     bad_char.unescaped.encode("utf-8"): bad_char
     for bad_char in pylint.checkers.unicode.BAD_CHARS
 }


 @pytest.mark.parametrize(
     "line, expected, search_dict",
     [
         # Carriage-return special cases: the "\r" of a trailing "\r\n" is
         # expected to pass, a "\r" anywhere else is expected to be reported.
         pytest.param(
             "valid windows\r\n",
             {},
             pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT,
             id="valid-windows",
         ),
         pytest.param(
             b"TOTO = ('Caf\xe9', 'Caf\xe9', 'Caf\xe9')\r\n",
             {},
             SEARCH_DICT_BYTE_UTF8,
             id="valid-windows-bytes",
         ),
         pytest.param(
             "invalid\r windows\r\n",
             {7: pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT["\r"]},
             pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT,
             id="invalid-carrier-return-windows",
         ),
         pytest.param(
             "invalid\r linux\n",
             {7: pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT["\r"]},
             pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT,
             id="invalid-carrier-return-linux",
         ),
         pytest.param(
             b"invalid\r windows\r\n",
             {7: pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT["\r"]},
             SEARCH_DICT_BYTE_UTF8,
             id="invalid-carrier-return-windows-bytes",
         ),
         pytest.param(
             b"invalid\r linux\n",
             {7: pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT["\r"]},
             SEARCH_DICT_BYTE_UTF8,
             id="invalid-carrier-return-linux-bytes",
         ),
         # Generated cases for every other bad character with a Linux line
         # ending; the character sits at index 7, right after "invalid" ...
         *(
             pytest.param(
                 f"invalid{char.unescaped} back\n",
                 {7: char},
                 pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT,
                 id=f"invalid-{char.name}-linux",
             )
             for char in pylint.checkers.unicode.BAD_CHARS
             if char.unescaped != "\r"
         ),
         # ... and the same lines as ASCII bytes. BAD_CHARS[:-1] leaves out
         # the last entry (presumably not ASCII-encodable -- confirm).
         *(
             pytest.param(
                 f"invalid{char.unescaped} back\n".encode("ASCII"),
                 {7: char},
                 SEARCH_DICT_BYTE_UTF8,
                 id=f"invalid-{char.name}-linux-bytes",
             )
             for char in pylint.checkers.unicode.BAD_CHARS[:-1]
             if char.unescaped != "\r"
         ),
         # Generated cases for every other bad character with a Windows
         # line ending ...
         *(
             pytest.param(
                 f"invalid{char.unescaped} back\r\n",
                 {7: char},
                 pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT,
                 id=f"invalid-{char.name}-windows",
             )
             for char in pylint.checkers.unicode.BAD_CHARS
             if char.unescaped != "\r"
         ),
         # ... and the same lines as ASCII bytes (again without the last
         # BAD_CHARS entry).
         *(
             pytest.param(
                 f"invalid{char.unescaped} back\r\n".encode("ASCII"),
                 {7: char},
                 SEARCH_DICT_BYTE_UTF8,
                 id=f"invalid-{char.name}-windows-bytes",
             )
             for char in pylint.checkers.unicode.BAD_CHARS[:-1]
             if char.unescaped != "\r"
         ),
     ],
 )
 def test_map_positions_to_result(
     line: pylint.checkers.unicode._StrLike,
     expected: dict[int, pylint.checkers.unicode._BadChar],
     search_dict: dict[
         pylint.checkers.unicode._StrLike, pylint.checkers.unicode._BadChar
     ],
 ) -> None:
     """Check the position-to-bad-character mapping for str and bytes lines."""
     # The newline marker handed over must be of the same type as the line.
     newline = b"\n" if isinstance(line, bytes) else "\n"
     found = pylint.checkers.unicode._map_positions_to_result(
         line, search_dict, new_line=newline
     )
     assert found == expected


 @pytest.mark.parametrize(
     "line",
     [
         # Each sample comes as str and as bytes.
         pytest.param("1234567890", id="no_line_ending"),
         pytest.param(b"1234567890", id="no_line_ending_byte"),
         pytest.param("1234567890\n", id="linux"),
         pytest.param(b"1234567890\n", id="linux_byte"),
         pytest.param("1234567890\r\n", id="windows"),
         pytest.param(b"1234567890\r\n", id="windows_byte"),
         # Only eight digits here: the reversed ending (LF before CR) is
         # expected to make up the remaining two counted characters.
         pytest.param("12345678\n\r", id="wrong_order"),
         pytest.param(b"12345678\n\r", id="wrong_order_byte"),
     ],
 )
 def test_line_length(line: pylint.checkers.unicode._StrLike) -> None:
     """Every sample must be measured as ten characters long for UTF-8."""
     measured = pylint.checkers.unicode._line_length(line, "utf-8")
     assert measured == 10


 @pytest.mark.parametrize(
     "line",
     [
         pytest.param("1234567890", id="no_line_ending"),
         pytest.param("1234567890\n", id="linux"),
         pytest.param("1234567890\r\n", id="windows"),
         # Eight digits plus a reversed ending (LF before CR).
         pytest.param("12345678\n\r", id="wrong_order"),
     ],
 )
 def test_line_length_utf16(line: str) -> None:
     """The UTF-16 encoded sample must be measured as ten characters long."""
     encoded = line.encode("utf-16")
     measured = pylint.checkers.unicode._line_length(encoded, "utf-16")
     assert measured == 10


 @pytest.mark.parametrize(
     "line",
     [
         pytest.param("1234567890", id="no_line_ending"),
         pytest.param("1234567890\n", id="linux"),
         pytest.param("1234567890\r\n", id="windows"),
         # Eight digits plus a reversed ending (LF before CR).
         pytest.param("12345678\n\r", id="wrong_order"),
     ],
 )
 def test_line_length_utf32(line: str) -> None:
     """The UTF-32 encoded sample must be measured as ten characters long."""
     encoded = line.encode("utf-32")
     measured = pylint.checkers.unicode._line_length(encoded, "utf-32")
     assert measured == 10


 @pytest.mark.parametrize(
     "codec, expected",
     [
         # UTF-8 spellings.
         ("utf-8sig", "utf-8"),
         ("utf8", "utf-8"),
         ("utf 8", "utf-8"),
         ("utf-8", "utf-8"),
         # NOTE(review): same case as the line above -- kept so the set of
         # collected test ids stays as it is.
         ("utf-8", "utf-8"),
         # UTF-16 / UTF-32, with and without an explicit byte order.
         ("utf-16", "utf-16"),
         ("utf-32", "utf-32"),
         ("utf 16", "utf-16"),
         ("utf 32", "utf-32"),
         ("utf 16 LE", "utf-16le"),
         ("utf 32-BE", "utf-32be"),
         # Upper-case spellings.
         ("UTF-32", "utf-32"),
         ("UTF-32-le", "utf-32le"),
         ("UTF-16 LE", "utf-16le"),
         ("UTF-16BE", "utf-16be"),
         ("UTF8", "utf-8"),
         # Non-UTF names: the expected result is the lower-cased input.
         ("Latin1", "latin1"),
         ("ASCII", "ascii"),
     ],
 )
 def test__normalize_codec_name(codec: str, expected: str) -> None:
     """Each spelling of a codec name must normalize to ``expected``."""
     normalized = pylint.checkers.unicode._normalize_codec_name(codec)
     assert normalized == expected


 @pytest.mark.parametrize(
     "codec, line_ending, final_new_line",
     [
         # One case per (codec, line ending, trailing newline) combination;
         # each option carries its value first and its id fragment second.
         pytest.param(
             codec,
             line_ending[0],
             final_nl[0],
             id=f"{codec}_{line_ending[1]}_{final_nl[1]}",
         )
         for codec, line_ending, final_nl in itertools.product(
             (
                 "utf-8",
                 "utf-16",
                 "utf-16le",
                 "utf-16be",
                 "utf-32",
                 "utf-32le",
                 "utf-32be",
             ),
             (("\n", "linux"), ("\r\n", "windows")),
             ((True, "final_nl"), (False, "no_final_nl")),
         )
     ],
 )
 def test___fix_utf16_32_line_stream(
     tmp_path: Path, codec: str, line_ending: str, final_new_line: bool
 ) -> None:
     """Streamed lines must reproduce the file, both decoded and as raw bytes."""
     content = [
         f"line1{line_ending}",
         f"# Line 2{line_ending}",
         f"łöł{line_ending}",
         f"last line{line_ending if final_new_line else ''}",
     ]
     encoded = "".join(content).encode(codec)

     target = tmp_path / "test.txt"
     target.write_bytes(encoded)

     # Drain the line stream while the file is still open.
     with target.open("rb") as stream:
         raw_lines = list(
             pylint.checkers.unicode._fix_utf16_32_line_stream(stream, codec)
         )

     # Decoded one by one, the yielded lines are the original text lines ...
     assert [raw.decode(codec) for raw in raw_lines] == content
     # ... and glued back together they are byte-identical to the file.
     assert b"".join(raw_lines) == encoded


 @pytest.mark.parametrize(
     "codec, expected",
     [
         # presumably the number of bytes per code unit -- confirm
         ("utf-32", 4),
         ("utf-32-le", 4),
         ("utf-16", 2),
         ("utf-8", 1),
         ("latin1", 1),
         ("ascii", 1),
     ],
 )
 def test__byte_to_str_length(codec: str, expected: int) -> None:
     """The value reported for ``codec`` must equal ``expected``."""
     reported = pylint.checkers.unicode._byte_to_str_length(codec)
     assert reported == expected
	# Licensed under the GPL: https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
	# For details: https://github.com/pylint-dev/pylint/blob/main/LICENSE
	# Copyright (c) https://github.com/pylint-dev/pylint/blob/main/CONTRIBUTORS.txt

	from __future__ import annotations

	import itertools
	from pathlib import Path

	import pytest

	import pylint.checkers.unicode

	# Lookup table for byte input: the UTF-8 encoding of each bad character is
	# mapped to the BAD_CHARS entry it was taken from.
	SEARCH_DICT_BYTE_UTF8 = {
	    bad_char.unescaped.encode("utf-8"): bad_char
	    for bad_char in pylint.checkers.unicode.BAD_CHARS
	}


	@pytest.mark.parametrize(
	    "line, expected, search_dict",
	    [
	        # Carriage-return special cases: the "\r" of a trailing "\r\n" is
	        # expected to pass, a "\r" anywhere else is expected to be reported.
	        pytest.param(
	            "valid windows\r\n",
	            {},
	            pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT,
	            id="valid-windows",
	        ),
	        pytest.param(
	            b"TOTO = ('Caf\xe9', 'Caf\xe9', 'Caf\xe9')\r\n",
	            {},
	            SEARCH_DICT_BYTE_UTF8,
	            id="valid-windows-bytes",
	        ),
	        pytest.param(
	            "invalid\r windows\r\n",
	            {7: pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT["\r"]},
	            pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT,
	            id="invalid-carrier-return-windows",
	        ),
	        pytest.param(
	            "invalid\r linux\n",
	            {7: pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT["\r"]},
	            pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT,
	            id="invalid-carrier-return-linux",
	        ),
	        pytest.param(
	            b"invalid\r windows\r\n",
	            {7: pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT["\r"]},
	            SEARCH_DICT_BYTE_UTF8,
	            id="invalid-carrier-return-windows-bytes",
	        ),
	        pytest.param(
	            b"invalid\r linux\n",
	            {7: pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT["\r"]},
	            SEARCH_DICT_BYTE_UTF8,
	            id="invalid-carrier-return-linux-bytes",
	        ),
	        # Generated cases for every other bad character with a Linux line
	        # ending; the character sits at index 7, right after "invalid" ...
	        *(
	            pytest.param(
	                f"invalid{char.unescaped} back\n",
	                {7: char},
	                pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT,
	                id=f"invalid-{char.name}-linux",
	            )
	            for char in pylint.checkers.unicode.BAD_CHARS
	            if char.unescaped != "\r"
	        ),
	        # ... and the same lines as ASCII bytes. BAD_CHARS[:-1] leaves out
	        # the last entry (presumably not ASCII-encodable -- confirm).
	        *(
	            pytest.param(
	                f"invalid{char.unescaped} back\n".encode("ASCII"),
	                {7: char},
	                SEARCH_DICT_BYTE_UTF8,
	                id=f"invalid-{char.name}-linux-bytes",
	            )
	            for char in pylint.checkers.unicode.BAD_CHARS[:-1]
	            if char.unescaped != "\r"
	        ),
	        # Generated cases for every other bad character with a Windows
	        # line ending ...
	        *(
	            pytest.param(
	                f"invalid{char.unescaped} back\r\n",
	                {7: char},
	                pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT,
	                id=f"invalid-{char.name}-windows",
	            )
	            for char in pylint.checkers.unicode.BAD_CHARS
	            if char.unescaped != "\r"
	        ),
	        # ... and the same lines as ASCII bytes (again without the last
	        # BAD_CHARS entry).
	        *(
	            pytest.param(
	                f"invalid{char.unescaped} back\r\n".encode("ASCII"),
	                {7: char},
	                SEARCH_DICT_BYTE_UTF8,
	                id=f"invalid-{char.name}-windows-bytes",
	            )
	            for char in pylint.checkers.unicode.BAD_CHARS[:-1]
	            if char.unescaped != "\r"
	        ),
	    ],
	)
	def test_map_positions_to_result(
	    line: pylint.checkers.unicode._StrLike,
	    expected: dict[int, pylint.checkers.unicode._BadChar],
	    search_dict: dict[
	        pylint.checkers.unicode._StrLike, pylint.checkers.unicode._BadChar
	    ],
	) -> None:
	    """Check the position-to-bad-character mapping for str and bytes lines."""
	    # The newline marker handed over must be of the same type as the line.
	    newline = b"\n" if isinstance(line, bytes) else "\n"
	    found = pylint.checkers.unicode._map_positions_to_result(
	        line, search_dict, new_line=newline
	    )
	    assert found == expected


	@pytest.mark.parametrize(
	    "line",
	    [
	        # Each sample comes as str and as bytes.
	        pytest.param("1234567890", id="no_line_ending"),
	        pytest.param(b"1234567890", id="no_line_ending_byte"),
	        pytest.param("1234567890\n", id="linux"),
	        pytest.param(b"1234567890\n", id="linux_byte"),
	        pytest.param("1234567890\r\n", id="windows"),
	        pytest.param(b"1234567890\r\n", id="windows_byte"),
	        # Only eight digits here: the reversed ending (LF before CR) is
	        # expected to make up the remaining two counted characters.
	        pytest.param("12345678\n\r", id="wrong_order"),
	        pytest.param(b"12345678\n\r", id="wrong_order_byte"),
	    ],
	)
	def test_line_length(line: pylint.checkers.unicode._StrLike) -> None:
	    """Every sample must be measured as ten characters long for UTF-8."""
	    measured = pylint.checkers.unicode._line_length(line, "utf-8")
	    assert measured == 10


	@pytest.mark.parametrize(
	    "line",
	    [
	        pytest.param("1234567890", id="no_line_ending"),
	        pytest.param("1234567890\n", id="linux"),
	        pytest.param("1234567890\r\n", id="windows"),
	        # Eight digits plus a reversed ending (LF before CR).
	        pytest.param("12345678\n\r", id="wrong_order"),
	    ],
	)
	def test_line_length_utf16(line: str) -> None:
	    """The UTF-16 encoded sample must be measured as ten characters long."""
	    encoded = line.encode("utf-16")
	    measured = pylint.checkers.unicode._line_length(encoded, "utf-16")
	    assert measured == 10


	@pytest.mark.parametrize(
	    "line",
	    [
	        pytest.param("1234567890", id="no_line_ending"),
	        pytest.param("1234567890\n", id="linux"),
	        pytest.param("1234567890\r\n", id="windows"),
	        # Eight digits plus a reversed ending (LF before CR).
	        pytest.param("12345678\n\r", id="wrong_order"),
	    ],
	)
	def test_line_length_utf32(line: str) -> None:
	    """The UTF-32 encoded sample must be measured as ten characters long."""
	    encoded = line.encode("utf-32")
	    measured = pylint.checkers.unicode._line_length(encoded, "utf-32")
	    assert measured == 10


	@pytest.mark.parametrize(
	    "codec, expected",
	    [
	        # UTF-8 spellings.
	        ("utf-8sig", "utf-8"),
	        ("utf8", "utf-8"),
	        ("utf 8", "utf-8"),
	        ("utf-8", "utf-8"),
	        # NOTE(review): same case as the line above -- kept so the set of
	        # collected test ids stays as it is.
	        ("utf-8", "utf-8"),
	        # UTF-16 / UTF-32, with and without an explicit byte order.
	        ("utf-16", "utf-16"),
	        ("utf-32", "utf-32"),
	        ("utf 16", "utf-16"),
	        ("utf 32", "utf-32"),
	        ("utf 16 LE", "utf-16le"),
	        ("utf 32-BE", "utf-32be"),
	        # Upper-case spellings.
	        ("UTF-32", "utf-32"),
	        ("UTF-32-le", "utf-32le"),
	        ("UTF-16 LE", "utf-16le"),
	        ("UTF-16BE", "utf-16be"),
	        ("UTF8", "utf-8"),
	        # Non-UTF names: the expected result is the lower-cased input.
	        ("Latin1", "latin1"),
	        ("ASCII", "ascii"),
	    ],
	)
	def test__normalize_codec_name(codec: str, expected: str) -> None:
	    """Each spelling of a codec name must normalize to ``expected``."""
	    normalized = pylint.checkers.unicode._normalize_codec_name(codec)
	    assert normalized == expected


	@pytest.mark.parametrize(
	    "codec, line_ending, final_new_line",
	    [
	        # One case per (codec, line ending, trailing newline) combination;
	        # each option carries its value first and its id fragment second.
	        pytest.param(
	            codec,
	            line_ending[0],
	            final_nl[0],
	            id=f"{codec}_{line_ending[1]}_{final_nl[1]}",
	        )
	        for codec, line_ending, final_nl in itertools.product(
	            (
	                "utf-8",
	                "utf-16",
	                "utf-16le",
	                "utf-16be",
	                "utf-32",
	                "utf-32le",
	                "utf-32be",
	            ),
	            (("\n", "linux"), ("\r\n", "windows")),
	            ((True, "final_nl"), (False, "no_final_nl")),
	        )
	    ],
	)
	def test___fix_utf16_32_line_stream(
	    tmp_path: Path, codec: str, line_ending: str, final_new_line: bool
	) -> None:
	    """Streamed lines must reproduce the file, both decoded and as raw bytes."""
	    content = [
	        f"line1{line_ending}",
	        f"# Line 2{line_ending}",
	        f"łöł{line_ending}",
	        f"last line{line_ending if final_new_line else ''}",
	    ]
	    encoded = "".join(content).encode(codec)

	    target = tmp_path / "test.txt"
	    target.write_bytes(encoded)

	    # Drain the line stream while the file is still open.
	    with target.open("rb") as stream:
	        raw_lines = list(
	            pylint.checkers.unicode._fix_utf16_32_line_stream(stream, codec)
	        )

	    # Decoded one by one, the yielded lines are the original text lines ...
	    assert [raw.decode(codec) for raw in raw_lines] == content
	    # ... and glued back together they are byte-identical to the file.
	    assert b"".join(raw_lines) == encoded


	@pytest.mark.parametrize(
	    "codec, expected",
	    [
	        # presumably the number of bytes per code unit -- confirm
	        ("utf-32", 4),
	        ("utf-32-le", 4),
	        ("utf-16", 2),
	        ("utf-8", 1),
	        ("latin1", 1),
	        ("ascii", 1),
	    ],
	)
	def test__byte_to_str_length(codec: str, expected: int) -> None:
	    """The value reported for ``codec`` must equal ``expected``."""
	    reported = pylint.checkers.unicode._byte_to_str_length(codec)
	    assert reported == expected