| # Licensed under the GPL: https://www.gnu.org/licenses/old-licenses/gpl-2.0.html |
| # For details: https://github.com/pylint-dev/pylint/blob/main/LICENSE |
| # Copyright (c) https://github.com/pylint-dev/pylint/blob/main/CONTRIBUTORS.txt |
| |
| from __future__ import annotations |
| |
| import itertools |
| from pathlib import Path |
| |
| import pytest |
| |
| import pylint.checkers.unicode |
| |
| SEARCH_DICT_BYTE_UTF8 = { |
| char.unescaped.encode("utf-8"): char for char in pylint.checkers.unicode.BAD_CHARS |
| } |
| |
| |
| @pytest.mark.parametrize( |
| "line, expected, search_dict", |
| [ |
| # Test special carrier return cases |
| pytest.param( |
| "valid windows\r\n", |
| {}, |
| pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT, |
| id="valid-windows", |
| ), |
| pytest.param( |
| b"TOTO = ('Caf\xe9', 'Caf\xe9', 'Caf\xe9')\r\n", |
| {}, |
| SEARCH_DICT_BYTE_UTF8, |
| id="valid-windows-bytes", |
| ), |
| pytest.param( |
| "invalid\r windows\r\n", |
| {7: pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT["\r"]}, |
| pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT, |
| id="invalid-carrier-return-windows", |
| ), |
| pytest.param( |
| "invalid\r linux\n", |
| {7: pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT["\r"]}, |
| pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT, |
| id="invalid-carrier-return-linux", |
| ), |
| pytest.param( |
| b"invalid\r windows\r\n", |
| {7: pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT["\r"]}, |
| SEARCH_DICT_BYTE_UTF8, |
| id="invalid-carrier-return-windows-bytes", |
| ), |
| pytest.param( |
| b"invalid\r linux\n", |
| {7: pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT["\r"]}, |
| SEARCH_DICT_BYTE_UTF8, |
| id="invalid-carrier-return-linux-bytes", |
| ), |
| # Auto test Linux all remaining Linux cases ... |
| *( |
| pytest.param( |
| f"invalid{char.unescaped} back\n", |
| {7: char}, |
| pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT, |
| id=f"invalid-{char.name}-linux", |
| ) |
| for char in pylint.checkers.unicode.BAD_CHARS |
| if char.unescaped != "\r" |
| ), |
| # ... also byte encoded |
| *( |
| pytest.param( |
| f"invalid{char.unescaped} back\n".encode("ASCII"), |
| {7: char}, |
| SEARCH_DICT_BYTE_UTF8, |
| id=f"invalid-{char.name}-linux-bytes", |
| ) |
| for char in pylint.checkers.unicode.BAD_CHARS[:-1] |
| if char.unescaped != "\r" |
| ), |
| # Test all remaining windows cases ... |
| *( |
| pytest.param( |
| f"invalid{char.unescaped} back\r\n", |
| {7: char}, |
| pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT, |
| id=f"invalid-{char.name}-windows", |
| ) |
| for char in pylint.checkers.unicode.BAD_CHARS |
| if char.unescaped != "\r" |
| ), |
| # ... also byte encoded |
| *( |
| pytest.param( |
| f"invalid{char.unescaped} back\r\n".encode("ASCII"), |
| {7: char}, |
| SEARCH_DICT_BYTE_UTF8, |
| id=f"invalid-{char.name}-windows-bytes", |
| ) |
| for char in pylint.checkers.unicode.BAD_CHARS[:-1] |
| if char.unescaped != "\r" |
| ), |
| ], |
| ) |
| def test_map_positions_to_result( |
| line: pylint.checkers.unicode._StrLike, |
| expected: dict[int, pylint.checkers.unicode._BadChar], |
| search_dict: dict[ |
| pylint.checkers.unicode._StrLike, pylint.checkers.unicode._BadChar |
| ], |
| ) -> None: |
| """Test all possible outcomes for map position function in UTF-8 and ASCII.""" |
| if isinstance(line, bytes): |
| newline = b"\n" |
| else: |
| newline = "\n" |
| assert ( |
| pylint.checkers.unicode._map_positions_to_result( |
| line, search_dict, new_line=newline |
| ) |
| == expected |
| ) |
| |
| |
| @pytest.mark.parametrize( |
| "line", |
| [ |
| pytest.param("1234567890", id="no_line_ending"), |
| pytest.param(b"1234567890", id="no_line_ending_byte"), |
| pytest.param("1234567890\n", id="linux"), |
| pytest.param(b"1234567890\n", id="linux_byte"), |
| pytest.param("1234567890\r\n", id="windows"), |
| pytest.param(b"1234567890\r\n", id="windows_byte"), |
| pytest.param("12345678\n\r", id="wrong_order"), |
| pytest.param(b"12345678\n\r", id="wrong_order_byte"), |
| ], |
| ) |
| def test_line_length(line: pylint.checkers.unicode._StrLike) -> None: |
| assert pylint.checkers.unicode._line_length(line, "utf-8") == 10 |
| |
| |
| @pytest.mark.parametrize( |
| "line", |
| [ |
| pytest.param("1234567890", id="no_line_ending"), |
| pytest.param("1234567890\n", id="linux"), |
| pytest.param("1234567890\r\n", id="windows"), |
| pytest.param("12345678\n\r", id="wrong_order"), |
| ], |
| ) |
| def test_line_length_utf16(line: str) -> None: |
| assert pylint.checkers.unicode._line_length(line.encode("utf-16"), "utf-16") == 10 |
| |
| |
| @pytest.mark.parametrize( |
| "line", |
| [ |
| pytest.param("1234567890", id="no_line_ending"), |
| pytest.param("1234567890\n", id="linux"), |
| pytest.param("1234567890\r\n", id="windows"), |
| pytest.param("12345678\n\r", id="wrong_order"), |
| ], |
| ) |
| def test_line_length_utf32(line: str) -> None: |
| assert pylint.checkers.unicode._line_length(line.encode("utf-32"), "utf-32") == 10 |
| |
| |
| @pytest.mark.parametrize( |
| "codec, expected", |
| [ |
| ("utf-8sig", "utf-8"), |
| ("utf8", "utf-8"), |
| ("utf 8", "utf-8"), |
| ("utf-8", "utf-8"), |
| ("utf-8", "utf-8"), |
| ("utf-16", "utf-16"), |
| ("utf-32", "utf-32"), |
| ("utf 16", "utf-16"), |
| ("utf 32", "utf-32"), |
| ("utf 16 LE", "utf-16le"), |
| ("utf 32-BE", "utf-32be"), |
| ("UTF-32", "utf-32"), |
| ("UTF-32-le", "utf-32le"), |
| ("UTF-16 LE", "utf-16le"), |
| ("UTF-16BE", "utf-16be"), |
| ("UTF8", "utf-8"), |
| ("Latin1", "latin1"), |
| ("ASCII", "ascii"), |
| ], |
| ) |
| def test__normalize_codec_name(codec: str, expected: str) -> None: |
| assert pylint.checkers.unicode._normalize_codec_name(codec) == expected |
| |
| |
| @pytest.mark.parametrize( |
| "codec, line_ending, final_new_line", |
| [ |
| pytest.param( |
| codec, |
| line_ending[0], |
| final_nl[0], |
| id=f"{codec}_{line_ending[1]}_{final_nl[1]}", |
| ) |
| for codec, line_ending, final_nl in itertools.product( |
| ( |
| "utf-8", |
| "utf-16", |
| "utf-16le", |
| "utf-16be", |
| "utf-32", |
| "utf-32le", |
| "utf-32be", |
| ), |
| (("\n", "linux"), ("\r\n", "windows")), |
| ((True, "final_nl"), (False, "no_final_nl")), |
| ) |
| ], |
| ) |
| def test___fix_utf16_32_line_stream( |
| tmp_path: Path, codec: str, line_ending: str, final_new_line: bool |
| ) -> None: |
| """Content of stream should be the same as should be the length.""" |
| |
| def decode_line(line: bytes, codec: str) -> str: |
| return line.decode(codec) |
| |
| file = tmp_path / "test.txt" |
| |
| content = [ |
| f"line1{line_ending}", |
| f"# Line 2{line_ending}", |
| f"łöł{line_ending}", |
| f"last line{line_ending if final_new_line else ''}", |
| ] |
| |
| text = "".join(content) |
| encoded = text.encode(codec) |
| |
| file.write_bytes(encoded) |
| |
| gathered = b"" |
| collected = [] |
| with file.open("rb") as f: |
| for line in pylint.checkers.unicode._fix_utf16_32_line_stream(f, codec): |
| gathered += line |
| collected.append(decode_line(line, codec)) |
| |
| # Test content equality |
| assert collected == content |
| # Test byte equality |
| assert gathered == encoded |
| |
| |
| @pytest.mark.parametrize( |
| "codec, expected", |
| [ |
| ("utf-32", 4), |
| ("utf-32-le", 4), |
| ("utf-16", 2), |
| ("utf-8", 1), |
| ("latin1", 1), |
| ("ascii", 1), |
| ], |
| ) |
| def test__byte_to_str_length(codec: str, expected: int) -> None: |
| assert pylint.checkers.unicode._byte_to_str_length(codec) == expected |