# blob: 09401fd64f81de3dac0258f202b2c000ae1fd40d [file] [log] [blame]
# Licensed under the GPL: https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
# For details: https://github.com/pylint-dev/pylint/blob/main/LICENSE
# Copyright (c) https://github.com/pylint-dev/pylint/blob/main/CONTRIBUTORS.txt
from __future__ import annotations
import itertools
from pathlib import Path
import pytest
import pylint.checkers.unicode
# Lookup table keyed by the UTF-8 encoded form of every bad character; it is
# the byte-string counterpart used by the bytes test cases in this module.
SEARCH_DICT_BYTE_UTF8 = {
    bad_char.unescaped.encode("utf-8"): bad_char
    for bad_char in pylint.checkers.unicode.BAD_CHARS
}
@pytest.mark.parametrize(
    "line, expected, search_dict",
    [
        # Carriage returns get dedicated cases: "\r" directly in front of "\n"
        # is expected to be accepted, anywhere else it must be reported.
        pytest.param(
            "valid windows\r\n",
            {},
            pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT,
            id="valid-windows",
        ),
        pytest.param(
            b"TOTO = ('Caf\xe9', 'Caf\xe9', 'Caf\xe9')\r\n",
            {},
            SEARCH_DICT_BYTE_UTF8,
            id="valid-windows-bytes",
        ),
        pytest.param(
            "invalid\r windows\r\n",
            {7: pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT["\r"]},
            pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT,
            id="invalid-carrier-return-windows",
        ),
        pytest.param(
            "invalid\r linux\n",
            {7: pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT["\r"]},
            pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT,
            id="invalid-carrier-return-linux",
        ),
        pytest.param(
            b"invalid\r windows\r\n",
            {7: pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT["\r"]},
            SEARCH_DICT_BYTE_UTF8,
            id="invalid-carrier-return-windows-bytes",
        ),
        pytest.param(
            b"invalid\r linux\n",
            {7: pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT["\r"]},
            SEARCH_DICT_BYTE_UTF8,
            id="invalid-carrier-return-linux-bytes",
        ),
        # Generated cases: every other bad character with a Linux line ending ...
        *(
            pytest.param(
                f"invalid{bad_char.unescaped} back\n",
                {7: bad_char},
                pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT,
                id=f"invalid-{bad_char.name}-linux",
            )
            for bad_char in pylint.checkers.unicode.BAD_CHARS
            if bad_char.unescaped != "\r"
        ),
        # ... and the same lines as ASCII-encoded bytes. The last BAD_CHARS
        # entry is skipped (presumably it cannot be ASCII encoded - TODO confirm).
        *(
            pytest.param(
                f"invalid{bad_char.unescaped} back\n".encode("ASCII"),
                {7: bad_char},
                SEARCH_DICT_BYTE_UTF8,
                id=f"invalid-{bad_char.name}-linux-bytes",
            )
            for bad_char in pylint.checkers.unicode.BAD_CHARS[:-1]
            if bad_char.unescaped != "\r"
        ),
        # Generated cases: every other bad character with a Windows line ending ...
        *(
            pytest.param(
                f"invalid{bad_char.unescaped} back\r\n",
                {7: bad_char},
                pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT,
                id=f"invalid-{bad_char.name}-windows",
            )
            for bad_char in pylint.checkers.unicode.BAD_CHARS
            if bad_char.unescaped != "\r"
        ),
        # ... and again as ASCII-encoded bytes (last BAD_CHARS entry skipped).
        *(
            pytest.param(
                f"invalid{bad_char.unescaped} back\r\n".encode("ASCII"),
                {7: bad_char},
                SEARCH_DICT_BYTE_UTF8,
                id=f"invalid-{bad_char.name}-windows-bytes",
            )
            for bad_char in pylint.checkers.unicode.BAD_CHARS[:-1]
            if bad_char.unescaped != "\r"
        ),
    ],
)
def test_map_positions_to_result(
    line: pylint.checkers.unicode._StrLike,
    expected: dict[int, pylint.checkers.unicode._BadChar],
    search_dict: dict[
        pylint.checkers.unicode._StrLike, pylint.checkers.unicode._BadChar
    ],
) -> None:
    """Check the bad-character position mapping for str and bytes lines."""
    # The newline marker handed over must have the same type as the line.
    newline = b"\n" if isinstance(line, bytes) else "\n"
    found = pylint.checkers.unicode._map_positions_to_result(
        line, search_dict, new_line=newline
    )
    assert found == expected
@pytest.mark.parametrize(
    "line",
    [
        pytest.param(sample, id=case_id)
        for case_id, sample in (
            ("no_line_ending", "1234567890"),
            ("no_line_ending_byte", b"1234567890"),
            ("linux", "1234567890\n"),
            ("linux_byte", b"1234567890\n"),
            ("windows", "1234567890\r\n"),
            ("windows_byte", b"1234567890\r\n"),
            ("wrong_order", "12345678\n\r"),
            ("wrong_order_byte", b"12345678\n\r"),
        )
    ],
)
def test_line_length(line: pylint.checkers.unicode._StrLike) -> None:
    """Every sample line, str or bytes, must be reported with a length of 10."""
    measured = pylint.checkers.unicode._line_length(line, "utf-8")
    assert measured == 10
@pytest.mark.parametrize(
    "line",
    [
        pytest.param(sample, id=case_id)
        for case_id, sample in {
            "no_line_ending": "1234567890",
            "linux": "1234567890\n",
            "windows": "1234567890\r\n",
            "wrong_order": "12345678\n\r",
        }.items()
    ],
)
def test_line_length_utf16(line: str) -> None:
    """Every sample line encoded as UTF-16 must be reported with a length of 10."""
    encoded_line = line.encode("utf-16")
    measured = pylint.checkers.unicode._line_length(encoded_line, "utf-16")
    assert measured == 10
@pytest.mark.parametrize(
    "line",
    [
        pytest.param("1234567890", id="no_line_ending"),
        pytest.param("1234567890\n", id="linux"),
        pytest.param("1234567890\r\n", id="windows"),
        pytest.param("12345678\n\r", id="wrong_order"),
    ],
)
def test_line_length_utf32(line: str) -> None:
    """Every sample line encoded as UTF-32 must be reported with a length of 10."""
    encoded_line = line.encode("utf-32")
    measured = pylint.checkers.unicode._line_length(encoded_line, "utf-32")
    assert measured == 10
@pytest.mark.parametrize(
    "codec, expected",
    [
        ("utf-8sig", "utf-8"),
        ("utf8", "utf-8"),
        ("utf 8", "utf-8"),
        # Already-normalized names must pass through unchanged.
        ("utf-8", "utf-8"),
        ("utf-16", "utf-16"),
        ("utf-32", "utf-32"),
        ("utf 16", "utf-16"),
        ("utf 32", "utf-32"),
        ("utf 16 LE", "utf-16le"),
        ("utf 32-BE", "utf-32be"),
        ("UTF-32", "utf-32"),
        ("UTF-32-le", "utf-32le"),
        ("UTF-16 LE", "utf-16le"),
        ("UTF-16BE", "utf-16be"),
        ("UTF8", "utf-8"),
        ("Latin1", "latin1"),
        ("ASCII", "ascii"),
    ],
)
def test__normalize_codec_name(codec: str, expected: str) -> None:
    """Check that each codec spelling is normalized to the expected name.

    The cases cover different separators (none, space, dash), upper and lower
    case spellings, endianness suffixes and non-UTF codecs.
    """
    assert pylint.checkers.unicode._normalize_codec_name(codec) == expected
@pytest.mark.parametrize(
    "codec, line_ending, final_new_line",
    [
        pytest.param(
            codec_name,
            ending,
            has_final_nl,
            id=f"{codec_name}_{ending_label}_{final_nl_label}",
        )
        for codec_name, (ending, ending_label), (has_final_nl, final_nl_label) in (
            itertools.product(
                (
                    "utf-8",
                    "utf-16",
                    "utf-16le",
                    "utf-16be",
                    "utf-32",
                    "utf-32le",
                    "utf-32be",
                ),
                (("\n", "linux"), ("\r\n", "windows")),
                ((True, "final_nl"), (False, "no_final_nl")),
            )
        )
    ],
)
def test___fix_utf16_32_line_stream(
    tmp_path: Path, codec: str, line_ending: str, final_new_line: bool
) -> None:
    """The streamed lines must match the written text, line by line and byte by byte."""
    # Only the last line may come without a line ending.
    last_line_ending = line_ending if final_new_line else ""
    content = [
        f"line1{line_ending}",
        f"# Line 2{line_ending}",
        f"łöł{line_ending}",
        f"last line{last_line_ending}",
    ]
    encoded = "".join(content).encode(codec)

    file = tmp_path / "test.txt"
    file.write_bytes(encoded)

    # Consume the whole stream while the file is still open.
    with file.open("rb") as stream:
        raw_lines = list(
            pylint.checkers.unicode._fix_utf16_32_line_stream(stream, codec)
        )

    # Test content equality
    assert [raw_line.decode(codec) for raw_line in raw_lines] == content
    # Test byte equality
    assert b"".join(raw_lines) == encoded
@pytest.mark.parametrize(
    "codec, expected",
    [
        ("utf-32", 4),
        ("utf-32-le", 4),
        ("utf-16", 2),
        ("utf-8", 1),
        ("latin1", 1),
        ("ascii", 1),
    ],
)
def test__byte_to_str_length(codec: str, expected: int) -> None:
    """Check the value ``_byte_to_str_length`` returns for each listed codec."""
    actual = pylint.checkers.unicode._byte_to_str_length(codec)
    assert actual == expected