blob: 23feb63fb36026c6a1f596189af521b6611b6609 [file] [log] [blame]
#!/usr/bin/env python3
from argparse import ArgumentParser
from collections import defaultdict
from functools import reduce
from pathlib import Path
from re import match
from more_itertools import chunked
# 2-bit category codes stored in the generated lookup tables.
_NOTHING = 0
_WHITE_SPACE = 1
_IDENTIFIER_START = 2
_IDENTIFIER_PART = 3

# Number of code points in one Unicode plane.
_PLANE_SIZE = 0x10000

# Lower-cased Unicode general category -> 2-bit code.
_CATEGORY_CODES = {
    "zs": _WHITE_SPACE,
    "lc": _IDENTIFIER_START,
    "ll": _IDENTIFIER_START,
    "lm": _IDENTIFIER_START,
    "lo": _IDENTIFIER_START,
    "lt": _IDENTIFIER_START,
    "lu": _IDENTIFIER_START,
    "nl": _IDENTIFIER_START,
    "mc": _IDENTIFIER_PART,
    "mn": _IDENTIFIER_PART,
    "pc": _IDENTIFIER_PART,
    "nd": _IDENTIFIER_PART,
}

# Matches "XXXX ; Cc" and "XXXX..YYYY ; Cc" lines (upper-case hex code points
# followed by a two-letter general category).
_RANGE_LINE = r"^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+([A-Z][a-z])"


def _classify_codepoints(input_file):
    """Return a mapping of plane index -> list of 0x10000 category codes.

    Lines of *input_file* that do not match ``_RANGE_LINE`` and categories
    that are absent from ``_CATEGORY_CODES`` are ignored.  Planes appear in
    the mapping in the order in which they are first touched.
    """
    planes = defaultdict(lambda: [_NOTHING] * _PLANE_SIZE)

    for input_line in input_file:
        m = match(_RANGE_LINE, input_line)
        if not m:
            continue

        first, last, cat = m.groups()
        code = _CATEGORY_CODES.get(cat.lower())
        if not code:
            continue

        first = int(first, 16)
        # A single code point has no "..YYYY" part; treat it as a 1-long range.
        last = int(last, 16) if last else first
        for codepoint in range(first, last + 1):
            plane_idx, offset = divmod(codepoint, _PLANE_SIZE)
            planes[plane_idx][offset] = code

    # The fixed overrides below are applied after the category data, so they
    # win over whatever the input file said about these code points.

    # per: https://spec.json5.org/#white-space
    for codepoint in (0x9, 0xA, 0xB, 0xC, 0xD, 0x20, 0xA0, 0x2028, 0x2029, 0xFEFF):
        planes[0][codepoint] = _WHITE_SPACE

    # per: https://www.ecma-international.org/ecma-262/5.1/#sec-7.6
    for codepoint in (ord("$"), ord("_"), ord("\\")):
        planes[0][codepoint] = _IDENTIFIER_START

    # per: https://www.ecma-international.org/ecma-262/5.1/#sec-7.6
    for codepoint in (0x200C, 0x200D):
        planes[0][codepoint] = _IDENTIFIER_PART

    return planes


def _format_plane_rows(plane_data):
    """Return the C++ initializer rows for one plane.

    Four consecutive 2-bit codes are packed into one byte, the first code in
    the lowest bits; each row holds 16 such bytes (64 code points).
    """
    codes_per_byte = 4
    codes_per_row = codes_per_byte * 16
    total = len(plane_data)

    rows = []
    for row_start in range(0, total, codes_per_row):
        row_end = min(row_start + codes_per_row, total)
        cells = []
        for cell_start in range(row_start, row_end, codes_per_byte):
            packed = 0
            # Fold from the last code to the first so that the first code ends
            # up in bits 0-1, matching the shift used by the generated lookup.
            for code in reversed(plane_data[cell_start:cell_start + codes_per_byte]):
                packed = (packed << 2) | code
            cells.append("0x{:02x}u, ".format(packed))
        rows.append(" " + "".join(cells))
    return rows


def main(input_file, output_file):
    """Generate the C++ ``unicode_cat_of`` lookup header.

    Reads Unicode general-category range lines from *input_file* (an iterable
    of text lines), classifies every code point as nothing / white space /
    identifier start / identifier part, and writes a C++ header with one
    packed table per populated plane to *output_file* (a writable text
    stream).

    Returns ``None``.
    """
    planes = _classify_codepoints(input_file)

    lines = [
        "#ifndef JSON5EncoderCpp_unicode_cat_of",
        "#define JSON5EncoderCpp_unicode_cat_of",
        "",
        "// GENERATED FILE",
        "// All changes will be lost.",
        "",
        "#include <cstdint>",
        "",
        "namespace JSON5EncoderCpp {",
        "inline namespace {",
        "",
        "static unsigned unicode_cat_of(std::uint32_t codepoint) {",
        # Shared all-zero table used for every plane without any data.
        " static std::uint8_t plane_X[0x10000 / 4] = {0};",
        "",
    ]

    for plane_idx, plane_data in planes.items():
        lines.append(
            " static std::uint8_t plane_" + str(plane_idx) + "[0x10000 / 4] = {"
        )
        lines.extend(_format_plane_rows(plane_data))
        lines.append(" };")
        lines.append("")

    # Pointer table over all 17 planes, eight entries per output line.
    table = [" static std::uint8_t *planes[17] = {"]
    for plane_idx in range(17):
        if plane_idx % 8 == 0:
            table.append("\n ")
        if plane_idx in planes:
            table.append("plane_" + str(plane_idx) + ", ")
        else:
            table.append("plane_X, ")
    lines.append("".join(table))
    lines.append(" };")
    lines.append("")

    lines.extend(
        [
            " std::uint16_t plane_idx = std::uint16_t(codepoint / 0x10000);",
            " if (JSON5EncoderCpp_expect(plane_idx > 16, false)) return 1;",
            " std::uint16_t datum_idx = std::uint16_t(codepoint & 0xffff);",
            " const std::uint8_t *plane = planes[plane_idx];",
            " return (plane[datum_idx / 4] >> (2 * (datum_idx % 4))) % 4;",
            "}",
            "",
            "}",
            "}",
            "",
            "#endif",
        ]
    )

    print("\n".join(lines), file=output_file)
# Command-line interface: both positional arguments are optional and default
# to the /dev aliases of the process's standard input and output.
argparser = ArgumentParser(description="Generate Unicode Category Matcher(s)")
argparser.add_argument("input", nargs="?", type=Path, default=Path("/dev/stdin"))
argparser.add_argument("output", nargs="?", type=Path, default=Path("/dev/stdout"))


if __name__ == "__main__":
    args = argparser.parse_args()
    # Open the paths exactly as given.  Resolving them first follows the
    # /dev/stdin and /dev/stdout symlinks; when the stream is a pipe (on
    # Linux) the link target is a pseudo-name such as "pipe:[1234]", so the
    # resolved path does not exist and cannot be opened.
    with open(str(args.input), "rt") as input_file, open(
        str(args.output), "wt"
    ) as output_file:
        # main() returns None, so this exits with status 0 on success.
        raise SystemExit(main(input_file, output_file))