blob: bffd073642d83d2b3c6d932f6a02015725e69040 [file] [log] [blame]
#!/usr/bin/env python3
from argparse import ArgumentParser
from collections import defaultdict
from functools import reduce
from pathlib import Path
from re import match
from more_itertools import chunked
# Two-bit category codes stored in the generated lookup tables.
_NOTHING = 0
_WHITE_SPACE = 1
_IDENTIFIER_START = 2
_IDENTIFIER_PART = 3

# Lower-cased two-letter category names that map to a non-zero code; every
# other category keeps the default code _NOTHING.
_CATEGORY_CODES = {
    'zs': _WHITE_SPACE,
    'lc': _IDENTIFIER_START,
    'll': _IDENTIFIER_START,
    'lm': _IDENTIFIER_START,
    'lo': _IDENTIFIER_START,
    'lt': _IDENTIFIER_START,
    'lu': _IDENTIFIER_START,
    'nl': _IDENTIFIER_START,
    'mc': _IDENTIFIER_PART,
    'mn': _IDENTIFIER_PART,
    'pc': _IDENTIFIER_PART,
    'nd': _IDENTIFIER_PART,
}

# Matches "XXXX ; Cc" and "XXXX..YYYY ; Cc" (upper-case hex code points
# followed by a two-letter category); anything else is skipped.
_RANGE_LINE = r'^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+([A-Z][a-z])'

_PLANE_SIZE = 0x1_0000  # code points per plane
_PLANE_COUNT = 17       # entries in the emitted ``planes`` pointer table
_CODES_PER_BYTE = 4     # four 2-bit codes are packed into one byte
_BYTES_PER_ROW = 16     # packed bytes printed per line of output


def _build_planes(input_file):
    """Parse category lines and return ``{plane index: list of codes}``.

    Each plane is a list of ``_PLANE_SIZE`` category codes and is created the
    first time one of its code points is assigned, so the mapping iterates in
    first-use order.
    """
    planes = defaultdict(lambda: [_NOTHING] * _PLANE_SIZE)

    def assign(codepoints, code):
        for codepoint in codepoints:
            plane_idx, offset = divmod(codepoint, _PLANE_SIZE)
            planes[plane_idx][offset] = code

    for input_line in input_file:
        m = match(_RANGE_LINE, input_line)
        if not m:
            continue

        first, last, category = m.groups()
        code = _CATEGORY_CODES.get(category.lower())
        if code is None:
            continue

        first = int(first, 16)
        # A line without "..YYYY" describes a single code point.
        last = first if last is None else int(last, 16)
        assign(range(first, last + 1), code)

    # The fixed assignments below run after the data file has been read, so
    # they override whatever category the input gave these code points.

    # per: https://spec.json5.org/#white-space
    assign((0x9, 0xa, 0xb, 0xc, 0xd, 0x20, 0xa0, 0x2028, 0x2029, 0xfeff),
           _WHITE_SPACE)

    # per: https://www.ecma-international.org/ecma-262/5.1/#sec-7.6
    assign((ord('$'), ord('_'), ord('\\')), _IDENTIFIER_START)

    # per: https://www.ecma-international.org/ecma-262/5.1/#sec-7.6
    assign((0x200C, 0x200D), _IDENTIFIER_PART)

    return planes


def _pack_codes(codes):
    """Pack 2-bit codes into one integer, first code in the lowest bits."""
    return reduce(lambda acc, code: (acc << 2) | code, reversed(codes), 0)


def _emit_header(planes, output_file):
    """Write the C++ header containing the packed tables to *output_file*."""
    def emit(text='', end='\n'):
        print(text, end=end, file=output_file)

    emit('#ifndef JSON5EncoderCpp_unicode_cat_of')
    emit('#define JSON5EncoderCpp_unicode_cat_of')
    emit()
    emit('// GENERATED FILE')
    emit('// All changes will be lost.')
    emit()
    emit('#include <cstdint>')
    emit()
    emit('namespace JSON5EncoderCpp {')
    emit('inline namespace {')
    emit()
    emit('static unsigned unicode_cat_of(std::uint32_t codepoint) {')
    # plane_X is the shared all-zero table used for planes without data.
    emit(' static std::uint8_t plane_X[0x10000 / 4] = {0};')
    emit()

    # Plain slicing is used for chunking, so this needs nothing beyond the
    # standard library.
    codes_per_row = _CODES_PER_BYTE * _BYTES_PER_ROW
    for plane_idx, plane_data in planes.items():
        emit(' static std::uint8_t plane_' + str(plane_idx) + '[0x10000 / 4] = {')
        for row_start in range(0, len(plane_data), codes_per_row):
            row = plane_data[row_start:row_start + codes_per_row]
            emit(' ', end='')
            for byte_start in range(0, len(row), _CODES_PER_BYTE):
                value = _pack_codes(row[byte_start:byte_start + _CODES_PER_BYTE])
                emit('0x{:02x}u'.format(value), end=', ')
            emit()
        emit(' };')
        emit()

    emit(' static std::uint8_t *planes[17] = {', end='')
    for plane_idx in range(_PLANE_COUNT):
        # Eight table entries per output line.
        if plane_idx % 8 == 0:
            emit('\n ', end='')
        if plane_idx in planes:
            emit('plane_' + str(plane_idx) + ', ', end='')
        else:
            emit('plane_X, ', end='')
    emit()
    emit(' };')
    emit()
    emit(' std::uint16_t plane_idx = std::uint16_t(codepoint / 0x10000);')
    emit(' if (__builtin_expect(plane_idx > 16, false)) return 1;')
    emit(' std::uint16_t datum_idx = std::uint16_t(codepoint & 0xffff);')
    emit(' const std::uint8_t *plane = planes[plane_idx];')
    emit(' return (plane[datum_idx / 4] >> (2 * (datum_idx % 4))) % 4;')
    emit('}')
    emit()
    emit('}')
    emit('}')
    emit()
    emit('#endif')


def main(input_file, output_file):
    """Generate the C++ ``unicode_cat_of`` lookup header.

    Args:
        input_file: Iterable of text lines of the form ``XXXX ; Cc`` or
            ``XXXX..YYYY ; Cc`` (hex code point or range, two-letter
            category). Lines that do not match are ignored. Presumably this
            is Unicode's DerivedGeneralCategory.txt -- confirm against the
            build script that invokes this generator.
        output_file: Writable text stream that receives the header.

    Returns:
        None, so ``raise SystemExit(main(...))`` exits with status 0.
    """
    _emit_header(_build_planes(input_file), output_file)
# Command-line interface. Both positional arguments are optional and default
# to the /dev/stdin and /dev/stdout paths, so the script can run in a pipe.
argparser = ArgumentParser(description='Generate Unicode Category Matcher(s)')
argparser.add_argument('input', nargs='?', type=Path, default=Path('/dev/stdin'))
argparser.add_argument('output', nargs='?', type=Path, default=Path('/dev/stdout'))

if __name__ == '__main__':
    args = argparser.parse_args()
    # The encoding is given explicitly so decoding the data file and writing
    # the header do not depend on the locale of the machine running the
    # generator.
    # NOTE(review): assumes the input data file is UTF-8 (or plain ASCII) --
    # confirm.
    with open(args.input, 'rt', encoding='utf-8') as input_file, \
            open(args.output, 'wt', encoding='utf-8') as output_file:
        raise SystemExit(main(input_file, output_file))