| #!/usr/bin/env python |
| |
| # This script uses the following Unicode tables: |
| # - UnicodeData.txt |
| |
| |
| from collections import namedtuple |
| import csv |
| import os |
| import subprocess |
| |
# Exclusive upper bound used by get_codepoints() when it pads the tail of the
# table: every value in range(0, NUM_CODEPOINTS) is emitted as a Codepoint.
NUM_CODEPOINTS=0x110000
| |
def to_ranges(iter):
    """Group consecutive integers into half-open ``(start, end)`` tuples.

    A value extends the range being built only when it equals that range's
    current ``end``. The values 0x10000 and 0x20000 always open a new range,
    so no yielded range starts below one of those values and runs past it.
    """
    forced_starts = (0x10000, 0x20000)
    start = end = None
    for value in iter:
        extends = (
            start is not None
            and value == end
            and value not in forced_starts
        )
        if extends:
            end += 1
            continue
        # Flush the finished range (if any) before opening a new one.
        if start is not None:
            yield (start, end)
        start, end = value, value + 1
    if start is not None:
        yield (start, end)
| |
def get_escaped(codepoints):
    """Yield the ``value`` of every codepoint that must be escaped.

    A codepoint is escaped when its class (a missing class counts as "Cn")
    is one of Cc, Cf, Cs, Co, Cn, Zl, Zp or Zs. The ASCII space character is
    the one exception and is never yielded.
    """
    escaped_classes = "Cc Cf Cs Co Cn Zl Zp Zs".split()
    space = ord(' ')
    for cp in codepoints:
        category = cp.class_ or "Cn"
        if category not in escaped_classes:
            continue
        if cp.value == space:
            continue
        yield cp.value
| |
def get_file(f):
    """Open the local copy of the file named by URL ``f``, fetching it if needed.

    The local file name is the basename of ``f`` in the current directory.
    When that file does not exist, ``curl -O`` is run to download it (a
    failing curl raises ``subprocess.CalledProcessError``) and the open is
    retried. The caller owns the returned file object.
    """
    local_name = os.path.basename(f)
    try:
        handle = open(local_name)
    except FileNotFoundError:
        # Not downloaded yet: fetch it into the working directory.
        subprocess.run(["curl", "-O", f], check=True)
        handle = open(local_name)
    return handle
| |
# One codepoint: its integer value and its class string taken from the third
# field of its row, or None when the row table does not cover it.
Codepoint = namedtuple('Codepoint', 'value class_')

def get_codepoints(f):
    """Yield a Codepoint for every value in ``range(0, NUM_CODEPOINTS)``.

    ``f`` is an iterable of ``;``-separated lines whose first three fields
    are the hexadecimal codepoint, its name and its class.

    Two consecutive rows whose names end in ``First>`` and ``Last>`` describe
    a range: every value strictly between them gets the class of the
    ``First>`` row. Any other gap between rows, including one before the
    first row or after the last row, is filled with class ``None``.

    Raises:
        ValueError: a ``First>`` row is not immediately followed by a
            ``Last>`` row.
    """
    r = csv.reader(f, delimiter=";")
    # Start one below zero so a table that does not begin at row 0000 (or is
    # empty) still gets U+0000 emitted as part of the leading gap.
    prev_codepoint = -1
    class_first = None
    for row in r:
        codepoint = int(row[0], 16)
        name = row[1]
        class_ = row[2]

        if class_first is not None and not name.endswith("Last>"):
            raise ValueError("Missing Last after First")

        # Fill the gap since the previous row. class_first is the range's
        # class inside a First/Last pair and None everywhere else.
        for c in range(prev_codepoint + 1, codepoint):
            yield Codepoint(c, class_first)

        class_first = class_ if name.endswith("First>") else None

        yield Codepoint(codepoint, class_)
        prev_codepoint = codepoint

    if class_first is not None:
        raise ValueError("Missing Last after First")

    for c in range(prev_codepoint + 1, NUM_CODEPOINTS):
        yield Codepoint(c, None)
| |
def compress_singletons(singletons):
    """Split values into a run-length list of high bytes and a flat low-byte list.

    Returns ``(uppers, lowers)``. ``lowers`` holds ``value & 0xff`` for every
    input value, in order. ``uppers`` holds one ``(value >> 8, count)`` pair
    per run of adjacent inputs sharing the same high part, where ``count`` is
    how many entries of ``lowers`` belong to that run.
    """
    uppers = []  # (high part, number of entries in lowers)
    lowers = []

    for value in singletons:
        high, low = divmod(value, 0x100)
        if uppers and uppers[-1][0] == high:
            # Same high part as the previous value: grow the current run.
            uppers[-1] = (high, uppers[-1][1] + 1)
        else:
            uppers.append((high, 1))
        lowers.append(low)

    return uppers, lowers
| |
def compress_normal(normal):
    """Encode ``(start, count)`` ranges as alternating run lengths.

    For each range two lengths are emitted: ``truelen``, the gap between the
    end of the previous range (initially 0) and ``start``, followed by
    ``falselen``, which is ``count``. Each length is encoded as:

    - lengths 0x00..0x7f are encoded as 00, 01, ..., 7e, 7f
    - lengths 0x80..0x7fff are encoded as 80 80, 80 81, ..., ff fe, ff ff

    Returns a list with one entry per range; each entry is the list of
    encoded byte values ``[truelen, (truelenaux), falselen, (falselenaux)]``.

    Raises:
        ValueError: a length is negative (ranges unsorted or overlapping) or
            does not fit in 15 bits.
    """
    compressed = []

    prev_end = 0
    for start, count in normal:
        truelen = start - prev_end
        falselen = count
        prev_end = start + count

        entry = []
        for length in (truelen, falselen):
            # Checked explicitly rather than with assert, so the check still
            # runs under "python -O" and bad lengths cannot be emitted.
            if not 0 <= length < 0x8000:
                raise ValueError(
                    "run length {:#x} for range starting at {:#x} is outside "
                    "0..0x7fff".format(length, start)
                )
            if length > 0x7f:
                # Two-byte form: the high bit of the first byte marks it.
                entry.append(0x80 | (length >> 8))
                entry.append(length & 0xff)
            else:
                entry.append(length)

        compressed.append(entry)

    return compressed
| |
def print_singletons(uppers, lowers, uppersname, lowersname):
    """Print two Rust constant slices describing the singleton tables.

    ``uppers`` is printed as a ``&[(u8, u8)]`` named ``uppersname``, one pair
    per line. ``lowers`` is printed as a ``&[u8]`` named ``lowersname``,
    eight values per line. Values are formatted as ``0x``-prefixed hex.
    """
    per_line = 8

    print("const {}: &[(u8, u8)] = &[".format(uppersname))
    for high, count in uppers:
        print(" ({:#04x}, {}),".format(high, count))
    print("];")

    print("const {}: &[u8] = &[".format(lowersname))
    offset = 0
    while offset < len(lowers):
        cells = ["{:#04x},".format(low) for low in lowers[offset:offset + per_line]]
        print(" {}".format(" ".join(cells)))
        offset += per_line
    print("];")
| |
def print_normal(normal, normalname):
    """Print ``normal`` as a Rust ``&[u8]`` constant named ``normalname``.

    Each entry of ``normal`` is a sequence of integers and is printed on its
    own line as comma-terminated, ``0x``-prefixed hex values.
    """
    print("const {}: &[u8] = &[".format(normalname))
    for entry in normal:
        cells = ["{:#04x},".format(byte) for byte in entry]
        row = " ".join(cells)
        print(" {}".format(row))
    print("];")
| |
def main():
    """Generate the Rust ``is_printable`` source and print it to stdout.

    Reads UnicodeData.txt (downloading it when it is not in the current
    directory), collects the ranges of escaped codepoints and splits them
    into three groups:

    - ranges starting at or above 0x20000 become explicit range checks in
      the generated ``is_printable``;
    - ranges of one or two codepoints below 0x20000 become singleton table
      entries (set 0 below 0x10000, set 1 from 0x10000);
    - longer ranges below 0x20000 become run-length "normal" table entries.
    """
    CUTOFF = 0x10000
    singletons0 = []
    singletons1 = []
    normal0 = []
    normal1 = []
    extra = []

    # The codepoint generator reads the file lazily, so the whole loop has to
    # run while the file is still open; the with-block then closes it.
    with get_file("http://www.unicode.org/Public/UNIDATA/UnicodeData.txt") as file:
        codepoints = get_codepoints(file)

        for a, b in to_ranges(get_escaped(codepoints)):
            if a >= 2 * CUTOFF:
                # Includes a range starting exactly at 2 * CUTOFF: its upper
                # byte would not fit the u8 singleton tables.
                extra.append((a, b - a))
            elif b - a <= 2:
                # One or two codepoints: stored individually with the
                # CUTOFF bit stripped.
                target = singletons1 if a & CUTOFF else singletons0
                target.extend(c & ~CUTOFF for c in range(a, b))
            elif a & CUTOFF:
                normal1.append((a & ~CUTOFF, b - a))
            else:
                normal0.append((a, b - a))

    singletons0u, singletons0l = compress_singletons(singletons0)
    singletons1u, singletons1l = compress_singletons(singletons1)
    normal0 = compress_normal(normal0)
    normal1 = compress_normal(normal1)

    print("""\
// NOTE: The following code was generated by "src/libcore/unicode/printable.py",
// do not edit directly!

fn check(x: u16, singletonuppers: &[(u8, u8)], singletonlowers: &[u8],
normal: &[u8]) -> bool {
let xupper = (x >> 8) as u8;
let mut lowerstart = 0;
for &(upper, lowercount) in singletonuppers {
let lowerend = lowerstart + lowercount as usize;
if xupper == upper {
for &lower in &singletonlowers[lowerstart..lowerend] {
if lower == x as u8 {
return false;
}
}
} else if xupper < upper {
break;
}
lowerstart = lowerend;
}

let mut x = x as i32;
let mut normal = normal.iter().cloned();
let mut current = true;
while let Some(v) = normal.next() {
let len = if v & 0x80 != 0 {
((v & 0x7f) as i32) << 8 | normal.next().unwrap() as i32
} else {
v as i32
};
x -= len;
if x < 0 {
break;
}
current = !current;
}
current
}

pub(crate) fn is_printable(x: char) -> bool {
let x = x as u32;
let lower = x as u16;
if x < 0x10000 {
check(lower, SINGLETONS0U, SINGLETONS0L, NORMAL0)
} else if x < 0x20000 {
check(lower, SINGLETONS1U, SINGLETONS1L, NORMAL1)
} else {\
""")
    # Each extra entry is (start, length); emit it as a half-open range test.
    for start, length in extra:
        print(" if 0x{:x} <= x && x < 0x{:x} {{".format(start, start + length))
        print(" return false;")
        print(" }")
    print("""\
true
}
}\
""")
    print()
    print_singletons(singletons0u, singletons0l, 'SINGLETONS0U', 'SINGLETONS0L')
    print_singletons(singletons1u, singletons1l, 'SINGLETONS1U', 'SINGLETONS1L')
    print_normal(normal0, 'NORMAL0')
    print_normal(normal1, 'NORMAL1')
| |
# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
    main()