| #!/usr/bin/env python |
| |
| """ |
| Regenerate Unicode tables (tables.rs). |
| """ |
| |
# This script uses the Unicode data files listed
# in the UnicodeFiles class.
| |
# Since the tables should not require frequent updates, we generate them
# out-of-band and check the resulting tables.rs file into git.
| |
# Note that the "curl" program is required to fetch the data files.
| # This script is compatible with Python 2.7 and 3.x. |
| |
import argparse
import fileinput
import itertools
import os
import re
import subprocess
import textwrap
| |
| from collections import defaultdict, namedtuple |
| |
| try: |
| # Python 3 |
| from itertools import zip_longest |
| from io import StringIO |
| except ImportError: |
| # Python 2 compatibility |
| zip_longest = itertools.izip_longest |
| from StringIO import StringIO |
| |
| try: |
| # Completely optional type hinting |
| # (Python 2 compatible using comments, |
| # see: https://mypy.readthedocs.io/en/latest/python2.html) |
    # This is very helpful in typing-aware IDEs like PyCharm.
| from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Set, Tuple |
| except ImportError: |
| pass |
| |
| |
| # We don't use enum.Enum because of Python 2.7 compatibility. |
| class UnicodeFiles(object): |
    # ReadMe does not contain any Unicode data; we only
    # use it to extract the Unicode version.
| README = "ReadMe.txt" |
| |
| DERIVED_CORE_PROPERTIES = "DerivedCoreProperties.txt" |
| DERIVED_NORMALIZATION_PROPS = "DerivedNormalizationProps.txt" |
| PROPS = "PropList.txt" |
| SCRIPTS = "Scripts.txt" |
| SPECIAL_CASING = "SpecialCasing.txt" |
| UNICODE_DATA = "UnicodeData.txt" |
| |
| |
# The order doesn't really matter (Python < 3.6 won't preserve it);
# we only want to aggregate all the file names.
| ALL_UNICODE_FILES = tuple( |
| value for name, value in UnicodeFiles.__dict__.items() |
| if not name.startswith("_") |
| ) |
| |
assert len(ALL_UNICODE_FILES) == 7, "Unexpected number of Unicode files"
| |
| # The directory this file is located in. |
| THIS_DIR = os.path.dirname(os.path.realpath(__file__)) |
| |
# Where to download the Unicode data. The downloaded files
# will be placed in sub-directories named after the Unicode version.
| FETCH_DIR = os.path.join(THIS_DIR, "downloaded") |
| |
| FETCH_URL_LATEST = "ftp://ftp.unicode.org/Public/UNIDATA/{filename}" |
| FETCH_URL_VERSION = "ftp://ftp.unicode.org/Public/{version}/ucd/{filename}" |
| |
| PREAMBLE = """\ |
| // NOTE: The following code was generated by "./unicode.py", do not edit directly |
| |
| #![allow(missing_docs, non_upper_case_globals, non_snake_case, clippy::unreadable_literal)] |
| |
use crate::unicode::bool_trie::{BoolTrie, SmallBoolTrie};
use crate::unicode::version::UnicodeVersion;
"""
| |
| # Mapping taken from Table 12 from: |
| # http://www.unicode.org/reports/tr44/#General_Category_Values |
| EXPANDED_CATEGORIES = { |
| "Lu": ["LC", "L"], "Ll": ["LC", "L"], "Lt": ["LC", "L"], |
| "Lm": ["L"], "Lo": ["L"], |
| "Mn": ["M"], "Mc": ["M"], "Me": ["M"], |
| "Nd": ["N"], "Nl": ["N"], "No": ["N"], |
| "Pc": ["P"], "Pd": ["P"], "Ps": ["P"], "Pe": ["P"], |
| "Pi": ["P"], "Pf": ["P"], "Po": ["P"], |
| "Sm": ["S"], "Sc": ["S"], "Sk": ["S"], "So": ["S"], |
| "Zs": ["Z"], "Zl": ["Z"], "Zp": ["Z"], |
| "Cc": ["C"], "Cf": ["C"], "Cs": ["C"], "Co": ["C"], "Cn": ["C"], |
| } |
| |
| # This is the (inclusive) range of surrogate codepoints. |
| # These are not valid Rust characters. |
| SURROGATE_CODEPOINTS_RANGE = (0xd800, 0xdfff) |
| |
| UnicodeData = namedtuple( |
| "UnicodeData", ( |
| # Conversions: |
| "to_upper", "to_lower", "to_title", |
| |
        # Decompositions: canonical and compatibility decompositions
| "canon_decomp", "compat_decomp", |
| |
| # Grouped: general categories and combining characters |
| "general_categories", "combines", |
| ) |
| ) |
| |
| UnicodeVersion = namedtuple( |
| "UnicodeVersion", ("major", "minor", "micro", "as_str") |
| ) |
| |
| |
| def fetch_files(version=None): |
    # type: (Optional[str]) -> UnicodeVersion
| """ |
| Fetch all the Unicode files from unicode.org. |
| |
    This will use cached files (stored in `FETCH_DIR`) if they exist,
    downloading them if they don't. In any case, the Unicode version
    is always returned.

    :param version: The desired Unicode version, as a string.
        (If None, defaults to the latest final release available,
        querying the unicode.org service).
| """ |
| have_version = check_stored_version(version) |
| if have_version: |
| return have_version |
| |
    if version:
        # Fetch the exact version that was requested.
        get_fetch_url = lambda name: FETCH_URL_VERSION.format(version=version, filename=name)
    else:
        # Fetch the latest version available.
        get_fetch_url = lambda name: FETCH_URL_LATEST.format(filename=name)
| |
| readme_url = get_fetch_url(UnicodeFiles.README) |
| |
| print("Fetching: {}".format(readme_url)) |
| readme_content = subprocess.check_output(("curl", readme_url)) |
| |
| unicode_version = parse_readme_unicode_version( |
| readme_content.decode("utf8") |
| ) |
| |
| download_dir = get_unicode_dir(unicode_version) |
| if not os.path.exists(download_dir): |
| # For 2.7 compat, we don't use `exist_ok=True`. |
| os.makedirs(download_dir) |
| |
| for filename in ALL_UNICODE_FILES: |
| file_path = get_unicode_file_path(unicode_version, filename) |
| |
| if os.path.exists(file_path): |
            # Assume the file on the server didn't change if it has been saved before.
| continue |
| |
| if filename == UnicodeFiles.README: |
| with open(file_path, "wb") as fd: |
| fd.write(readme_content) |
| else: |
| url = get_fetch_url(filename) |
| print("Fetching: {}".format(url)) |
| subprocess.check_call(("curl", "-o", file_path, url)) |
| |
| return unicode_version |
| |
| |
| def check_stored_version(version): |
| # type: (Optional[str]) -> Optional[UnicodeVersion] |
| """ |
    Given the desired Unicode version, return that version
    if all the stored files are present, and `None` otherwise.
| """ |
| if not version: |
        # If no version was specified, we have to query the server
        # for the latest one, so skip the stored-files check.
| return None |
| |
| fetch_dir = os.path.join(FETCH_DIR, version) |
| |
| for filename in ALL_UNICODE_FILES: |
| file_path = os.path.join(fetch_dir, filename) |
| |
| if not os.path.exists(file_path): |
| return None |
| |
| with open(os.path.join(fetch_dir, UnicodeFiles.README)) as fd: |
| return parse_readme_unicode_version(fd.read()) |
| |
| |
| def parse_readme_unicode_version(readme_content): |
| # type: (str) -> UnicodeVersion |
| """ |
    Parse the Unicode version contained in the `ReadMe.txt` file.
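
    A doctest on a shortened, hypothetical excerpt of such a file
    (the real file contains a sentence of this form):

    >>> parse_readme_unicode_version(
    ...     "... for Version 12.1.0 of the Unicode Standard. ...")
    UnicodeVersion(major=12, minor=1, micro=0, as_str='12.1.0')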
| """ |
| # "Raw string" is necessary for \d not being treated as escape char |
| # (for the sake of compat with future Python versions). |
| # See: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior |
| pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode" |
| groups = re.search(pattern, readme_content).groups() |
| |
| return UnicodeVersion(*map(int, groups), as_str=".".join(groups)) |
| |
| |
| def get_unicode_dir(unicode_version): |
| # type: (UnicodeVersion) -> str |
| """ |
    Return the parent directory where the Unicode data files are stored.
| |
| This returns a full, absolute path. |
| """ |
| return os.path.join(FETCH_DIR, unicode_version.as_str) |
| |
| |
| def get_unicode_file_path(unicode_version, filename): |
| # type: (UnicodeVersion, str) -> str |
| """ |
    Return the full path at which the given Unicode data file is stored.
| """ |
| return os.path.join(get_unicode_dir(unicode_version), filename) |
| |
| |
| def is_surrogate(n): |
| # type: (int) -> bool |
| """ |
    Tell whether the given codepoint is a surrogate (not a valid Rust character).
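
    For example:

    >>> is_surrogate(0xd800)
    True
    >>> is_surrogate(0xe000)
    False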
| """ |
| return SURROGATE_CODEPOINTS_RANGE[0] <= n <= SURROGATE_CODEPOINTS_RANGE[1] |
| |
| |
| def load_unicode_data(file_path): |
| # type: (str) -> UnicodeData |
| """ |
| Load main Unicode data. |
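
    Each data line in `UnicodeData.txt` has 15 semicolon-separated fields;
    for example, the entry for U+0041 looks like:

        0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;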
| """ |
| # Conversions |
| to_lower = {} # type: Dict[int, Tuple[int, int, int]] |
| to_upper = {} # type: Dict[int, Tuple[int, int, int]] |
| to_title = {} # type: Dict[int, Tuple[int, int, int]] |
| |
| # Decompositions |
| compat_decomp = {} # type: Dict[int, List[int]] |
| canon_decomp = {} # type: Dict[int, List[int]] |
| |
| # Combining characters |
| # FIXME: combines are not used |
| combines = defaultdict(set) # type: Dict[str, Set[int]] |
| |
| # Categories |
| general_categories = defaultdict(set) # type: Dict[str, Set[int]] |
| category_assigned_codepoints = set() # type: Set[int] |
| |
| all_codepoints = {} |
| |
| range_start = -1 |
| |
| for line in fileinput.input(file_path): |
| data = line.split(";") |
| if len(data) != 15: |
| continue |
| codepoint = int(data[0], 16) |
| if is_surrogate(codepoint): |
| continue |
        if range_start >= 0:
            # We are inside a "<..., First>"/"<..., Last>" range: fill in
            # every codepoint before the current ("Last") one; the "Last"
            # codepoint itself is stored by the assignment below.
            for i in range(range_start, codepoint):
                all_codepoints[i] = data
            range_start = -1
        if data[1].endswith(", First>"):
            # A "<..., First>" entry opens a codepoint range that is
            # terminated by a matching "<..., Last>" entry.
            range_start = codepoint
            continue
| all_codepoints[codepoint] = data |
| |
| for code, data in all_codepoints.items(): |
| (code_org, name, gencat, combine, bidi, |
| decomp, deci, digit, num, mirror, |
| old, iso, upcase, lowcase, titlecase) = data |
| |
        # Generate the direct, simple char-to-char case conversions:
| |
| # Uppercase to lowercase |
| if lowcase != "" and code_org != lowcase: |
| to_lower[code] = (int(lowcase, 16), 0, 0) |
| |
| # Lowercase to uppercase |
| if upcase != "" and code_org != upcase: |
| to_upper[code] = (int(upcase, 16), 0, 0) |
| |
| # Title case |
| if titlecase.strip() != "" and code_org != titlecase: |
| to_title[code] = (int(titlecase, 16), 0, 0) |
| |
| # Store decomposition, if given |
| if decomp: |
| decompositions = decomp.split()[1:] |
| decomp_code_points = [int(i, 16) for i in decompositions] |
| |
| if decomp.startswith("<"): |
| # Compatibility decomposition |
| compat_decomp[code] = decomp_code_points |
| else: |
| # Canonical decomposition |
| canon_decomp[code] = decomp_code_points |
| |
        # Place the codepoint in its general category, as well as in
        # every grouped category it expands to.
| for cat in itertools.chain((gencat, ), EXPANDED_CATEGORIES.get(gencat, [])): |
| general_categories[cat].add(code) |
| category_assigned_codepoints.add(code) |
| |
| # Record combining class, if any. |
| if combine != "0": |
| combines[combine].add(code) |
| |
| # Generate Not_Assigned from Assigned. |
| general_categories["Cn"] = get_unassigned_codepoints(category_assigned_codepoints) |
| |
    # The "Other" group also contains the Not_Assigned codepoints.
| general_categories["C"].update(general_categories["Cn"]) |
| |
| grouped_categories = group_categories(general_categories) |
| |
| # FIXME: combines are not used |
| return UnicodeData( |
| to_lower=to_lower, to_upper=to_upper, to_title=to_title, |
| compat_decomp=compat_decomp, canon_decomp=canon_decomp, |
| general_categories=grouped_categories, combines=combines, |
| ) |
| |
| |
| def load_special_casing(file_path, unicode_data): |
| # type: (str, UnicodeData) -> None |
| """ |
    Load special casing data and enrich the given Unicode data.
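
    Data lines in `SpecialCasing.txt` have the following form (the
    condition list is optional, comments start with "#"):

        <code>; <lower>; <title>; <upper>; (<condition_list>;)? # <comment>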
| """ |
| for line in fileinput.input(file_path): |
| data = line.split("#")[0].split(";") |
| if len(data) == 5: |
| code, lower, title, upper, _comment = data |
| elif len(data) == 6: |
| code, lower, title, upper, condition, _comment = data |
            if condition.strip():  # Only keep unconditional mappings.
| continue |
| else: |
| continue |
| code = code.strip() |
| lower = lower.strip() |
| title = title.strip() |
| upper = upper.strip() |
| key = int(code, 16) |
| for (map_, values) in ((unicode_data.to_lower, lower), |
| (unicode_data.to_upper, upper), |
| (unicode_data.to_title, title)): |
| if values != code: |
| split = values.split() |
| |
| codepoints = list(itertools.chain( |
| (int(i, 16) for i in split), |
| (0 for _ in range(len(split), 3)) |
| )) |
| |
                assert len(codepoints) == 3
                # Store a tuple, matching the declared value type.
                map_[key] = tuple(codepoints)
| |
| |
| def group_categories(mapping): |
| # type: (Dict[Any, Iterable[int]]) -> Dict[str, List[Tuple[int, int]]] |
| """ |
    Group the codepoints of each category into ranges.
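
    For example:

    >>> group_categories({"N": [3, 1, 2, 10]})
    {'N': [(1, 3), (10, 10)]}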
| """ |
| return {category: group_codepoints(codepoints) |
| for category, codepoints in mapping.items()} |
| |
| |
| def group_codepoints(codepoints): |
| # type: (Iterable[int]) -> List[Tuple[int, int]] |
| """ |
    Group integral values into contiguous, disjoint value ranges.
| |
| Performs value deduplication. |
| |
| :return: sorted list of pairs denoting start and end of codepoint |
| group values, both ends inclusive. |
| |
| >>> group_codepoints([1, 2, 10, 11, 12, 3, 4]) |
| [(1, 4), (10, 12)] |
| >>> group_codepoints([1]) |
| [(1, 1)] |
| >>> group_codepoints([1, 5, 6]) |
| [(1, 1), (5, 6)] |
| >>> group_codepoints([]) |
| [] |
| """ |
| sorted_codes = sorted(set(codepoints)) |
| result = [] # type: List[Tuple[int, int]] |
| |
| if not sorted_codes: |
| return result |
| |
| next_codes = sorted_codes[1:] |
| start_code = sorted_codes[0] |
| |
| for code, next_code in zip_longest(sorted_codes, next_codes, fillvalue=None): |
| if next_code is None or next_code - code != 1: |
| result.append((start_code, code)) |
| start_code = next_code |
| |
| return result |
| |
| |
| def ungroup_codepoints(codepoint_pairs): |
| # type: (Iterable[Tuple[int, int]]) -> List[int] |
| """ |
| The inverse of group_codepoints -- produce a flat list of values |
| from value range pairs. |
| |
| >>> ungroup_codepoints([(1, 4), (10, 12)]) |
| [1, 2, 3, 4, 10, 11, 12] |
| >>> ungroup_codepoints([(1, 1), (5, 6)]) |
| [1, 5, 6] |
| >>> ungroup_codepoints(group_codepoints([1, 2, 7, 8])) |
| [1, 2, 7, 8] |
| >>> ungroup_codepoints([]) |
| [] |
| """ |
| return list(itertools.chain.from_iterable( |
| range(lo, hi + 1) for lo, hi in codepoint_pairs |
| )) |
| |
| |
| def get_unassigned_codepoints(assigned_codepoints): |
| # type: (Set[int]) -> Set[int] |
| """ |
    Given a set of "assigned" codepoints, return the set of all
    codepoints that are neither assigned nor surrogates.
| """ |
| return {i for i in range(0, 0x110000) |
| if i not in assigned_codepoints and not is_surrogate(i)} |
| |
| |
| def generate_table_lines(items, indent, wrap=98): |
| # type: (Iterable[str], int, int) -> Iterator[str] |
| """ |
    Given table items, generate comma-separated lines wrapped to the given width.
| |
| This is a generator function. |
| |
| :param wrap: soft wrap limit (characters per line), integer. |
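
    A minimal example:

    >>> list(generate_table_lines(["1", "2", "3"], 4))
    ['    1, 2, 3']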
| """ |
| line = " " * indent |
| first = True |
| for item in items: |
| if len(line) + len(item) < wrap: |
| if first: |
| line += item |
| else: |
| line += ", " + item |
| first = False |
| else: |
| yield line + ",\n" |
| line = " " * indent + item |
| |
| yield line |
| |
| |
| def load_properties(file_path, interesting_props): |
| # type: (str, Iterable[str]) -> Dict[str, List[Tuple[int, int]]] |
| """ |
    Load property data and return it in grouped (codepoint range) form.
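
    Property data lines have one of two forms, a single codepoint or a
    range (everything after a "#" is a comment); for example:

        00AA          ; Alphabetic
        0009..000D    ; White_Space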
| """ |
| props = defaultdict(list) # type: Dict[str, List[Tuple[int, int]]] |
| # "Raw string" is necessary for `\.` and `\w` not to be treated as escape chars |
| # (for the sake of compat with future Python versions). |
| # See: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior |
| re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)") |
| re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)") |
| |
| for line in fileinput.input(file_path): |
| match = re1.match(line) or re2.match(line) |
| if match: |
| groups = match.groups() |
| |
| if len(groups) == 2: |
| # `re1` matched (2 groups). |
| d_lo, prop = groups |
| d_hi = d_lo |
| else: |
| d_lo, d_hi, prop = groups |
| else: |
| continue |
| |
| if interesting_props and prop not in interesting_props: |
| continue |
| |
| lo_value = int(d_lo, 16) |
| hi_value = int(d_hi, 16) |
| |
| props[prop].append((lo_value, hi_value)) |
| |
    # Normalize: deduplicate codepoints and merge adjacent ranges.
| for prop in props: |
| props[prop] = group_codepoints(ungroup_codepoints(props[prop])) |
| |
| return props |
| |
| |
| def escape_char(c): |
| # type: (int) -> str |
| r""" |
    Escape a codepoint for use as a Rust char literal.

    The output is valid Rust source for a char literal,
    including the surrounding quotes.
| |
| >>> escape_char(97) |
| "'\\u{61}'" |
| >>> escape_char(0) |
| "'\\0'" |
| """ |
| return r"'\u{%x}'" % c if c != 0 else r"'\0'" |
| |
| |
def format_char_pair(pair):
    # type: (Tuple[int, int]) -> str
    r"""
    Format a pair of Rust chars as a Rust tuple literal.
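
    For example:

    >>> format_char_pair((97, 98))
    "('\\u{61}','\\u{62}')"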
| """ |
| return "(%s,%s)" % (escape_char(pair[0]), escape_char(pair[1])) |
| |
| |
| def generate_table( |
| name, # type: str |
| items, # type: List[Tuple[int, int]] |
| decl_type="&[(char, char)]", # type: str |
| is_pub=True, # type: bool |
| format_item=format_char_pair, # type: Callable[[Tuple[int, int]], str] |
| ): |
| # type: (...) -> Iterator[str] |
| """ |
    Generate a nicely formatted Rust constant "table" array.

    This yields fragments of actual Rust code.
| """ |
| pub_string = "" |
| if is_pub: |
| pub_string = "pub " |
| |
| yield "\n" |
| yield " #[rustfmt::skip]\n" |
| yield " %sconst %s: %s = &[\n" % (pub_string, name, decl_type) |
| |
    # Join the formatted items with commas, then re-split on every comma
    # so the wrapping below can break lines between any two fragments
    # (even inside a single formatted pair).
    data = []
    first = True
    for item in items:
        if not first:
            data.append(",")
        first = False
        data.append(format_item(item))
| |
| for table_line in generate_table_lines("".join(data).split(","), 8): |
| yield table_line |
| |
| yield "\n ];\n" |
| |
| |
| def compute_trie(raw_data, chunk_size): |
| # type: (List[int], int) -> Tuple[List[int], List[int]] |
| """ |
    Compute a postfix-compressed trie.
| |
| See: bool_trie.rs for more details. |
| |
| >>> compute_trie([1, 2, 3, 1, 2, 3, 4, 5, 6], 3) |
| ([0, 0, 1], [1, 2, 3, 4, 5, 6]) |
| >>> compute_trie([1, 2, 3, 1, 2, 4, 4, 5, 6], 3) |
| ([0, 1, 2], [1, 2, 3, 1, 2, 4, 4, 5, 6]) |
| """ |
| root = [] |
| childmap = {} # type: Dict[Tuple[int, ...], int] |
| child_data = [] |
| |
    assert len(raw_data) % chunk_size == 0, "Data length must be a multiple of chunk size"
| |
| for i in range(len(raw_data) // chunk_size): |
| data = raw_data[i * chunk_size : (i + 1) * chunk_size] |
| |
| # Postfix compression of child nodes (data chunks) |
| # (identical child nodes are shared). |
| |
| # Make a tuple out of the list so it's hashable. |
| child = tuple(data) |
| if child not in childmap: |
| childmap[child] = len(childmap) |
| child_data.extend(data) |
| |
| root.append(childmap[child]) |
| |
| return root, child_data |
| |
| |
| def generate_bool_trie(name, codepoint_ranges, is_pub=False): |
| # type: (str, List[Tuple[int, int]], bool) -> Iterator[str] |
| """ |
    Generate Rust code for a `BoolTrie` struct.
| |
| This yields string fragments that should be joined to produce |
| the final string. |
| |
| See: `bool_trie.rs`. |
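
    A sketch of the trie layout, as inferred from the code below:

    - `r1`: one 64-bit chunk per 64 codepoints, covering 0x0..0x800;
    - `r2`/`r3`: a two-level trie covering 0x800..0x10000;
    - `r4`/`r5`/`r6`: a three-level trie covering 0x10000..0x110000.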
| """ |
| chunk_size = 64 |
| rawdata = [False] * 0x110000 |
| for (lo, hi) in codepoint_ranges: |
| for cp in range(lo, hi + 1): |
| rawdata[cp] = True |
| |
| # Convert to bitmap chunks of `chunk_size` bits each. |
| chunks = [] |
| for i in range(0x110000 // chunk_size): |
| chunk = 0 |
| for j in range(chunk_size): |
| if rawdata[i * chunk_size + j]: |
| chunk |= 1 << j |
| chunks.append(chunk) |
| |
| pub_string = "" |
| if is_pub: |
| pub_string = "pub " |
| |
| yield "\n" |
| yield " #[rustfmt::skip]\n" |
| yield " %sconst %s: &super::BoolTrie = &super::BoolTrie {\n" % (pub_string, name) |
| yield " r1: [\n" |
| data = ("0x%016x" % chunk for chunk in chunks[:0x800 // chunk_size]) |
| for fragment in generate_table_lines(data, 12): |
| yield fragment |
| yield "\n ],\n" |
| |
| # 0x800..0x10000 trie |
| (r2, r3) = compute_trie(chunks[0x800 // chunk_size : 0x10000 // chunk_size], 64 // chunk_size) |
| yield " r2: [\n" |
| data = map(str, r2) |
| for fragment in generate_table_lines(data, 12): |
| yield fragment |
| yield "\n ],\n" |
| |
| yield " r3: &[\n" |
| data = ("0x%016x" % node for node in r3) |
| for fragment in generate_table_lines(data, 12): |
| yield fragment |
| yield "\n ],\n" |
| |
| # 0x10000..0x110000 trie |
| (mid, r6) = compute_trie(chunks[0x10000 // chunk_size : 0x110000 // chunk_size], |
| 64 // chunk_size) |
| (r4, r5) = compute_trie(mid, 64) |
| |
| yield " r4: [\n" |
| data = map(str, r4) |
| for fragment in generate_table_lines(data, 12): |
| yield fragment |
| yield "\n ],\n" |
| |
| yield " r5: &[\n" |
| data = map(str, r5) |
| for fragment in generate_table_lines(data, 12): |
| yield fragment |
| yield "\n ],\n" |
| |
| yield " r6: &[\n" |
| data = ("0x%016x" % node for node in r6) |
| for fragment in generate_table_lines(data, 12): |
| yield fragment |
| yield "\n ],\n" |
| |
| yield " };\n" |
| |
| |
| def generate_small_bool_trie(name, codepoint_ranges, is_pub=False): |
| # type: (str, List[Tuple[int, int]], bool) -> Iterator[str] |
| """ |
    Generate Rust code for a `SmallBoolTrie` struct.
| |
| See: `bool_trie.rs`. |
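
    The layout is simpler than `BoolTrie`'s: `r1` maps each 64-codepoint
    chunk to an index into `r2`, which stores the deduplicated 64-bit
    chunks themselves.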
| """ |
| last_chunk = max(hi // 64 for (lo, hi) in codepoint_ranges) |
| n_chunks = last_chunk + 1 |
| chunks = [0] * n_chunks |
| for (lo, hi) in codepoint_ranges: |
| for cp in range(lo, hi + 1): |
| assert cp // 64 < len(chunks) |
| chunks[cp // 64] |= 1 << (cp & 63) |
| |
| pub_string = "" |
| if is_pub: |
| pub_string = "pub " |
| |
| yield "\n" |
| yield " #[rustfmt::skip]\n" |
| yield (" %sconst %s: &super::SmallBoolTrie = &super::SmallBoolTrie {\n" |
| % (pub_string, name)) |
| |
| (r1, r2) = compute_trie(chunks, 1) |
| |
| yield " r1: &[\n" |
| data = (str(node) for node in r1) |
| for fragment in generate_table_lines(data, 12): |
| yield fragment |
| yield "\n ],\n" |
| |
| yield " r2: &[\n" |
| data = ("0x%016x" % node for node in r2) |
| for fragment in generate_table_lines(data, 12): |
| yield fragment |
| yield "\n ],\n" |
| |
| yield " };\n" |
| |
| |
| def generate_property_module(mod, grouped_categories, category_subset): |
| # type: (str, Dict[str, List[Tuple[int, int]]], Iterable[str]) -> Iterator[str] |
| """ |
    Generate Rust code for a module defining properties.
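
    For each category in `category_subset`, this emits a `<category>_table`
    trie constant plus a `pub fn <category>(c: char) -> bool` lookup function.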
| """ |
| |
| yield "pub(crate) mod %s {" % mod |
| for cat in sorted(category_subset): |
| if cat in ("Cc", "White_Space"): |
| generator = generate_small_bool_trie("%s_table" % cat, grouped_categories[cat]) |
| else: |
| generator = generate_bool_trie("%s_table" % cat, grouped_categories[cat]) |
| |
| for fragment in generator: |
| yield fragment |
| |
| yield "\n" |
| yield " pub fn %s(c: char) -> bool {\n" % cat |
| yield " %s_table.lookup(c)\n" % cat |
| yield " }\n" |
| |
| yield "}\n\n" |
| |
| |
| def generate_conversions_module(unicode_data): |
| # type: (UnicodeData) -> Iterator[str] |
| """ |
    Generate Rust code for a module defining conversions.
| """ |
| |
| yield "pub(crate) mod conversions {" |
| yield """ |
| pub fn to_lower(c: char) -> [char; 3] { |
| match bsearch_case_table(c, to_lowercase_table) { |
| None => [c, '\\0', '\\0'], |
| Some(index) => to_lowercase_table[index].1, |
| } |
| } |
| |
| pub fn to_upper(c: char) -> [char; 3] { |
| match bsearch_case_table(c, to_uppercase_table) { |
| None => [c, '\\0', '\\0'], |
| Some(index) => to_uppercase_table[index].1, |
| } |
| } |
| |
| fn bsearch_case_table(c: char, table: &[(char, [char; 3])]) -> Option<usize> { |
| table.binary_search_by(|&(key, _)| key.cmp(&c)).ok() |
| }\n""" |
| |
| decl_type = "&[(char, [char; 3])]" |
| format_conversion = lambda x: "({},[{},{},{}])".format(*( |
| escape_char(c) for c in (x[0], x[1][0], x[1][1], x[1][2]) |
| )) |
| |
| for fragment in generate_table( |
| name="to_lowercase_table", |
| items=sorted(unicode_data.to_lower.items(), key=lambda x: x[0]), |
| decl_type=decl_type, |
| is_pub=False, |
| format_item=format_conversion |
| ): |
| yield fragment |
| |
| for fragment in generate_table( |
| name="to_uppercase_table", |
| items=sorted(unicode_data.to_upper.items(), key=lambda x: x[0]), |
| decl_type=decl_type, |
| is_pub=False, |
| format_item=format_conversion |
| ): |
| yield fragment |
| |
| yield "}\n" |
| |
| |
| def parse_args(): |
| # type: () -> argparse.Namespace |
| """ |
| Parse command line arguments. |
| """ |
| parser = argparse.ArgumentParser(description=__doc__) |
| parser.add_argument("-v", "--version", default=None, type=str, |
| help="Unicode version to use (if not specified," |
| " defaults to latest release).") |
| |
| return parser.parse_args() |
| |
| |
| def main(): |
| # type: () -> None |
| """ |
| Script entry point. |
| """ |
| args = parse_args() |
| |
| unicode_version = fetch_files(args.version) |
| print("Using Unicode version: {}".format(unicode_version.as_str)) |
| |
    # All the writing happens in memory; we only write to the file once
    # the full content has been generated (it's not very large, < 1 MB).
| buf = StringIO() |
| buf.write(PREAMBLE) |
| |
| unicode_version_notice = textwrap.dedent(""" |
| /// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of |
| /// `char` and `str` methods are based on. |
| #[unstable(feature = "unicode_version", issue = "49726")] |
| pub const UNICODE_VERSION: UnicodeVersion = |
| UnicodeVersion {{ major: {v.major}, minor: {v.minor}, micro: {v.micro}, _priv: () }}; |
| """).format(v=unicode_version) |
| buf.write(unicode_version_notice) |
| |
| get_path = lambda f: get_unicode_file_path(unicode_version, f) |
| |
| unicode_data = load_unicode_data(get_path(UnicodeFiles.UNICODE_DATA)) |
| load_special_casing(get_path(UnicodeFiles.SPECIAL_CASING), unicode_data) |
| |
| want_derived = {"Alphabetic", "Lowercase", "Uppercase", |
| "Cased", "Case_Ignorable", "Grapheme_Extend"} |
| derived = load_properties(get_path(UnicodeFiles.DERIVED_CORE_PROPERTIES), want_derived) |
| |
| props = load_properties(get_path(UnicodeFiles.PROPS), |
| {"White_Space", "Join_Control", "Noncharacter_Code_Point"}) |
| |
| # Category tables |
| for (name, categories, category_subset) in ( |
| ("general_category", unicode_data.general_categories, ["N", "Cc"]), |
| ("derived_property", derived, want_derived), |
| ("property", props, ["White_Space"]) |
| ): |
| for fragment in generate_property_module(name, categories, category_subset): |
| buf.write(fragment) |
| |
| for fragment in generate_conversions_module(unicode_data): |
| buf.write(fragment) |
| |
| tables_rs_path = os.path.join(THIS_DIR, "tables.rs") |
| |
| # Actually write out the file content. |
| # Will overwrite the file if it exists. |
| with open(tables_rs_path, "w") as fd: |
| fd.write(buf.getvalue()) |
| |
| print("Regenerated tables.rs.") |
| |
| |
| if __name__ == "__main__": |
| main() |