| # Copyright 2013-2014 The rust-url developers. |
| # |
| # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
| # option. This file may not be copied, modified, or distributed |
| # except according to those terms. |
| |
| # Run as: python make_uts46_mapping_table.py IdnaMappingTable.txt > uts46_mapping_table.rs |
| # You can get the latest idna table from |
| # http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt |
| |
from __future__ import print_function
import collections
import itertools
import sys
| |
# Emit the license header of the generated Rust file. The trailing
# "Generated by" line names this script so readers of the output know
# where the tables come from.
print('''\
// Copyright 2013-2014 The rust-url developers.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// Generated by make_uts46_mapping_table.py
''')
| |
# Open the mapping table. The path may be given as the first command-line
# argument; without one, IdnaMappingTable.txt in the current directory is
# used, as before.
txt = open(sys.argv[1] if len(sys.argv) > 1 else "IdnaMappingTable.txt")
| |
def escape_char(c):
    """Return the Rust `\\u{...}` escape for the first character of *c*.

    The code point is written in lowercase hexadecimal without padding.
    """
    code_point = ord(c[0])
    return "\\u{{{:x}}}".format(code_point)
| |
def char(s):
    """Return the character whose code point is the hexadecimal string *s*.

    Works on both Python 2 (where the builtin is `unichr`) and Python 3
    (where `unichr` no longer exists and `chr` returns a text character).
    """
    code_point = int(s, 16)
    try:
        return unichr(code_point)  # Python 2
    except NameError:
        return chr(code_point)  # Python 3
| |
# Strings already placed in the string table, in insertion order, each
# mapped to its (byte offset, UTF-8 byte length) pair.
strtab = collections.OrderedDict()
# Byte offset at which the next new string will be placed.
strtab_offset = 0


def strtab_slice(s):
    """Return the (byte offset, byte length) slice of *s* in the string table.

    A string seen before gets its previously assigned slice back. A new
    string is appended at the current end of the table, measured in UTF-8
    bytes, and the running offset advances by its length.
    """
    global strtab_offset

    known = strtab.get(s)
    if known is not None:
        return known

    byte_len = len(s.encode('utf8'))
    entry = (strtab_offset, byte_len)
    strtab_offset += byte_len
    strtab[s] = entry
    return entry
| |
def rust_slice(s):
    """Format a (byte offset, byte length) pair as a Rust `StringTableSlice`.

    The offset is split into a low and a high byte. Both the high byte and
    the length must be at most 255, otherwise an AssertionError is raised.
    """
    start, length = s[0], s[1]
    start_hi, start_lo = divmod(start, 256)
    assert length <= 255
    assert start_hi <= 255
    return "(StringTableSlice { byte_start_lo: %d, byte_start_hi: %d, byte_len: %d })" % (
        start_lo, start_hi, length)
| |
# Parsed table rows as (first, last, mapping, unicode_str) tuples, where
# first/last are hexadecimal code point strings, mapping is the status in
# CamelCase (e.g. "disallowed_STD3_valid" becomes "DisallowedStd3Valid") and
# unicode_str is the replacement text or None when the row carries none.
ranges = []

try:
    for line in txt:
        # remove comments
        line, _, _ = line.partition('#')
        # skip empty lines
        if not line.strip():
            continue
        fields = line.split(';')
        if fields[0].strip() == 'D800..DFFF':
            continue  # Surrogates don't occur in Rust strings.
        # A row covers either a single code point or a "first..last" range.
        first, _, last = fields[0].strip().partition('..')
        if not last:
            last = first
        mapping = fields[1].strip().replace('_', ' ').title().replace(' ', '')
        unicode_str = None
        if len(fields) > 2:
            if fields[2].strip():
                # split() tolerates repeated spaces between code points.
                unicode_str = u''.join(char(c) for c in fields[2].split())
            elif mapping == "Deviation":
                # A deviation with an empty third field maps to the empty string.
                unicode_str = u''
        ranges.append((first, last, mapping, unicode_str))
finally:
    # The table has been fully consumed; do not leak the file handle.
    txt.close()
| |
def mergeable_key(r):
    """Return the grouping key used to merge adjacent rows.

    Rows whose mapping carries associated data ('Mapped', 'Deviation',
    'DisallowedStd3Mapped') are keyed by the whole row, so they are not
    merged with their neighbours. All other rows are keyed by the mapping
    name alone. An unknown mapping name raises AssertionError.
    """
    kind = r[2]

    if kind not in ('Mapped', 'Deviation', 'DisallowedStd3Mapped'):
        assert kind in ('Valid', 'Ignored', 'Disallowed', 'DisallowedStd3Valid')
        return kind

    # These types have associated data, so we should not merge them.
    return r
| |
# Group adjacent rows that share a merge key (see mergeable_key).
grouped_ranges = itertools.groupby(ranges, key=mergeable_key)

# Rows after collapsing each run of mergeable neighbours into one range.
optimized_ranges = []

for (_key, members) in grouped_ranges:
    group = list(members)
    if len(group) == 1:
        optimized_ranges.append(group[0])
        continue
    # Assert that nothing in the group has an associated unicode string:
    # the merged range keeps only one string, so any other would be lost.
    for member in group:
        assert member[3] is None
    # Assert that consecutive members of the group don't leave gaps in
    # the codepoint space.
    for (prev, following) in zip(group, group[1:]):
        last_char = int(prev[1], 16)
        next_char = int(following[0], 16)
        if last_char + 1 == next_char:
            continue
        # There's a gap where surrogates would appear, but we don't have to
        # worry about that gap, as surrogates never appear in Rust strings.
        # Assert we're seeing the surrogate case here.
        assert last_char == 0xd7ff
        assert next_char == 0xe000
    # The merged range spans from the first member's start to the last
    # member's end and takes its mapping from the first member.
    first = group[0][0]
    last = group[-1][1]
    mapping = group[0][2]
    unicode_str = group[0][3]
    optimized_ranges.append((first, last, mapping, unicode_str))
| |
def is_single_char_range(r):
    """Return True when the range tuple *r* starts and ends on the same code point."""
    (first, last, _, _) = r
    return first == last


# We can reduce the size of the character range table and the index table to about 1/4
# by merging runs of single character ranges and using character offsets from the start
# of that range to retrieve the correct `Mapping` value
def merge_single_char_ranges(ranges):
    """Yield lists of ranges, grouping each run of single-character ranges.

    Consecutive single-character ranges are yielded together as one list;
    every multi-character range is yielded as a list of its own. Input
    order is preserved. Empty input yields nothing.
    """
    current = []
    for r in ranges:
        # A block stays open only while both its last member and the
        # incoming range are single-character ranges.
        if current and not (is_single_char_range(current[-1])
                            and is_single_char_range(r)):
            yield current
            current = []
        current.append(r)
    # Flush the trailing block, but never emit an empty one.
    if current:
        yield current
| |
# Regroup the rows into blocks: each block is a list holding either one
# range or a run of consecutive single-character ranges.
optimized_ranges = list(merge_single_char_ranges(optimized_ranges))


# Emit one `Range` entry per block, spanning from the start of the block's
# first range to the end of its last range.
print("static TABLE: &'static [Range] = &[")

for block in optimized_ranges:
    block_from = escape_char(char(block[0][0]))
    block_to = escape_char(char(block[-1][1]))
    print(" Range { from: '%s', to: '%s', }," % (block_from, block_to))

print("];\n")
| |
# Emit, for every block, the index of its first entry in the mapping table.
print("static INDEX_TABLE: &'static [u16] = &[")

# High bit of a u16 index entry; set when the block holds exactly one range.
SINGLE_MARKER = 1 << 15

offset = 0
for block in optimized_ranges:
    # The offset must leave the marker bit free.
    assert offset < SINGLE_MARKER

    entry = offset
    if len(block) == 1:
        entry |= SINGLE_MARKER
    print(" %s," % entry)
    offset += len(block)

print("];\n")
| |
# Emit one `Mapping` entry per range, in block order. Ranges that carry a
# replacement string get the string-table slice appended to the mapping name.
print("static MAPPING_TABLE: &'static [Mapping] = &[")

for (_first, _last, mapping, unicode_str) in itertools.chain.from_iterable(optimized_ranges):
    if unicode_str is None:
        entry = mapping
    else:
        entry = mapping + rust_slice(strtab_slice(unicode_str))
    print(" %s," % entry)

print("];\n")
| |
def escape_str(s):
    """Return a list with the Rust escape of every character of *s*, in order."""
    return list(map(escape_char, s))
| |
# Emit the string table: the escapes of every character of every interned
# string, in insertion order, one escape per line of the Rust literal.
# Iterating the dict directly yields its keys on both Python 2 and 3.
print("static STRING_TABLE: &'static str = \"%s\";"
      % '\\\n '.join(itertools.chain.from_iterable(escape_str(s) for s in strtab)))