| # ===--- GYBUnicodeDataUtils.py ----------------------*- coding: utf-8 -*-===// |
| # |
| # This source file is part of the Swift.org open source project |
| # |
| # Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors |
| # Licensed under Apache License v2.0 with Runtime Library Exception |
| # |
| # See https://swift.org/LICENSE.txt for license information |
| # See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors |
| |
| import codecs |
| import re |
| |
| |
class UnicodeProperty(object):
    """Abstract base class for Unicode properties.

    Concrete subclasses map code points to symbolic property values and
    provide a symbolic-to-numeric value translation.
    """

    def __init__(self):
        """Subclasses must provide their own initializer."""
        raise NotImplementedError(
            "UnicodeProperty.__init__ is not implemented.")

    def get_default_value(self):
        """Return the property value assigned to code points that the data
        source does not mention explicitly."""
        raise NotImplementedError(
            "UnicodeProperty.get_default_value is not implemented.")

    def get_value(self, cp):
        """Return the symbolic property value for code point 'cp'."""
        raise NotImplementedError(
            "UnicodeProperty.get_value is not implemented.")

    def get_numeric_value(self, cp):
        """Return the numeric property value for code point 'cp'."""
        raise NotImplementedError(
            "UnicodeProperty.get_numeric_value is not implemented.")

    def to_numeric_value(self, value):
        """Map a symbolic property value to its numeric encoding."""
        raise NotImplementedError(
            "UnicodeProperty.to_numeric_value is not implemented.")
| |
| |
class GraphemeClusterBreakPropertyTable(UnicodeProperty):
    """Grapheme_Cluster_Break property, loaded from the UCD's
    GraphemeBreakProperty.txt data file."""

    # Note: Numeric values (including the names) should be consistent with
    # '_GraphemeClusterBreakPropertyValue' enum on the Swift side, and with
    # 'GraphemeClusterBreakProperty' in the compiler C++ code. If there is a
    # reason for either of those to differ, then this mapping can be overridden
    # after an instance of this class is created.
    numeric_value_table = {
        'Other': 0,
        'CR': 1,
        'LF': 2,
        'Control': 3,
        'Extend': 4,
        'Regional_Indicator': 5,
        'Prepend': 6,
        'SpacingMark': 7,
        'L': 8,
        'V': 9,
        'T': 10,
        'LV': 11,
        'LVT': 12,
    }

    def __init__(self, grapheme_break_property_file_name):
        """Parse 'grapheme_break_property_file_name' (GraphemeBreakProperty.txt
        from the Unicode Character Database) and build lookup structures."""
        # An array of tuples (start_code_point, end_code_point, value).
        #
        # Note: this and 'property_values' used to be mutable *class*
        # attributes; every instance shared them, so creating a second table
        # appended to (and re-applied) the first table's ranges.  They are
        # instance attributes now.
        self.property_value_ranges = []

        # A flat code-point-indexed lookup table, filled below.
        self.property_values = [None] * 0x110000

        # Build 'self.symbolic_values' -- an array that maps numeric property
        # values to symbolic values.
        self.symbolic_values = \
            [None] * (max(self.numeric_value_table.values()) + 1)
        for k, v in self.numeric_value_table.items():
            self.symbolic_values[v] = k

        # Load the data file.
        with codecs.open(
                grapheme_break_property_file_name,
                encoding='utf-8',
                errors='strict') as f:
            for line in f:
                # Strip comments.
                line = re.sub('#.*', '', line)

                # Single code point?  Allow '_' in the value name
                # (e.g. 'Regional_Indicator'), same as in the range match
                # below.
                m = re.match(r'([0-9A-F]+) +; +([a-zA-Z_]+) ', line)
                if m:
                    code_point = int(m.group(1), 16)
                    value = m.group(2)
                    self.property_value_ranges += \
                        [(code_point, code_point, value)]
                    continue

                # Range of code points?  The '..' separator is escaped so
                # that the dots match literally rather than any characters.
                m = re.match(
                    r'([0-9A-F]+)\.\.([0-9A-F]+) +; +([a-zA-Z_]+) ', line)
                if m:
                    start_code_point = int(m.group(1), 16)
                    end_code_point = int(m.group(2), 16)
                    value = m.group(3)
                    self.property_value_ranges += \
                        [(start_code_point, end_code_point, value)]

        # Prepare a flat lookup table for fast access.
        for cp in range(0, 0x110000):
            self.property_values[cp] = self.get_default_value()

        for start_code_pt, end_code_pt, val in self.property_value_ranges:
            for cp in range(start_code_pt, end_code_pt + 1):
                self.property_values[cp] = val

    def get_default_value(self):
        """Return the value assigned to code points that the data file does
        not mention."""
        return 'Other'

    def get_value(self, cp):
        """Return the symbolic property value for code point 'cp'."""
        return self.property_values[cp]

    def to_numeric_value(self, value):
        """Map a symbolic property value to its numeric encoding."""
        return self.numeric_value_table[value]

    def get_numeric_value(self, cp):
        """Return the numeric property value for code point 'cp'."""
        return self.to_numeric_value(self.get_value(cp))
| |
| |
| # BMP code points are 16-bit values. The code point value is split as |
| # follows: |
| # |
| # 8 bits 8 bits |
| # +-------------------------+-------------------------+ |
| # | 15 14 13 12 11 10 9 8 | 7 6 5 4 3 2 1 0 | |
| # +-------------------------+-------------------------+ |
| # first-level index data offset |
| # |
| # Supplementary code points (U+XXXX where XXXX > 0xffff) are 21-bit values. |
| # The code point value is split as follows: |
| # |
| # 5 bits 8 bits 8 bits |
| # +----------------+-------------------------+-------------------------+ |
| # | 20 19 18 17 16 | 15 14 13 12 11 10 9 8 | 7 6 5 4 3 2 1 0 | |
| # +----------------+-------------------------+-------------------------+ |
| # first-level second-level index data offset |
| # index |
| # |
# The actual numbers of bits are just trie parameters.  They affect the size of
| # the lookup tables (and thus, lookup time), but do not change the overall |
| # structure of the trie. |
| # |
| # Here and below 'supp' stands for 'supplementary characters'. |
| # |
| # Property data for BMP code points is stored as a one-stage trie. |
| # A trie with one lookup table consists of two memory blocks: |
| # |
| # First-level lookup table |
| # +-----+-----+-----+-----+--...--+ |
| # | * | * | * | * | | |
| # +--|--+--|--+--|--+--|--+--...--+ |
| # | | | \ The references don't form |
| # | \____| \___, a systematic pattern |
| # | | | |
| # | | | Data storage |
| # +-V--------++-V--------++-V--------++---...---+ |
| # | data || data || data || | |
| # +----------++----------++----------++---...---+ |
| # |
| # In order to fetch data for a given code point, you need to: |
| # * load from the first-level lookup table using first-level index; this will |
| # give you the number of the data block that you should use. |
| # * load from the data block applying the data offset. |
| # |
| # Property data for supplementary code points is stored as a two-stage trie. |
| # A trie with two-stage lookup tables consists of three memory blocks. The |
| # following drawing explains how it is implemented: |
| # |
| # First-level lookup table |
| # +-----+-----+-----+-----+-----+--...--+ |
| # | * | * | * | * | * | | |
| # +--|--+--|--+--|--+--|--+--|--+--...--+ |
| # | | | | \ The references don't form |
| # ,__/ | \____| \___, a systematic pattern |
| # / | | | |
| # | | | | Second-level lookup table |
| # +-V--------++-V--------++-V--------++-V--------++---...---+ |
| # | ******** || ******** || ******** || || | |
| # +-||||||||-++-||||||||-++-||||||||-++----------++---...---+ |
| # \\\|//// ||||||VV |VVV|V|V |
| # \\|/// |||||| / | | |
| # \|// |||||| / | | |
| # |/ ||||| \__|___. \ \ The references don't form |
| # | |||| \___|__. \ | \ a systematic pattern |
| # | ||| \____| \ \__| \ |
| # | || \_____|__. \___|___\ ...___. |
| # | | \______| \____| \___, | Data storage |
| # +-V-----++-V-----++-V-----++-V-----++-V-----++-V-----++---...---+ |
| # | data || data || data || data || || || | |
| # +-------++-------++-------++-------++-------++-------++---...---+ |
| # |
| # In order to fetch data for a given code point, you need to: |
| # * load from the first-level lookup table using first-level index; this will |
| # give you the number of the second-level lookup table that you should use. |
| # * load from the chosen second-level lookup table using the second-level |
| # index, which will give you the number of the data block that you should |
| # use. |
| # * load from the data block applying the data offset. |
| # |
| # First- and second-level lookup tables in the general case contain 16-bit |
| # words; that will be sufficient to store a trie that does not compress at all. |
| # But in many cases, after trie compression there will be fewer than 256 |
| # unique second-level lookup tables and/or data storage blocks, which allows |
| # one to use 8-bit words in lookup tables. |
| # |
| # The bitwidth of data depends on the application of the trie. |
| # |
| # The supp tables contain entries for BMP code units to simplify trie |
| # implementation, but those BMP entries are filled with the default value, so |
| # they compress well. |
class UnicodeTrieGenerator(object):
    """Generator for the trie layout described in the comment above.

    BMP code points are looked up through a one-stage trie
    (bmp_lookup -> bmp_data); supplementary code points through a two-stage
    trie (supp_lookup1 -> supp_lookup2 -> supp_data).  Call 'create_tables()'
    first, fill the trie, then 'freeze()' and 'serialize()'.
    """

    # Note: if you change any of these parameters, don't forget to update the
    # ASCII art above.
    bmp_first_level_index_bits = 8

    supp_first_level_index_bits = 5
    supp_second_level_index_bits = 8

    def get_bmp_first_level_index(self, cp):
        """Extract the first-level lookup index from a BMP code point."""
        return cp >> self.bmp_data_offset_bits

    def get_bmp_data_offset(self, cp):
        """Extract the data block offset from a BMP code point."""
        return cp & ((1 << self.bmp_data_offset_bits) - 1)

    def get_supp_first_level_index(self, cp):
        """Extract the first-level lookup index from a supplementary code
        point."""
        return cp >> \
            (self.supp_second_level_index_bits + self.supp_data_offset_bits)

    def get_supp_second_level_index(self, cp):
        """Extract the second-level lookup index from a supplementary code
        point."""
        return (cp >> self.supp_data_offset_bits) & \
            ((1 << self.supp_second_level_index_bits) - 1)

    def get_supp_data_offset(self, cp):
        """Extract the data block offset from a supplementary code point."""
        return cp & ((1 << self.supp_data_offset_bits) - 1)

    def __init__(self):
        """Create a trie generator with default parameters."""
        pass

    def create_tables(self):
        """Compute derived parameter values and create internal data
        structures.

        Don't change parameter values after calling this method.
        """
        self.bmp_data_offset_bits = 16 - self.bmp_first_level_index_bits

        self.supp_data_offset_bits = \
            21 - self.supp_first_level_index_bits - \
            self.supp_second_level_index_bits

        # The maximum value of the first level index for supp tables.  It is
        # not equal to ((1 << supp_first_level_index_bits) - 1), because
        # maximum Unicode code point value is not 2^21-1 (0x1fffff), it is
        # 0x10ffff.
        self.supp_first_level_index_max = \
            0x10ffff >> \
            (self.supp_second_level_index_bits + self.supp_data_offset_bits)

        # A mapping from BMP first-level index to BMP data block index.
        # Initially an identity mapping; 'freeze()' redirects entries when it
        # deduplicates data blocks.
        self.bmp_lookup = \
            [i for i in range(0, 1 << self.bmp_first_level_index_bits)]

        # An array of BMP data blocks, filled with a sentinel (-1) until
        # 'splat()' or 'set_value()' stores real data.
        self.bmp_data = [
            [-1 for i in range(0, 1 << self.bmp_data_offset_bits)]
            for i in range(0, 1 << self.bmp_first_level_index_bits)
        ]

        # A mapping from supp first-level index to an index of the second-level
        # lookup table.
        self.supp_lookup1 = \
            [i for i in range(0, self.supp_first_level_index_max + 1)]

        # An array of second-level lookup tables.  Each second-level lookup
        # table is a mapping from a supp second-level index to supp data block
        # index.
        self.supp_lookup2 = [
            [j for j in range(i << self.supp_second_level_index_bits,
                              (i + 1) << self.supp_second_level_index_bits)]
            for i in range(0, self.supp_first_level_index_max + 1)
        ]

        # An array of supp data blocks.
        self.supp_data = [
            [-1 for i in range(0, 1 << self.supp_data_offset_bits)]
            for i in range(0, (self.supp_first_level_index_max + 1) *
                           (1 << self.supp_second_level_index_bits))
        ]

    def splat(self, value):
        """Store 'value' for every code point in the trie."""
        for i in range(0, len(self.bmp_data)):
            for j in range(0, len(self.bmp_data[i])):
                self.bmp_data[i][j] = value

        for i in range(0, len(self.supp_data)):
            for j in range(0, len(self.supp_data[i])):
                self.supp_data[i][j] = value

    def set_value(self, cp, value):
        """Store 'value' for code point 'cp'.

        Only valid before 'freeze()': after deduplication, data blocks may be
        shared between unrelated code points.
        """
        if cp <= 0xffff:
            data_block_index = self.bmp_lookup[
                self.get_bmp_first_level_index(cp)]
            self.bmp_data[data_block_index][
                self.get_bmp_data_offset(cp)] = value
        else:
            second_lookup_index = self.supp_lookup1[
                self.get_supp_first_level_index(cp)]
            data_block_index = self.supp_lookup2[second_lookup_index][
                self.get_supp_second_level_index(cp)]
            self.supp_data[data_block_index][
                self.get_supp_data_offset(cp)] = value

    def get_value(self, cp):
        """Return the value stored for code point 'cp'."""
        if cp <= 0xffff:
            data_block_index = self.bmp_lookup[
                self.get_bmp_first_level_index(cp)]
            return self.bmp_data[data_block_index][
                self.get_bmp_data_offset(cp)]
        else:
            second_lookup_index = self.supp_lookup1[
                self.get_supp_first_level_index(cp)]
            data_block_index = self.supp_lookup2[second_lookup_index][
                self.get_supp_second_level_index(cp)]
            return self.supp_data[data_block_index][
                self.get_supp_data_offset(cp)]

    def fill_from_unicode_property(self, unicode_property):
        """Populate the trie with values from 'unicode_property' for all
        Unicode code points."""
        self.splat(unicode_property.get_default_value())
        for cp in range(0, 0x110000):
            self.set_value(cp, unicode_property.get_value(cp))

    def verify(self, unicode_property):
        """Check that every code point's stored value matches
        'unicode_property' (sanity check, typically run after 'freeze()')."""
        for cp in range(0, 0x110000):
            expected_value = unicode_property.get_value(cp)
            actual_value = self.get_value(cp)
            assert(expected_value == actual_value)

    def freeze(self):
        """Compress internal trie representation.

        Don't mutate the trie after calling this method.
        """
        def remap_indexes(indexes, old_idx, new_idx):
            # Redirect references to the removed block 'old_idx' to its
            # surviving duplicate 'new_idx', and shift down references to
            # every block that followed the removed one.
            def map_index(idx):
                if idx == old_idx:
                    return new_idx
                elif idx > old_idx:
                    return idx - 1
                else:
                    return idx

            # NOTE: Python 2's `map` function returns a list. Where Python 3's
            # `map` function returns an iterator. To work around this the
            # result of the `map` is explicitly converted to a `list`.
            return list(map(map_index, indexes))

        # If self.bmp_data contains identical data blocks, keep the first one,
        # remove duplicates and change the indexes in self.bmp_lookup to point
        # to the first one.
        i = 0
        while i < len(self.bmp_data):
            j = i + 1
            while j < len(self.bmp_data):
                if self.bmp_data[i] == self.bmp_data[j]:
                    self.bmp_data.pop(j)
                    self.bmp_lookup = \
                        remap_indexes(self.bmp_lookup, old_idx=j, new_idx=i)
                else:
                    j += 1
            i += 1

        # For supp tables, perform bottom-up deduplication: first, deduplicate
        # data blocks. The algorithm is the same as above, but operates on
        # self.supp_data/supp_lookup2.
        i = 0
        while i < len(self.supp_data):
            j = i + 1
            while j < len(self.supp_data):
                if self.supp_data[i] == self.supp_data[j]:
                    self.supp_data.pop(j)
                    for k in range(0, len(self.supp_lookup2)):
                        self.supp_lookup2[k] = \
                            remap_indexes(self.supp_lookup2[k],
                                          old_idx=j, new_idx=i)
                else:
                    j += 1
            i += 1

        # Next, deduplicate second-level lookup tables.
        # Same as above, but for supp_lookup1/supp_lookup2.
        i = 0
        while i < len(self.supp_lookup2):
            j = i + 1
            while j < len(self.supp_lookup2):
                if self.supp_lookup2[i] == self.supp_lookup2[j]:
                    self.supp_lookup2.pop(j)
                    self.supp_lookup1 = \
                        remap_indexes(self.supp_lookup1, old_idx=j, new_idx=i)
                else:
                    j += 1
            i += 1

    def _int_to_le_bytes(self, data, width):
        """Split the non-negative integer 'data' into a list of 'width'
        little-endian bytes (each in range 0..255).

        Raises ValueError for unsupported widths.
        """
        if width == 1:
            assert(data & ~0xff == 0)
            return [data]
        if width == 2:
            assert(data & ~0xffff == 0)
            # Little-endian: low byte first.  The high byte must be shifted
            # down into the 0..255 range; the previous 'data & 0xff00' left
            # it in bits 8..15, producing a "byte" value of up to 0xff00.
            return [data & 0xff, (data >> 8) & 0xff]
        raise ValueError("unsupported byte width: %r" % (width,))

    def _int_list_to_le_bytes(self, ints, width):
        """Flatten 'ints' into a little-endian byte list, 'width' bytes per
        element."""
        return [
            byte
            for elt in ints
            for byte in self._int_to_le_bytes(elt, width)]

    def serialize(self, unicode_property):
        """Lay out all five trie tables into one flat byte array,
        'self.trie_bytes', recording each table's offset and entry width.

        Lookup tables use 1 byte per entry when the referenced table has
        fewer than 256 blocks (typical after 'freeze()'), otherwise 2 bytes.
        Data entries are numeric property values obtained through
        'unicode_property.to_numeric_value' and must fit in one byte.
        """
        self.bmp_lookup_bytes_per_entry = 1 if len(self.bmp_data) < 256 else 2
        self.bmp_data_bytes_per_entry = 1

        self.supp_lookup1_bytes_per_entry = 1 if len(self.supp_lookup2) < 256 \
            else 2
        self.supp_lookup2_bytes_per_entry = 1 if len(self.supp_data) < 256 \
            else 2
        self.supp_data_bytes_per_entry = 1

        bmp_lookup_words = list(self.bmp_lookup)
        bmp_data_words = [
            unicode_property.to_numeric_value(elt)
            for block in self.bmp_data
            for elt in block]

        supp_lookup1_words = list(self.supp_lookup1)
        supp_lookup2_words = [
            elt for block in self.supp_lookup2 for elt in block]
        supp_data_words = [
            unicode_property.to_numeric_value(elt)
            for block in self.supp_data
            for elt in block]

        bmp_lookup_bytes = self._int_list_to_le_bytes(
            bmp_lookup_words, self.bmp_lookup_bytes_per_entry)
        bmp_data_bytes = self._int_list_to_le_bytes(
            bmp_data_words, self.bmp_data_bytes_per_entry)

        supp_lookup1_bytes = self._int_list_to_le_bytes(
            supp_lookup1_words, self.supp_lookup1_bytes_per_entry)
        supp_lookup2_bytes = self._int_list_to_le_bytes(
            supp_lookup2_words, self.supp_lookup2_bytes_per_entry)
        supp_data_bytes = self._int_list_to_le_bytes(
            supp_data_words, self.supp_data_bytes_per_entry)

        self.trie_bytes = []

        self.bmp_lookup_bytes_offset = 0
        self.trie_bytes += bmp_lookup_bytes

        self.bmp_data_bytes_offset = len(self.trie_bytes)
        self.trie_bytes += bmp_data_bytes

        self.supp_lookup1_bytes_offset = len(self.trie_bytes)
        self.trie_bytes += supp_lookup1_bytes

        self.supp_lookup2_bytes_offset = len(self.trie_bytes)
        self.trie_bytes += supp_lookup2_bytes

        self.supp_data_bytes_offset = len(self.trie_bytes)
        self.trie_bytes += supp_data_bytes
| |
| |
def get_extended_grapheme_cluster_rules_matrix(grapheme_cluster_break_table):
    """Expand the extended grapheme cluster boundary rules into a packed
    bit matrix.

    Returns a list with one integer per Grapheme_Cluster_Break value (in
    numeric order): bit 'i' of entry 'j' is set iff there is no boundary
    between a code point with value 'j' followed by one with value 'i'.
    """
    any_value = grapheme_cluster_break_table.symbolic_values

    # Rules to determine extended grapheme cluster boundaries, as defined in
    # 'Grapheme Break Chart',
    # http://www.unicode.org/Public/6.3.0/ucd/auxiliary/GraphemeBreakTest.html,
    # Unicode 6.3.0.
    #
    # The Unicode 7.0.0 draft does not change these rules.
    #
    # As in the referenced document, the rules are specified in order of
    # decreasing priority.
    rules = [
        (['CR'], 'no_boundary', ['LF']),
        (['Control', 'CR', 'LF'], 'boundary', any_value),
        (any_value, 'boundary', ['Control', 'CR', 'LF']),
        (['L'], 'no_boundary', ['L', 'V', 'LV', 'LVT']),
        (['LV', 'V'], 'no_boundary', ['V', 'T']),
        (['LVT', 'T'], 'no_boundary', ['T']),
        (['Regional_Indicator'], 'no_boundary', ['Regional_Indicator']),
        (any_value, 'no_boundary', ['Extend']),
        (any_value, 'no_boundary', ['SpacingMark']),
        (['Prepend'], 'no_boundary', any_value),
        (any_value, 'boundary', any_value),
    ]

    # Expand the rules into a matrix indexed by (first value, second value).
    # Applying them lowest priority first lets a higher-priority rule simply
    # overwrite whatever a lower-priority rule wrote.
    matrix = {first: dict.fromkeys(any_value) for first in any_value}
    for lhs_values, action, rhs_values in reversed(rules):
        for lhs in lhs_values:
            for rhs in rhs_values:
                matrix[lhs][rhs] = action

    # Make sure we can pack one row of the matrix into a 'uint16_t'.
    assert len(any_value) <= 16

    # Pack each row into an integer: bit i corresponds to the i-th possible
    # second value, set when the pair forms no boundary.
    packed_rows = []
    for lhs in any_value:
        row = matrix[lhs]
        packed = 0
        for bit, rhs in enumerate(any_value):
            if row[rhs] == 'no_boundary':
                packed |= 1 << bit
        packed_rows.append(packed)

    return packed_rows
| |
| |
def get_grapheme_cluster_break_tests_as_utf8(grapheme_break_test_file_name):
    """Parse GraphemeBreakTest.txt into a list of
    (escaped_utf8_string, boundary_byte_offsets) pairs.

    Each test string is rendered as '\\xNN' escapes, one per UTF-8 code
    unit; boundaries are byte offsets at which the file marks a grapheme
    cluster boundary.
    """
    def _scalar_to_escaped_utf8(code_point):
        # Tests from Unicode spec have isolated surrogates in them.
        # Our segmentation algorithm works on UTF-8 sequences, so
        # encoding a surrogate would produce an invalid code unit
        # sequence. Instead of trying to emulate the maximal subpart
        # algorithm for inserting U+FFFD in Python, we just replace
        # every isolated surrogate with U+200B, which also has
        # Grapheme_Cluster_Break equal to 'Control' and test
        # separately that we handle ill-formed UTF-8 sequences.
        if code_point >= 0xd800 and code_point <= 0xdfff:
            code_point = 0x200b
        # Round-trip through a '\\UXXXXXXXX' escape so this works on both
        # Python 2 and Python 3.
        character = (b'\U%(cp)08x' % {b'cp': code_point}).decode(
            'unicode_escape', 'strict')
        utf8 = bytearray(character.encode('utf8', 'strict'))
        escaped = ''.join(
            ['\\x%(byte)02x' % {'byte': byte} for byte in utf8])
        return escaped, len(utf8)

    def _convert_line(line):
        # Drop the trailing comment and surrounding whitespace.
        line = re.sub('#.*', '', line).strip()

        if line == "":
            return None

        test = ""
        curr_bytes = 0
        boundaries = []

        # Walk the tokens: boundary markers and code points.
        for token in line.split(" "):
            if token == u"÷":
                boundaries.append(curr_bytes)
            elif token == u"×":
                continue
            else:
                escaped, num_bytes = _scalar_to_escaped_utf8(int(token, 16))
                test += escaped
                curr_bytes += num_bytes

        return (test, boundaries)

    # Self-test.
    assert(_convert_line(u'÷ 0903 × 0308 ÷ AC01 ÷ # abc') == (
        '\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [0, 5, 8]))
    assert(_convert_line(u'÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [0, 3]))

    result = []

    with codecs.open(
            grapheme_break_test_file_name,
            encoding='utf-8',
            errors='strict') as f:
        for line in f:
            converted = _convert_line(line)
            if converted:
                result.append(converted)

    return result
| |
| |
def get_grapheme_cluster_break_tests_as_unicode_scalars(
        grapheme_break_test_file_name):
    """Parse GraphemeBreakTest.txt into a list of
    (code_point_list, boundary_offsets) pairs.

    Boundaries are indexes (in code points) at which the file marks a
    grapheme cluster boundary.
    """
    def _convert_line(line):
        # Strip comments.
        line = re.sub('#.*', '', line).strip()

        if line == "":
            return None

        test = []
        curr_code_points = 0
        boundaries = []

        # Match a list of code points.
        for token in line.split(" "):
            if token == u"÷":
                boundaries += [curr_code_points]
            elif token == u"×":
                pass
            else:
                code_point = int(token, 16)
                # Tests from Unicode spec have isolated surrogates in them. Our
                # segmentation algorithm works on UTF-16 sequences, so encoding
                # a surrogate would produce an invalid code unit sequence.
                # Instead of trying to emulate the maximal subpart algorithm
                # for inserting U+FFFD in Python, we just replace every
                # isolated surrogate with U+200B, which also has
                # Grapheme_Cluster_Break equal to 'Control' and test separately
                # that we handle ill-formed UTF-8 sequences.
                if code_point >= 0xd800 and code_point <= 0xdfff:
                    code_point = 0x200b
                test += [code_point]
                curr_code_points += 1

        return (test, boundaries)

    # Self-test.
    assert(_convert_line(u'÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ([
        0x0903, 0x0308, 0xac01], [0, 2, 3]))
    assert(_convert_line(u'÷ D800 ÷ # abc') == ([0x200b], [0, 1]))

    result = []

    # The file used to be opened in binary mode ('rb').  Under Python 3 that
    # makes each 'line' a bytes object, so re.sub with a str pattern raises
    # TypeError and the comparisons against '÷'/'×' can never match.  Decode
    # as UTF-8 instead, consistent with
    # get_grapheme_cluster_break_tests_as_utf8 above.
    with codecs.open(
            grapheme_break_test_file_name,
            encoding='utf-8',
            errors='strict') as f:
        for line in f:
            test = _convert_line(line)
            if test:
                result += [test]

    return result