# ===--- GYBUnicodeDataUtils.py ----------------------*- coding: utf-8 -*-===//
#
# This source file is part of the Swift.org open source project
#
# Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
# Licensed under Apache License v2.0 with Runtime Library Exception
#
# See https://swift.org/LICENSE.txt for license information
# See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
#
# ===----------------------------------------------------------------------===//
import codecs
import re
class UnicodeProperty(object):
"""Abstract base class for Unicode properties."""
def __init__(self):
raise NotImplementedError(
"UnicodeProperty.__init__ is not implemented.")
def get_default_value(self):
raise NotImplementedError(
"UnicodeProperty.get_default_value is not implemented.")
def get_value(self, cp):
raise NotImplementedError(
"UnicodeProperty.get_value is not implemented.")
def to_numeric_value(self, value):
raise NotImplementedError(
"UnicodeProperty.to_numeric_value is not implemented.")
def get_numeric_value(self, cp):
raise NotImplementedError(
"UnicodeProperty.get_numeric_value is not implemented.")
class GraphemeClusterBreakPropertyTable(UnicodeProperty):
"""Grapheme_Cluster_Break property."""
# Note: Numeric values (including the names) should be consistent with
# '_GraphemeClusterBreakPropertyValue' enum on the Swift side, and with
# 'GraphemeClusterBreakProperty' in the compiler C++ code. If there is a
# reason for either of those to differ, then this mapping can be overridden
# after an instance of this class is created.
numeric_value_table = {
'Other': 0,
'CR': 1,
'LF': 2,
'Control': 3,
'Extend': 4,
'Regional_Indicator': 5,
'Prepend': 6,
'SpacingMark': 7,
'L': 8,
'V': 9,
'T': 10,
'LV': 11,
'LVT': 12,
}
    def __init__(self, grapheme_break_property_file_name):
        # An array of tuples (start_code_point, end_code_point, value).
        # These are instance attributes (rather than class attributes) so
        # that two tables loaded from different data files don't share state.
        self.property_value_ranges = []
        # A flat per-code-point lookup table, filled in below.
        self.property_values = [None] * 0x110000
        # Build 'self.symbolic_values' -- an array that maps numeric property
        # values to symbolic values.
self.symbolic_values = \
[None] * (max(self.numeric_value_table.values()) + 1)
for k, v in self.numeric_value_table.items():
self.symbolic_values[v] = k
# Load the data file.
with codecs.open(
grapheme_break_property_file_name,
encoding='utf-8',
errors='strict') as f:
for line in f:
# Strip comments.
line = re.sub('#.*', '', line)
# Single code point?
                m = re.match('([0-9A-F]+) +; +([a-zA-Z_]+) ', line)
if m:
code_point = int(m.group(1), 16)
value = m.group(2)
self.property_value_ranges += \
[(code_point, code_point, value)]
continue
# Range of code points?
                m = re.match(
                    r'([0-9A-F]+)\.\.([0-9A-F]+) +; +([a-zA-Z_]+) ', line)
if m:
start_code_point = int(m.group(1), 16)
end_code_point = int(m.group(2), 16)
value = m.group(3)
self.property_value_ranges += \
[(start_code_point, end_code_point, value)]
# Prepare a flat lookup table for fast access.
for cp in range(0, 0x110000):
self.property_values[cp] = self.get_default_value()
for start_code_pt, end_code_pt, val in self.property_value_ranges:
for cp in range(start_code_pt, end_code_pt + 1):
self.property_values[cp] = val
def get_default_value(self):
return 'Other'
def get_value(self, cp):
return self.property_values[cp]
def to_numeric_value(self, value):
return self.numeric_value_table[value]
def get_numeric_value(self, cp):
return self.to_numeric_value(self.get_value(cp))
# BMP code points are 16-bit values. The code point value is split as
# follows:
#
# 8 bits 8 bits
# +-------------------------+-------------------------+
# | 15 14 13 12 11 10 9 8 | 7 6 5 4 3 2 1 0 |
# +-------------------------+-------------------------+
# first-level index data offset
#
# Supplementary code points (U+XXXX where XXXX > 0xffff) are 21-bit values.
# The code point value is split as follows:
#
# 5 bits 8 bits 8 bits
# +----------------+-------------------------+-------------------------+
# | 20 19 18 17 16 | 15 14 13 12 11 10 9 8 | 7 6 5 4 3 2 1 0 |
# +----------------+-------------------------+-------------------------+
# first-level second-level index data offset
# index
#
# The actual bit widths are just trie parameters. They affect the size of
# the lookup tables (and thus, lookup time), but do not change the overall
# structure of the trie.
#
# Here and below 'supp' stands for 'supplementary characters'.
#
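# For example, with this layout a BMP code point such as U+AC01 splits into
# first-level index 0xac and data offset 0x01, while the supplementary code
# point U+1F1E6 (0x1f1e6) splits into:
#
#   first-level index  = 0x1f1e6 >> 16         = 0x01
#   second-level index = (0x1f1e6 >> 8) & 0xff = 0xf1
#   data offset        = 0x1f1e6 & 0xff        = 0xe6
#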
# Property data for BMP code points is stored as a one-stage trie.
# A trie with one lookup table consists of two memory blocks:
#
# First-level lookup table
# +-----+-----+-----+-----+--...--+
# | * | * | * | * | |
# +--|--+--|--+--|--+--|--+--...--+
# | | | \ The references don't form
# | \____| \___, a systematic pattern
# | | |
# | | | Data storage
# +-V--------++-V--------++-V--------++---...---+
# | data || data || data || |
# +----------++----------++----------++---...---+
#
# In order to fetch data for a given code point, you need to:
# * load from the first-level lookup table using first-level index; this will
# give you the number of the data block that you should use.
# * load from the data block applying the data offset.
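#
# In pseudocode (a sketch of the two loads, using the default parameters):
#
#   block = bmp_lookup[cp >> 8]         # first-level index
#   value = bmp_data[block][cp & 0xff]  # data offset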
#
# Property data for supplementary code points is stored as a two-stage trie.
# A trie with two-stage lookup tables consists of three memory blocks. The
# following drawing explains how it is implemented:
#
# First-level lookup table
# +-----+-----+-----+-----+-----+--...--+
# | * | * | * | * | * | |
# +--|--+--|--+--|--+--|--+--|--+--...--+
# | | | | \ The references don't form
# ,__/ | \____| \___, a systematic pattern
# / | | |
# | | | | Second-level lookup table
# +-V--------++-V--------++-V--------++-V--------++---...---+
# | ******** || ******** || ******** || || |
# +-||||||||-++-||||||||-++-||||||||-++----------++---...---+
# \\\|//// ||||||VV |VVV|V|V
# \\|/// |||||| / | |
# \|// |||||| / | |
# |/ ||||| \__|___. \ \ The references don't form
# | |||| \___|__. \ | \ a systematic pattern
# | ||| \____| \ \__| \
# | || \_____|__. \___|___\ ...___.
# | | \______| \____| \___, | Data storage
# +-V-----++-V-----++-V-----++-V-----++-V-----++-V-----++---...---+
# | data || data || data || data || || || |
# +-------++-------++-------++-------++-------++-------++---...---+
#
# In order to fetch data for a given code point, you need to:
# * load from the first-level lookup table using first-level index; this will
# give you the number of the second-level lookup table that you should use.
# * load from the chosen second-level lookup table using the second-level
# index, which will give you the number of the data block that you should
# use.
# * load from the data block applying the data offset.
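#
# In pseudocode (again a sketch, with the default 5/8/8-bit split):
#
#   table = supp_lookup1[cp >> 16]                 # first-level index
#   block = supp_lookup2[table][(cp >> 8) & 0xff]  # second-level index
#   value = supp_data[block][cp & 0xff]            # data offset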
#
# First- and second-level lookup tables in the general case contain 16-bit
# words; that will be sufficient to store a trie that does not compress at all.
# But in many cases, after trie compression there will be fewer than 256
# unique second-level lookup tables and/or data storage blocks, which allows
# one to use 8-bit words in lookup tables.
#
# The bitwidth of data depends on the application of the trie.
#
# The supp tables contain entries for BMP code units to simplify trie
# implementation, but those BMP entries are filled with the default value, so
# they compress well.
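#
# A minimal end-to-end sketch of how the pieces below fit together (the data
# file name here is an illustrative assumption, not something this module
# prescribes):
#
#   table = GraphemeClusterBreakPropertyTable('GraphemeBreakProperty.txt')
#   trie = UnicodeTrieGenerator()
#   trie.create_tables()
#   trie.fill_from_unicode_property(table)
#   trie.freeze()
#   trie.verify(table)     # check the compressed trie against the source
#   trie.serialize(table)  # trie.trie_bytes now holds the flat encoding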
class UnicodeTrieGenerator(object):
# Note: if you change any of these parameters, don't forget to update the
# ASCII art above.
bmp_first_level_index_bits = 8
supp_first_level_index_bits = 5
supp_second_level_index_bits = 8
def get_bmp_first_level_index(self, cp):
return cp >> self.bmp_data_offset_bits
def get_bmp_data_offset(self, cp):
return cp & ((1 << self.bmp_data_offset_bits) - 1)
def get_supp_first_level_index(self, cp):
return cp >> \
(self.supp_second_level_index_bits + self.supp_data_offset_bits)
def get_supp_second_level_index(self, cp):
return (cp >> self.supp_data_offset_bits) & \
((1 << self.supp_second_level_index_bits) - 1)
def get_supp_data_offset(self, cp):
return cp & ((1 << self.supp_data_offset_bits) - 1)
def __init__(self):
"""Create a trie generator with default parameters."""
pass
def create_tables(self):
"""Compute derived parameter values and create internal data
structures.
Don't change parameter values after calling this method.
"""
self.bmp_data_offset_bits = 16 - self.bmp_first_level_index_bits
self.supp_data_offset_bits = \
21 - self.supp_first_level_index_bits - \
self.supp_second_level_index_bits
        # The maximum value of the first-level index for supp tables. It is
        # not equal to ((1 << supp_first_level_index_bits) - 1) because the
        # maximum Unicode code point is not 2^21-1 (0x1fffff) but 0x10ffff.
self.supp_first_level_index_max = \
0x10ffff >> \
(self.supp_second_level_index_bits + self.supp_data_offset_bits)
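        # With the default parameters: 0x10ffff >> (8 + 8) == 0x10 == 16.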
# A mapping from BMP first-level index to BMP data block index.
self.bmp_lookup = \
[i for i in range(0, 1 << self.bmp_first_level_index_bits)]
# An array of BMP data blocks.
self.bmp_data = [
[-1 for i in range(0, 1 << self.bmp_data_offset_bits)]
for i in range(0, 1 << self.bmp_first_level_index_bits)
]
# A mapping from supp first-level index to an index of the second-level
# lookup table.
self.supp_lookup1 = \
[i for i in range(0, self.supp_first_level_index_max + 1)]
# An array of second-level lookup tables. Each second-level lookup
# table is a mapping from a supp second-level index to supp data block
# index.
self.supp_lookup2 = [
[j for j in range(i << self.supp_second_level_index_bits,
(i + 1) << self.supp_second_level_index_bits)]
for i in range(0, self.supp_first_level_index_max + 1)
]
# An array of supp data blocks.
self.supp_data = [
[-1 for i in range(0, 1 << self.supp_data_offset_bits)]
for i in range(0, (self.supp_first_level_index_max + 1) *
(1 << self.supp_second_level_index_bits))
]
def splat(self, value):
for i in range(0, len(self.bmp_data)):
for j in range(0, len(self.bmp_data[i])):
self.bmp_data[i][j] = value
for i in range(0, len(self.supp_data)):
for j in range(0, len(self.supp_data[i])):
self.supp_data[i][j] = value
def set_value(self, cp, value):
if cp <= 0xffff:
data_block_index = self.bmp_lookup[
self.get_bmp_first_level_index(cp)]
self.bmp_data[data_block_index][
self.get_bmp_data_offset(cp)] = value
else:
second_lookup_index = self.supp_lookup1[
self.get_supp_first_level_index(cp)]
data_block_index = self.supp_lookup2[second_lookup_index][
self.get_supp_second_level_index(cp)]
self.supp_data[data_block_index][
self.get_supp_data_offset(cp)] = value
def get_value(self, cp):
if cp <= 0xffff:
data_block_index = self.bmp_lookup[
self.get_bmp_first_level_index(cp)]
return self.bmp_data[data_block_index][
self.get_bmp_data_offset(cp)]
else:
second_lookup_index = self.supp_lookup1[
self.get_supp_first_level_index(cp)]
data_block_index = self.supp_lookup2[second_lookup_index][
self.get_supp_second_level_index(cp)]
return self.supp_data[data_block_index][
self.get_supp_data_offset(cp)]
def fill_from_unicode_property(self, unicode_property):
self.splat(unicode_property.get_default_value())
for cp in range(0, 0x110000):
self.set_value(cp, unicode_property.get_value(cp))
def verify(self, unicode_property):
for cp in range(0, 0x110000):
expected_value = unicode_property.get_value(cp)
actual_value = self.get_value(cp)
assert(expected_value == actual_value)
def freeze(self):
"""Compress internal trie representation.
Don't mutate the trie after calling this method.
"""
def remap_indexes(indexes, old_idx, new_idx):
def map_index(idx):
if idx == old_idx:
return new_idx
elif idx > old_idx:
return idx - 1
else:
return idx
            # NOTE: Python 2's `map` function returns a list, whereas
            # Python 3's `map` function returns an iterator. To work around
            # this, the result of the `map` is explicitly converted to a
            # `list`.
return list(map(map_index, indexes))
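        # For example, remap_indexes([0, 2, 1, 3], old_idx=2, new_idx=0)
        # returns [0, 0, 1, 2]: references to the removed block 2 are
        # redirected to block 0, and higher indexes shift down by one.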
# If self.bmp_data contains identical data blocks, keep the first one,
# remove duplicates and change the indexes in self.bmp_lookup to point
# to the first one.
i = 0
while i < len(self.bmp_data):
j = i + 1
while j < len(self.bmp_data):
if self.bmp_data[i] == self.bmp_data[j]:
self.bmp_data.pop(j)
self.bmp_lookup = \
remap_indexes(self.bmp_lookup, old_idx=j, new_idx=i)
else:
j += 1
i += 1
# For supp tables, perform bottom-up deduplication: first, deduplicate
# data blocks. The algorithm is the same as above, but operates on
# self.supp_data/supp_lookup2.
i = 0
while i < len(self.supp_data):
j = i + 1
while j < len(self.supp_data):
if self.supp_data[i] == self.supp_data[j]:
self.supp_data.pop(j)
for k in range(0, len(self.supp_lookup2)):
self.supp_lookup2[k] = \
remap_indexes(self.supp_lookup2[k],
old_idx=j, new_idx=i)
else:
j += 1
i += 1
# Next, deduplicate second-level lookup tables.
# Same as above, but for supp_lookup1/supp_lookup2.
i = 0
while i < len(self.supp_lookup2):
j = i + 1
while j < len(self.supp_lookup2):
if self.supp_lookup2[i] == self.supp_lookup2[j]:
self.supp_lookup2.pop(j)
self.supp_lookup1 = \
remap_indexes(self.supp_lookup1, old_idx=j, new_idx=i)
else:
j += 1
i += 1
    def _int_to_le_bytes(self, data, width):
        if width == 1:
            assert(data & ~0xff == 0)
            return [data]
        if width == 2:
            assert(data & ~0xffff == 0)
            # Little-endian byte order: low byte first, then high byte.
            return [data & 0xff, (data >> 8) & 0xff]
        assert(False)
def _int_list_to_le_bytes(self, ints, width):
return [
byte
for elt in ints
for byte in self._int_to_le_bytes(elt, width)]
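    # For example, the helpers above give:
    #   _int_list_to_le_bytes([0x1234, 0x56], 2) == [0x34, 0x12, 0x56, 0x00]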
def serialize(self, unicode_property):
self.bmp_lookup_bytes_per_entry = 1 if len(self.bmp_data) < 256 else 2
self.bmp_data_bytes_per_entry = 1
self.supp_lookup1_bytes_per_entry = 1 if len(self.supp_lookup2) < 256 \
else 2
self.supp_lookup2_bytes_per_entry = 1 if len(self.supp_data) < 256 \
else 2
self.supp_data_bytes_per_entry = 1
bmp_lookup_words = list(self.bmp_lookup)
bmp_data_words = [
unicode_property.to_numeric_value(elt)
for block in self.bmp_data
for elt in block]
supp_lookup1_words = list(self.supp_lookup1)
supp_lookup2_words = [
elt for block in self.supp_lookup2 for elt in block]
supp_data_words = [
unicode_property.to_numeric_value(elt)
for block in self.supp_data
for elt in block]
bmp_lookup_bytes = self._int_list_to_le_bytes(
bmp_lookup_words, self.bmp_lookup_bytes_per_entry)
bmp_data_bytes = self._int_list_to_le_bytes(
bmp_data_words, self.bmp_data_bytes_per_entry)
supp_lookup1_bytes = self._int_list_to_le_bytes(
supp_lookup1_words, self.supp_lookup1_bytes_per_entry)
supp_lookup2_bytes = self._int_list_to_le_bytes(
supp_lookup2_words, self.supp_lookup2_bytes_per_entry)
supp_data_bytes = self._int_list_to_le_bytes(
supp_data_words, self.supp_data_bytes_per_entry)
self.trie_bytes = []
self.bmp_lookup_bytes_offset = 0
self.trie_bytes += bmp_lookup_bytes
self.bmp_data_bytes_offset = len(self.trie_bytes)
self.trie_bytes += bmp_data_bytes
self.supp_lookup1_bytes_offset = len(self.trie_bytes)
self.trie_bytes += supp_lookup1_bytes
self.supp_lookup2_bytes_offset = len(self.trie_bytes)
self.trie_bytes += supp_lookup2_bytes
self.supp_data_bytes_offset = len(self.trie_bytes)
self.trie_bytes += supp_data_bytes
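        # The serialized trie is one flat little-endian byte array laid out
        # as [bmp_lookup][bmp_data][supp_lookup1][supp_lookup2][supp_data];
        # the *_bytes_offset attributes record where each part starts.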
def get_extended_grapheme_cluster_rules_matrix(grapheme_cluster_break_table):
    any_value = grapheme_cluster_break_table.symbolic_values
# Rules to determine extended grapheme cluster boundaries, as defined in
# 'Grapheme Break Chart',
# http://www.unicode.org/Public/6.3.0/ucd/auxiliary/GraphemeBreakTest.html,
# Unicode 6.3.0.
#
# The Unicode 7.0.0 draft does not change these rules.
#
# As in the referenced document, the rules are specified in order of
# decreasing priority.
rules = [
(['CR'], 'no_boundary', ['LF']),
(['Control', 'CR', 'LF'], 'boundary', any_value),
(any_value, 'boundary', ['Control', 'CR', 'LF']),
(['L'], 'no_boundary', ['L', 'V', 'LV', 'LVT']),
(['LV', 'V'], 'no_boundary', ['V', 'T']),
(['LVT', 'T'], 'no_boundary', ['T']),
(['Regional_Indicator'], 'no_boundary', ['Regional_Indicator']),
(any_value, 'no_boundary', ['Extend']),
(any_value, 'no_boundary', ['SpacingMark']),
(['Prepend'], 'no_boundary', any_value),
(any_value, 'boundary', any_value),
]
# Expand the rules into a matrix.
rules_matrix = {}
for first in any_value:
        rules_matrix[first] = dict.fromkeys(any_value, None)
# Iterate over rules in the order of increasing priority.
for first_list, action, second_list in reversed(rules):
for first in first_list:
for second in second_list:
rules_matrix[first][second] = action
# Make sure we can pack one row of the matrix into a 'uint16_t'.
assert(len(any_value) <= 16)
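    # For example, with the rules above, the row for 'CR' ends up with
    # 'no_boundary' only before 'LF' (numeric value 2), so the loop below
    # packs it to 1 << 2 == 4.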
result = []
for first in any_value:
# Retrieve a row that corresponds to this first code point.
row = rules_matrix[first]
# Change strings into bits.
bits = [row[second] == 'no_boundary' for second in any_value]
# Pack bits into an integer.
packed = sum([bits[i] * pow(2, i) for i in range(0, len(bits))])
result += [packed]
return result
def get_grapheme_cluster_break_tests_as_utf8(grapheme_break_test_file_name):
def _convert_line(line):
# Strip comments.
line = re.sub('#.*', '', line).strip()
if line == "":
return None
test = ""
curr_bytes = 0
boundaries = []
# Match a list of code points.
for token in line.split(" "):
if token == u"÷":
boundaries += [curr_bytes]
elif token == u"×":
pass
else:
code_point = int(token, 16)
# Tests from Unicode spec have isolated surrogates in them.
# Our segmentation algorithm works on UTF-8 sequences, so
# encoding a surrogate would produce an invalid code unit
# sequence. Instead of trying to emulate the maximal subpart
# algorithm for inserting U+FFFD in Python, we just replace
# every isolated surrogate with U+200B, which also has
# Grapheme_Cluster_Break equal to 'Control' and test
# separately that we handle ill-formed UTF-8 sequences.
if code_point >= 0xd800 and code_point <= 0xdfff:
code_point = 0x200b
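                # Build the character from a '\UXXXXXXXX' escape instead of
                # calling unichr()/chr(), so this works on both Python 2
                # (including narrow builds) and Python 3.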
code_point = (b'\U%(cp)08x' % {b'cp': code_point}).decode(
'unicode_escape', 'strict')
as_utf8_bytes = bytearray(code_point.encode('utf8', 'strict'))
as_utf8_escaped = ''.join(
['\\x%(byte)02x' % {'byte': byte}
for byte in as_utf8_bytes])
test += as_utf8_escaped
curr_bytes += len(as_utf8_bytes)
return (test, boundaries)
# Self-test.
assert(_convert_line(u'÷ 0903 × 0308 ÷ AC01 ÷ # abc') == (
'\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [0, 5, 8]))
assert(_convert_line(u'÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [0, 3]))
result = []
with codecs.open(
grapheme_break_test_file_name,
encoding='utf-8',
errors='strict') as f:
for line in f:
test = _convert_line(line)
if test:
result += [test]
return result
def get_grapheme_cluster_break_tests_as_unicode_scalars(
grapheme_break_test_file_name):
def _convert_line(line):
# Strip comments.
line = re.sub('#.*', '', line).strip()
if line == "":
return None
test = []
curr_code_points = 0
boundaries = []
# Match a list of code points.
for token in line.split(" "):
if token == "÷":
boundaries += [curr_code_points]
elif token == "×":
pass
else:
code_point = int(token, 16)
# Tests from Unicode spec have isolated surrogates in them. Our
# segmentation algorithm works on UTF-16 sequences, so encoding
# a surrogate would produce an invalid code unit sequence.
# Instead of trying to emulate the maximal subpart algorithm
# for inserting U+FFFD in Python, we just replace every
# isolated surrogate with U+200B, which also has
# Grapheme_Cluster_Break equal to 'Control' and test separately
                # that we handle ill-formed UTF-16 sequences.
if code_point >= 0xd800 and code_point <= 0xdfff:
code_point = 0x200b
test += [code_point]
curr_code_points += 1
return (test, boundaries)
# Self-test.
assert(_convert_line('÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ([
0x0903, 0x0308, 0xac01], [0, 2, 3]))
assert(_convert_line('÷ D800 ÷ # abc') == ([0x200b], [0, 1]))
result = []
    with codecs.open(
            grapheme_break_test_file_name,
            encoding='utf-8',
            errors='strict') as f:
for line in f:
test = _convert_line(line)
if test:
result += [test]
return result