blob: 8f2fdff394118a96f00f2114cf4f7698d13e2943 [file] [log] [blame]
##===--------------------------------------------------*- coding: utf-8 -*-===##
##
## This source file is part of the Swift.org open source project
##
## Copyright (c) 2014 - 2016 Apple Inc. and the Swift project authors
## Licensed under Apache License v2.0 with Runtime Library Exception
##
## See http://swift.org/LICENSE.txt for license information
## See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
##
##===----------------------------------------------------------------------===##
import re
import sys
import codecs
class UnicodeProperty(object):
    """Abstract base class for Unicode properties.

    Subclasses provide a mapping from code points to symbolic property
    values, and from symbolic values to their numeric encoding.
    """

    def __init__(self):
        # Note: 'raise NotImplemented' (the singleton, formerly used here) is
        # not an exception; on Python 3 it raises a TypeError instead of the
        # intended error.  'NotImplementedError' is the correct exception.
        raise NotImplementedError

    def get_default_value(self):
        """Return the value for code points not mentioned in the data file."""
        raise NotImplementedError

    def get_value(self, cp):
        """Return the symbolic property value for code point 'cp'."""
        raise NotImplementedError

    def to_numeric_value(self, value):
        """Map a symbolic property value to its numeric encoding."""
        raise NotImplementedError

    def get_numeric_value(self, cp):
        """Return the numeric property value for code point 'cp'."""
        raise NotImplementedError
class GraphemeClusterBreakPropertyTable(UnicodeProperty):
    """Grapheme_Cluster_Break property, parsed from GraphemeBreakProperty.txt."""

    # Note: Numeric values should be consistent with
    # '_GraphemeClusterBreakPropertyValue' enum on the Swift side, and with
    # 'GraphemeClusterBreakProperty' in the compiler C++ code. If there is a
    # reason for either of those to differ, then this mapping can be overridden
    # after an instance of this class is created.
    numeric_value_table = {
        'Other': 0,
        'CR': 1,
        'LF': 2,
        'Control': 3,
        'Extend': 4,
        'Regional_Indicator': 5,
        'Prepend': 6,
        'SpacingMark': 7,
        'L': 8,
        'V': 9,
        'T': 10,
        'LV': 11,
        'LVT': 12,
    }

    def __init__(self, grapheme_break_property_file_name):
        """Parse the given UCD data file and build the lookup tables."""
        # An array of tuples (start_code_point, end_code_point, value).
        #
        # Note: this and 'property_values' used to be class-level attributes,
        # which made every instance share (and accumulate data into) the same
        # lists.  They are per-instance state, so initialize them here.
        self.property_value_ranges = []
        # A flat array indexed by code point, filled in below.
        self.property_values = [ None ] * 0x110000

        # Build 'self.symbolic_values' -- an array that maps numeric property
        # values to symbolic values.
        self.symbolic_values = \
            [ None ] * (max(self.numeric_value_table.values()) + 1)
        for k, v in self.numeric_value_table.items():
            self.symbolic_values[v] = k

        # Load the data file.
        with codecs.open(
                grapheme_break_property_file_name,
                encoding=sys.getfilesystemencoding(),
                errors='strict') as f:
            for line in f:
                # Strip comments.
                line = re.sub('#.*', '', line)

                # Single code point?
                #
                # Note: the value character class must include '_'
                # (e.g. 'Regional_Indicator'), consistently with the range
                # pattern below; previously such single-code-point lines were
                # silently dropped.
                m = re.match('([0-9A-F]+) +; +([a-zA-Z_]+) ', line)
                if m:
                    code_point = int(m.group(1), 16)
                    value = m.group(2)
                    self.property_value_ranges += \
                        [(code_point, code_point, value)]
                    continue

                # Range of code points?
                m = re.match('([0-9A-F]+)..([0-9A-F]+) +; +([a-zA-Z_]+) ', line)
                if m:
                    start_code_point = int(m.group(1), 16)
                    end_code_point = int(m.group(2), 16)
                    value = m.group(3)
                    self.property_value_ranges += \
                        [(start_code_point, end_code_point, value)]

        # Prepare a flat lookup table for fast access.
        default_value = self.get_default_value()
        for cp in range(0, 0x110000):
            self.property_values[cp] = default_value
        for start_code_point, end_code_point, value in \
                self.property_value_ranges:
            for cp in range(start_code_point, end_code_point + 1):
                self.property_values[cp] = value

    def get_default_value(self):
        """Return the value for code points not listed in the data file."""
        return 'Other'

    def get_value(self, cp):
        """Return the symbolic Grapheme_Cluster_Break value for 'cp'."""
        return self.property_values[cp]

    def to_numeric_value(self, value):
        """Map a symbolic property value to its numeric encoding."""
        return self.numeric_value_table[value]

    def get_numeric_value(self, cp):
        """Return the numeric Grapheme_Cluster_Break value for 'cp'."""
        return self.to_numeric_value(self.get_value(cp))
# BMP code points are 16-bit values. The code point value is split as
# follows:
#
# 8 bits 8 bits
# +-------------------------+-------------------------+
# | 15 14 13 12 11 10 9 8 | 7 6 5 4 3 2 1 0 |
# +-------------------------+-------------------------+
# first-level index data offset
#
# Supplementary code points (U+XXXX where XXXX > 0xffff) are 21-bit values.
# The code point value is split as follows:
#
# 5 bits 8 bits 8 bits
# +----------------+-------------------------+-------------------------+
# | 20 19 18 17 16 | 15 14 13 12 11 10 9 8 | 7 6 5 4 3 2 1 0 |
# +----------------+-------------------------+-------------------------+
# first-level second-level index data offset
# index
#
# The actual numbers of bits are just trie parameters. They affect the size of
# the lookup tables (and thus, lookup time), but do not change the overall
# structure of the trie.
#
# Here and below 'supp' stands for 'supplementary characters'.
#
# Property data for BMP code points is stored as a one-stage trie.
# A trie with one lookup table consists of two memory blocks:
#
# First-level lookup table
# +-----+-----+-----+-----+--...--+
# | * | * | * | * | |
# +--|--+--|--+--|--+--|--+--...--+
# | | | \ The references don't form
# | \____| \___, a systematic pattern
# | | |
# | | | Data storage
# +-V--------++-V--------++-V--------++---...---+
# | data || data || data || |
# +----------++----------++----------++---...---+
#
# In order to fetch data for a given code point, you need to:
# * load from the first-level lookup table using first-level index; this will
# give you the number of the data block that you should use.
# * load from the data block applying the data offset.
#
# Property data for supplementary code points is stored as a two-stage trie.
# A trie with two-stage lookup tables consists of three memory blocks. The
# following drawing explains how it is implemented:
#
# First-level lookup table
# +-----+-----+-----+-----+-----+--...--+
# | * | * | * | * | * | |
# +--|--+--|--+--|--+--|--+--|--+--...--+
# | | | | \ The references don't form
# ,__/ | \____| \___, a systematic pattern
# / | | |
# | | | | Second-level lookup table
# +-V--------++-V--------++-V--------++-V--------++---...---+
# | ******** || ******** || ******** || || |
# +-||||||||-++-||||||||-++-||||||||-++----------++---...---+
# \\\|//// ||||||VV |VVV|V|V
# \\|/// |||||| / | |
# \|// |||||| / | |
# |/ ||||| \__|___. \ \ The references don't form
# | |||| \___|__. \ | \ a systematic pattern
# | ||| \____| \ \__| \
# | || \_____|__. \___|___\ ...___.
# | | \______| \____| \___, | Data storage
# +-V-----++-V-----++-V-----++-V-----++-V-----++-V-----++---...---+
# | data || data || data || data || || || |
# +-------++-------++-------++-------++-------++-------++---...---+
#
# In order to fetch data for a given code point, you need to:
# * load from the first-level lookup table using first-level index; this will
# give you the number of the second-level lookup table that you should use.
# * load from the chosen second-level lookup table using the second-level
# index, which will give you the number of the data block that you should
# use.
# * load from the data block applying the data offset.
#
# First- and second-level lookup tables in the general case contain 16-bit
# words; that will be sufficient to store a trie that does not compress at all.
# But in many cases, after trie compression there will be fewer than 256
# unique second-level lookup tables and/or data storage blocks, which allows
# one to use 8-bit words in lookup tables.
#
# The bitwidth of data depends on the application of the trie.
#
# The supp tables contain entries for BMP code units to simplify trie
# implementation, but those BMP entries are filled with the default value, so
# they compress well.
class UnicodeTrieGenerator(object):
    """Generates a compressed trie that maps Unicode code points to values.

    BMP code points are looked up through a one-stage trie
    (BMP_lookup -> BMP_data); supplementary code points go through a
    two-stage trie (supp_lookup1 -> supp_lookup2 -> supp_data).  See the
    ASCII art above for the memory layout.

    Usage: optionally adjust the '*_index_bits' parameters, then call
    'create_tables()', fill in values ('splat()', 'set_value()' or
    'fill_from_unicode_property()'), 'freeze()' to deduplicate blocks, and
    finally 'serialize()' to produce 'self.trie_bytes'.
    """

    # Trie parameters.
    # Note: if you change any of these parameters, don't forget to update
    # the ASCII art above.
    BMP_first_level_index_bits = 8
    supp_first_level_index_bits = 5
    supp_second_level_index_bits = 8

    def get_BMP_first_level_index(self, cp):
        # The high bits of a 16-bit BMP code point select the lookup entry.
        return cp >> self.BMP_data_offset_bits

    def get_BMP_data_offset(self, cp):
        # The low bits are the offset into the selected data block.
        return cp & ((1 << self.BMP_data_offset_bits) - 1)

    def get_supp_first_level_index(self, cp):
        return cp >> (self.supp_second_level_index_bits +
                      self.supp_data_offset_bits)

    def get_supp_second_level_index(self, cp):
        return (cp >> self.supp_data_offset_bits) & \
            ((1 << self.supp_second_level_index_bits) - 1)

    def get_supp_data_offset(self, cp):
        return cp & ((1 << self.supp_data_offset_bits) - 1)

    def __init__(self):
        """Create a trie generator with default parameters."""
        pass

    def create_tables(self):
        """Compute derived parameter values and create internal data
        structures.

        Don't change parameter values after calling this method.
        """
        self.BMP_data_offset_bits = 16 - self.BMP_first_level_index_bits
        self.supp_data_offset_bits = \
            21 - self.supp_first_level_index_bits - \
            self.supp_second_level_index_bits

        # The maximum value of the first level index for supp tables.  It is
        # not equal to ((1 << supp_first_level_index_bits) - 1), because the
        # maximum Unicode code point value is not 2^21-1 (0x1fffff), it is
        # 0x10ffff.
        self.supp_first_level_index_max = \
            0x10ffff >> (self.supp_second_level_index_bits +
                         self.supp_data_offset_bits)

        # A mapping from BMP first-level index to BMP data block index.
        # Initially the identity mapping; 'freeze()' compresses it.
        self.BMP_lookup = list(range(0, 1 << self.BMP_first_level_index_bits))

        # An array of BMP data blocks.
        self.BMP_data = [
            [ -1 ] * (1 << self.BMP_data_offset_bits)
            for _ in range(0, 1 << self.BMP_first_level_index_bits) ]

        # A mapping from supp first-level index to an index of the second-level
        # lookup table.
        self.supp_lookup1 = \
            list(range(0, self.supp_first_level_index_max + 1))

        # An array of second-level lookup tables.  Each second-level lookup
        # table is a mapping from a supp second-level index to supp data block
        # index.
        self.supp_lookup2 = [
            list(range(i << self.supp_second_level_index_bits,
                       (i + 1) << self.supp_second_level_index_bits))
            for i in range(0, self.supp_first_level_index_max + 1) ]

        # An array of supp data blocks.
        self.supp_data = [
            [ -1 ] * (1 << self.supp_data_offset_bits)
            for _ in range(
                0,
                (self.supp_first_level_index_max + 1) *
                (1 << self.supp_second_level_index_bits)) ]

    def splat(self, value):
        """Set every entry of every BMP and supp data block to 'value'."""
        for block in self.BMP_data:
            for j in range(0, len(block)):
                block[j] = value
        for block in self.supp_data:
            for j in range(0, len(block)):
                block[j] = value

    def set_value(self, cp, value):
        """Store 'value' for code point 'cp'."""
        if cp <= 0xffff:
            data_block_index = \
                self.BMP_lookup[self.get_BMP_first_level_index(cp)]
            self.BMP_data[data_block_index][self.get_BMP_data_offset(cp)] = \
                value
        else:
            second_lookup_index = \
                self.supp_lookup1[self.get_supp_first_level_index(cp)]
            data_block_index = self.supp_lookup2[second_lookup_index][
                self.get_supp_second_level_index(cp)]
            self.supp_data[data_block_index][self.get_supp_data_offset(cp)] = \
                value

    def get_value(self, cp):
        """Return the value stored for code point 'cp'."""
        if cp <= 0xffff:
            data_block_index = \
                self.BMP_lookup[self.get_BMP_first_level_index(cp)]
            return self.BMP_data[data_block_index][
                self.get_BMP_data_offset(cp)]
        else:
            second_lookup_index = \
                self.supp_lookup1[self.get_supp_first_level_index(cp)]
            data_block_index = self.supp_lookup2[second_lookup_index][
                self.get_supp_second_level_index(cp)]
            return self.supp_data[data_block_index][
                self.get_supp_data_offset(cp)]

    def fill_from_unicode_property(self, unicode_property):
        """Populate the trie with values from 'unicode_property'."""
        self.splat(unicode_property.get_default_value())
        for cp in range(0, 0x110000):
            self.set_value(cp, unicode_property.get_value(cp))

    def verify(self, unicode_property):
        """Check that the trie agrees with 'unicode_property' everywhere."""
        for cp in range(0, 0x110000):
            expected_value = unicode_property.get_value(cp)
            actual_value = self.get_value(cp)
            assert(expected_value == actual_value)

    def freeze(self):
        """Compress internal trie representation.

        Don't mutate the trie after calling this method.
        """
        def remap_indexes(indexes, old_idx, new_idx):
            # Redirect references to the removed block ('old_idx') to its
            # surviving duplicate ('new_idx'), and shift down references to
            # everything that followed the removed block.
            def map_index(idx):
                if idx == old_idx:
                    return new_idx
                elif idx > old_idx:
                    return idx - 1
                else:
                    return idx
            # NOTE: Python 2's `map` function returns a list. Where Python 3's
            # `map` function returns an iterator. To work around this the
            # result of the `map` is explicitly converted to a `list`.
            return list(map(map_index, indexes))

        # If self.BMP_data contains identical data blocks, keep the first one,
        # remove duplicates and change the indexes in self.BMP_lookup to point
        # to the first one.
        i = 0
        while i < len(self.BMP_data):
            j = i + 1
            while j < len(self.BMP_data):
                if self.BMP_data[i] == self.BMP_data[j]:
                    self.BMP_data.pop(j)
                    self.BMP_lookup = \
                        remap_indexes(self.BMP_lookup, old_idx=j, new_idx=i)
                else:
                    j += 1
            i += 1

        # For supp tables, perform bottom-up deduplication: first, deduplicate
        # data blocks. The algorithm is the same as above, but operates on
        # self.supp_data/supp_lookup2.
        i = 0
        while i < len(self.supp_data):
            j = i + 1
            while j < len(self.supp_data):
                if self.supp_data[i] == self.supp_data[j]:
                    self.supp_data.pop(j)
                    for k in range(0, len(self.supp_lookup2)):
                        self.supp_lookup2[k] = \
                            remap_indexes(self.supp_lookup2[k], old_idx=j,
                                          new_idx=i)
                else:
                    j += 1
            i += 1

        # Next, deduplicate second-level lookup tables.
        # Same as above, but for supp_lookup1/supp_lookup2.
        i = 0
        while i < len(self.supp_lookup2):
            j = i + 1
            while j < len(self.supp_lookup2):
                if self.supp_lookup2[i] == self.supp_lookup2[j]:
                    self.supp_lookup2.pop(j)
                    self.supp_lookup1 = \
                        remap_indexes(self.supp_lookup1, old_idx=j, new_idx=i)
                else:
                    j += 1
            i += 1

    def _int_to_LE_bytes(self, data, width):
        """Serialize 'data' into a list of 'width' little-endian bytes.

        Raises ValueError for an unsupported width.
        """
        if width == 1:
            assert(data & ~0xff == 0)
            return [ data ]
        if width == 2:
            assert(data & ~0xffff == 0)
            # Note: the high byte must be shifted down into the 0..0xff
            # range; the previous 'data & 0xff00' produced values up to
            # 0xff00, which are not bytes at all.
            return [ data & 0xff, (data >> 8) & 0xff ]
        raise ValueError('unsupported width: %d' % width)

    def _int_list_to_LE_bytes(self, ints, width):
        """Serialize every element of 'ints' as 'width' little-endian bytes."""
        return [
            byte
            for elt in ints
            for byte in self._int_to_LE_bytes(elt, width) ]

    def serialize(self, unicode_property):
        """Flatten the trie into a byte array, 'self.trie_bytes'.

        'unicode_property.to_numeric_value()' is used to encode data entries.
        The offset of each sub-table within 'self.trie_bytes' is recorded in
        the corresponding 'self.*_bytes_offset' attribute.
        """
        # Lookup table entries only need two bytes when they must index 256
        # or more blocks; data entries always fit in one byte.
        self.BMP_lookup_bytes_per_entry = 1 if len(self.BMP_data) < 256 else 2
        self.BMP_data_bytes_per_entry = 1
        self.supp_lookup1_bytes_per_entry = \
            1 if len(self.supp_lookup2) < 256 else 2
        self.supp_lookup2_bytes_per_entry = \
            1 if len(self.supp_data) < 256 else 2
        self.supp_data_bytes_per_entry = 1

        BMP_lookup_words = [ elt for elt in self.BMP_lookup ]
        BMP_data_words = [
            unicode_property.to_numeric_value(elt)
            for block in self.BMP_data
            for elt in block ]
        supp_lookup1_words = [ elt for elt in self.supp_lookup1 ]
        supp_lookup2_words = [
            elt for block in self.supp_lookup2 for elt in block ]
        supp_data_words = [
            unicode_property.to_numeric_value(elt)
            for block in self.supp_data
            for elt in block ]

        BMP_lookup_bytes = self._int_list_to_LE_bytes(
            BMP_lookup_words, self.BMP_lookup_bytes_per_entry)
        BMP_data_bytes = self._int_list_to_LE_bytes(
            BMP_data_words, self.BMP_data_bytes_per_entry)
        supp_lookup1_bytes = self._int_list_to_LE_bytes(
            supp_lookup1_words, self.supp_lookup1_bytes_per_entry)
        supp_lookup2_bytes = self._int_list_to_LE_bytes(
            supp_lookup2_words, self.supp_lookup2_bytes_per_entry)
        supp_data_bytes = self._int_list_to_LE_bytes(
            supp_data_words, self.supp_data_bytes_per_entry)

        self.trie_bytes = []
        self.BMP_lookup_bytes_offset = 0
        self.trie_bytes += BMP_lookup_bytes
        self.BMP_data_bytes_offset = len(self.trie_bytes)
        self.trie_bytes += BMP_data_bytes
        self.supp_lookup1_bytes_offset = len(self.trie_bytes)
        self.trie_bytes += supp_lookup1_bytes
        self.supp_lookup2_bytes_offset = len(self.trie_bytes)
        self.trie_bytes += supp_lookup2_bytes
        self.supp_data_bytes_offset = len(self.trie_bytes)
        self.trie_bytes += supp_data_bytes
def get_extended_grapheme_cluster_rules_matrix(grapheme_cluster_break_property_table):
    """Return the grapheme cluster boundary rules as a packed bit matrix.

    The result is a list with one integer per Grapheme_Cluster_Break value
    (the "first" code point of a pair); bit 'i' of that integer is set when
    there is NO extended grapheme cluster boundary between the first value
    and the i-th symbolic value (the "second" code point of the pair).
    """
    all_values = \
        grapheme_cluster_break_property_table.symbolic_values

    # Rules to determine extended grapheme cluster boundaries, as defined in
    # 'Grapheme Break Chart',
    # http://www.unicode.org/Public/6.3.0/ucd/auxiliary/GraphemeBreakTest.html,
    # Unicode 6.3.0.
    #
    # The Unicode 7.0.0 draft does not change these rules.
    #
    # As in the referenced document, the rules are specified in order of
    # decreasing priority.
    rules = [
        ( [ 'CR' ], 'no_boundary', [ 'LF' ] ),
        ( [ 'Control', 'CR', 'LF' ], 'boundary', all_values ),
        ( all_values, 'boundary', [ 'Control', 'CR', 'LF' ] ),
        ( [ 'L' ], 'no_boundary', [ 'L', 'V', 'LV', 'LVT' ] ),
        ( [ 'LV', 'V' ], 'no_boundary', [ 'V', 'T' ] ),
        ( [ 'LVT', 'T' ], 'no_boundary', [ 'T' ] ),
        ( [ 'Regional_Indicator' ], 'no_boundary', [ 'Regional_Indicator' ] ),
        ( all_values, 'no_boundary', [ 'Extend' ] ),
        ( all_values, 'no_boundary', [ 'SpacingMark' ] ),
        ( [ 'Prepend' ], 'no_boundary', all_values ),
        ( all_values, 'boundary', all_values ),
    ]

    # Expand the rules into a matrix.  Applying them from lowest to highest
    # priority means a higher-priority rule simply overwrites the cells of
    # any lower-priority rule it overlaps.
    rules_matrix = { first: dict.fromkeys(all_values, None)
                     for first in all_values }
    for first_list, action, second_list in reversed(rules):
        for first in first_list:
            for second in second_list:
                rules_matrix[first][second] = action

    # Make sure we can pack one row of the matrix into a 'uint16_t'.
    assert(len(all_values) <= 16)

    packed_rows = []
    for first in all_values:
        # Pack the row for this "first" value into an integer, one bit per
        # possible "second" value.
        row = rules_matrix[first]
        packed = 0
        for i, second in enumerate(all_values):
            if row[second] == 'no_boundary':
                packed |= 1 << i
        packed_rows += [ packed ]
    return packed_rows
def get_grapheme_cluster_break_tests_as_UTF8(grapheme_break_test_file_name):
    """Parse GraphemeBreakTest.txt into UTF-8-based test cases.

    Returns a list of (test, boundaries) tuples, where 'test' is a string of
    C-style '\\xNN' escapes spelling out the UTF-8 encoding of the test's
    code points, and 'boundaries' lists the byte offsets at which extended
    grapheme cluster boundaries occur.
    """

    def _convert_line(line):
        # Strip comments.
        line = re.sub('#.*', '', line).strip()
        if line == "":
            # Blank or comment-only line: no test here.
            return None

        escaped_test = ""
        byte_count = 0
        break_offsets = []

        # A test line is a space-separated alternation of boundary markers
        # and code points in hex.
        for token in line.split(" "):
            if token == u"÷":
                # A grapheme cluster boundary at the current byte offset.
                break_offsets += [ byte_count ]
            elif token == u"×":
                # Explicit "no boundary" marker -- nothing to record.
                continue
            else:
                code_point = int(token, 16)

                # Tests from Unicode spec have isolated surrogates in them. Our
                # segmentation algorithm works on UTF-8 sequences, so encoding a
                # surrogate would produce an invalid code unit sequence. Instead
                # of trying to emulate the maximal subpart algorithm for inserting
                # U+FFFD in Python, we just replace every isolated surrogate with
                # U+200B, which also has Grapheme_Cluster_Break equal to 'Control'
                # and test separately that we handle ill-formed UTF-8 sequences.
                if 0xd800 <= code_point <= 0xdfff:
                    code_point = 0x200b

                # Build the character by decoding a '\Uxxxxxxxx' escape --
                # presumably to sidestep chr()/unichr() limits on narrow
                # Python 2 builds (TODO confirm).
                character = (b'\U%(cp)08x' % { b'cp': code_point }).decode('unicode_escape', 'strict')
                utf8_bytes = bytearray(character.encode('utf8', 'strict'))
                escaped_test += ''.join(
                    ['\\x%(byte)02x' % { 'byte': byte }
                     for byte in utf8_bytes])
                byte_count += len(utf8_bytes)

        return (escaped_test, break_offsets)

    # Self-test.
    assert(_convert_line(u'÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [ 0, 5, 8 ]))
    assert(_convert_line(u'÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [ 0, 3 ]))

    tests = []
    with codecs.open(grapheme_break_test_file_name, encoding=sys.getfilesystemencoding(), errors='strict') as f:
        for line in f:
            converted = _convert_line(line)
            if converted:
                tests += [ converted ]
    return tests
def get_grapheme_cluster_break_tests_as_unicode_scalars(grapheme_break_test_file_name):
    """Parse GraphemeBreakTest.txt into Unicode-scalar-based test cases.

    Returns a list of (test, boundaries) tuples, where 'test' is a list of
    code points and 'boundaries' lists the code point indexes at which
    extended grapheme cluster boundaries occur.
    """
    def _convert_line(line):
        # Strip comments.
        line = re.sub('#.*', '', line).strip()
        if line == "":
            return None
        test = []
        curr_code_points = 0
        boundaries = []
        # Match a list of code points.
        for token in line.split(" "):
            if token == u"÷":
                boundaries += [ curr_code_points ]
            elif token == u"×":
                pass
            else:
                code_point = int(token, 16)
                # Tests from Unicode spec have isolated surrogates in them. Our
                # segmentation algorithm works on UTF-16 sequences, so encoding a
                # surrogate would produce an invalid code unit sequence. Instead
                # of trying to emulate the maximal subpart algorithm for inserting
                # U+FFFD in Python, we just replace every isolated surrogate with
                # U+200B, which also has Grapheme_Cluster_Break equal to 'Control'
                # and test separately that we handle ill-formed UTF-8 sequences.
                if code_point >= 0xd800 and code_point <= 0xdfff:
                    code_point = 0x200b
                test += [ code_point ]
                curr_code_points += 1
        return (test, boundaries)

    # Self-test.
    assert(_convert_line(u'÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ([ 0x0903, 0x0308, 0xac01 ], [ 0, 2, 3 ]))
    assert(_convert_line(u'÷ D800 ÷ # abc') == ([ 0x200b ], [ 0, 1 ]))

    result = []
    # Decode the file to text, consistently with
    # 'get_grapheme_cluster_break_tests_as_UTF8'.  The file was previously
    # opened in binary mode ('rb'), which made every line a bytes object:
    # on Python 3, 're.sub' with a str pattern then raises TypeError, and
    # the byte tokens could never compare equal to the str markers above.
    with codecs.open(grapheme_break_test_file_name, encoding=sys.getfilesystemencoding(), errors='strict') as f:
        for line in f:
            test = _convert_line(line)
            if test:
                result += [ test ]
    return result