blob: 8f2fdff394118a96f00f2114cf4f7698d13e2943 [file] [log] [blame]
##===--------------------------------------------------*- coding: utf-8 -*-===##
##
## This source file is part of the Swift.org open source project
##
## Copyright (c) 2014 - 2016 Apple Inc. and the Swift project authors
## Licensed under Apache License v2.0 with Runtime Library Exception
##
## See http://swift.org/LICENSE.txt for license information
## See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
##
##===----------------------------------------------------------------------===##
import re
import sys
import codecs
class UnicodeProperty(object):
    """Abstract base class for Unicode properties.

    Subclasses provide a mapping from code points to symbolic property
    values, and from symbolic values to their numeric encoding.
    """

    def __init__(self):
        # Note: 'raise NotImplemented' (the singleton, formerly used here) is
        # not an exception; on Python 3 it raises a TypeError instead of the
        # intended error.  'NotImplementedError' is the correct exception.
        raise NotImplementedError

    def get_default_value(self):
        """Return the value for code points not mentioned in the data file."""
        raise NotImplementedError

    def get_value(self, cp):
        """Return the symbolic property value for code point 'cp'."""
        raise NotImplementedError

    def to_numeric_value(self, value):
        """Map a symbolic property value to its numeric encoding."""
        raise NotImplementedError

    def get_numeric_value(self, cp):
        """Return the numeric property value for code point 'cp'."""
        raise NotImplementedError
class GraphemeClusterBreakPropertyTable(UnicodeProperty):
    """Grapheme_Cluster_Break property, parsed from GraphemeBreakProperty.txt."""

    # Note: Numeric values should be consistent with
    # '_GraphemeClusterBreakPropertyValue' enum on the Swift side, and with
    # 'GraphemeClusterBreakProperty' in the compiler C++ code. If there is a
    # reason for either of those to differ, then this mapping can be overridden
    # after an instance of this class is created.
    numeric_value_table = {
        'Other': 0,
        'CR': 1,
        'LF': 2,
        'Control': 3,
        'Extend': 4,
        'Regional_Indicator': 5,
        'Prepend': 6,
        'SpacingMark': 7,
        'L': 8,
        'V': 9,
        'T': 10,
        'LV': 11,
        'LVT': 12,
    }

    def __init__(self, grapheme_break_property_file_name):
        """Parse the given UCD data file and build the lookup tables."""
        # An array of tuples (start_code_point, end_code_point, value).
        #
        # Note: this and 'property_values' used to be class-level attributes,
        # which made every instance share (and accumulate data into) the same
        # lists.  They are per-instance state, so initialize them here.
        self.property_value_ranges = []
        # A flat array indexed by code point, filled in below.
        self.property_values = [ None ] * 0x110000

        # Build 'self.symbolic_values' -- an array that maps numeric property
        # values to symbolic values.
        self.symbolic_values = \
            [ None ] * (max(self.numeric_value_table.values()) + 1)
        for k, v in self.numeric_value_table.items():
            self.symbolic_values[v] = k

        # Load the data file.
        with codecs.open(
                grapheme_break_property_file_name,
                encoding=sys.getfilesystemencoding(),
                errors='strict') as f:
            for line in f:
                # Strip comments.
                line = re.sub('#.*', '', line)

                # Single code point?
                #
                # Note: the value character class must include '_'
                # (e.g. 'Regional_Indicator'), consistently with the range
                # pattern below; previously such single-code-point lines were
                # silently dropped.
                m = re.match('([0-9A-F]+) +; +([a-zA-Z_]+) ', line)
                if m:
                    code_point = int(m.group(1), 16)
                    value = m.group(2)
                    self.property_value_ranges += \
                        [(code_point, code_point, value)]
                    continue

                # Range of code points?
                m = re.match('([0-9A-F]+)..([0-9A-F]+) +; +([a-zA-Z_]+) ', line)
                if m:
                    start_code_point = int(m.group(1), 16)
                    end_code_point = int(m.group(2), 16)
                    value = m.group(3)
                    self.property_value_ranges += \
                        [(start_code_point, end_code_point, value)]

        # Prepare a flat lookup table for fast access.
        default_value = self.get_default_value()
        for cp in range(0, 0x110000):
            self.property_values[cp] = default_value
        for start_code_point, end_code_point, value in \
                self.property_value_ranges:
            for cp in range(start_code_point, end_code_point + 1):
                self.property_values[cp] = value

    def get_default_value(self):
        """Return the value for code points not listed in the data file."""
        return 'Other'

    def get_value(self, cp):
        """Return the symbolic Grapheme_Cluster_Break value for 'cp'."""
        return self.property_values[cp]

    def to_numeric_value(self, value):
        """Map a symbolic property value to its numeric encoding."""
        return self.numeric_value_table[value]

    def get_numeric_value(self, cp):
        """Return the numeric Grapheme_Cluster_Break value for 'cp'."""
        return self.to_numeric_value(self.get_value(cp))
# BMP code points are 16-bit values. The code point value is split as
# follows:
#
# 8 bits 8 bits
# +-------------------------+-------------------------+
# | 15 14 13 12 11 10 9 8 | 7 6 5 4 3 2 1 0 |
# +-------------------------+-------------------------+
# first-level index data offset
#
# Supplementary code points (U+XXXX where XXXX > 0xffff) are 21-bit values.
# The code point value is split as follows:
#
# 5 bits 8 bits 8 bits
# +----------------+-------------------------+-------------------------+
# | 20 19 18 17 16 | 15 14 13 12 11 10 9 8 | 7 6 5 4 3 2 1 0 |
# +----------------+-------------------------+-------------------------+
# first-level second-level index data offset
# index
#
# The actual numbers of bits are just trie parameters. They affect the size of
# the lookup tables (and thus, lookup time), but do not change the overall
# structure of the trie.
#
# Here and below 'supp' stands for 'supplementary characters'.
#
# Property data for BMP code points is stored as a one-stage trie.
# A trie with one lookup table consists of two memory blocks:
#
# First-level lookup table
# +-----+-----+-----+-----+--...--+
# | * | * | * | * | |
# +--|--+--|--+--|--+--|--+--...--+
# | | | \ The references don't form
# | \____| \___, a systematic pattern
# | | |
# | | | Data storage
# +-V--------++-V--------++-V--------++---...---+
# | data || data || data || |
# +----------++----------++----------++---...---+
#
# In order to fetch data for a given code point, you need to:
# * load from the first-level lookup table using first-level index; this will
# give you the number of the data block that you should use.
# * load from the data block applying the data offset.
#
# Property data for supplementary code points is stored as a two-stage trie.
# A trie with two-stage lookup tables consists of three memory blocks. The
# following drawing explains how it is implemented:
#
# First-level lookup table
# +-----+-----+-----+-----+-----+--...--+
# | * | * | * | * | * | |
# +--|--+--|--+--|--+--|--+--|--+--...--+
# | | | | \ The references don't form
# ,__/ | \____| \___, a systematic pattern
# / | | |
# | | | | Second-level lookup table
# +-V--------++-V--------++-V--------++-V--------++---...---+
# | ******** || ******** || ******** || || |
# +-||||||||-++-||||||||-++-||||||||-++----------++---...---+
# \\\|//// ||||||VV |VVV|V|V
# \\|/// |||||| / | |
# \|// |||||| / | |
# |/ ||||| \__|___. \ \ The references don't form
# | |||| \___|__. \ | \ a systematic pattern
# | ||| \____| \ \__| \
# | || \_____|__. \___|___\ ...___.
# | | \______| \____| \___, | Data storage
# +-V-----++-V-----++-V-----++-V-----++-V-----++-V-----++---...---+
# | data || data || data || data || || || |
# +-------++-------++-------++-------++-------++-------++---...---+
#
# In order to fetch data for a given code point, you need to:
# * load from the first-level lookup table using first-level index; this will
# give you the number of the second-level lookup table that you should use.
# * load from the chosen second-level lookup table using the second-level
# index, which will give you the number of the data block that you should
# use.
# * load from the data block applying the data offset.
#
# First- and second-level lookup tables in the general case contain 16-bit
# words; that will be sufficient to store a trie that does not compress at all.
# But in many cases, after trie compression there will be fewer than 256
# unique second-level lookup tables and/or data storage blocks, which allows
# one to use 8-bit words in lookup tables.
#
# The bitwidth of data depends on the application of the trie.
#
# The supp tables contain entries for BMP code units to simplify trie
# implementation, but those BMP entries are filled with the default value, so
# they compress well.
class UnicodeTrieGenerator(object):
    """Generates a compressed trie that maps Unicode code points to values.

    BMP code points are looked up through a one-stage trie
    (BMP_lookup -> BMP_data); supplementary code points go through a
    two-stage trie (supp_lookup1 -> supp_lookup2 -> supp_data).  See the
    ASCII art above for the memory layout.

    Usage: optionally adjust the '*_index_bits' parameters, then call
    'create_tables()', fill in values ('splat()', 'set_value()' or
    'fill_from_unicode_property()'), 'freeze()' to deduplicate blocks, and
    finally 'serialize()' to produce 'self.trie_bytes'.
    """

    # Trie parameters.
    # Note: if you change any of these parameters, don't forget to update
    # the ASCII art above.
    BMP_first_level_index_bits = 8
    supp_first_level_index_bits = 5
    supp_second_level_index_bits = 8

    def get_BMP_first_level_index(self, cp):
        # The high bits of a 16-bit BMP code point select the lookup entry.
        return cp >> self.BMP_data_offset_bits

    def get_BMP_data_offset(self, cp):
        # The low bits are the offset into the selected data block.
        return cp & ((1 << self.BMP_data_offset_bits) - 1)

    def get_supp_first_level_index(self, cp):
        return cp >> (self.supp_second_level_index_bits +
                      self.supp_data_offset_bits)

    def get_supp_second_level_index(self, cp):
        return (cp >> self.supp_data_offset_bits) & \
            ((1 << self.supp_second_level_index_bits) - 1)

    def get_supp_data_offset(self, cp):
        return cp & ((1 << self.supp_data_offset_bits) - 1)

    def __init__(self):
        """Create a trie generator with default parameters."""
        pass

    def create_tables(self):
        """Compute derived parameter values and create internal data
        structures.

        Don't change parameter values after calling this method.
        """
        self.BMP_data_offset_bits = 16 - self.BMP_first_level_index_bits
        self.supp_data_offset_bits = \
            21 - self.supp_first_level_index_bits - \
            self.supp_second_level_index_bits

        # The maximum value of the first level index for supp tables.  It is
        # not equal to ((1 << supp_first_level_index_bits) - 1), because the
        # maximum Unicode code point value is not 2^21-1 (0x1fffff), it is
        # 0x10ffff.
        self.supp_first_level_index_max = \
            0x10ffff >> (self.supp_second_level_index_bits +
                         self.supp_data_offset_bits)

        # A mapping from BMP first-level index to BMP data block index.
        # Initially the identity mapping; 'freeze()' compresses it.
        self.BMP_lookup = list(range(0, 1 << self.BMP_first_level_index_bits))

        # An array of BMP data blocks.
        self.BMP_data = [
            [ -1 ] * (1 << self.BMP_data_offset_bits)
            for _ in range(0, 1 << self.BMP_first_level_index_bits) ]

        # A mapping from supp first-level index to an index of the second-level
        # lookup table.
        self.supp_lookup1 = \
            list(range(0, self.supp_first_level_index_max + 1))

        # An array of second-level lookup tables.  Each second-level lookup
        # table is a mapping from a supp second-level index to supp data block
        # index.
        self.supp_lookup2 = [
            list(range(i << self.supp_second_level_index_bits,
                       (i + 1) << self.supp_second_level_index_bits))
            for i in range(0, self.supp_first_level_index_max + 1) ]

        # An array of supp data blocks.
        self.supp_data = [
            [ -1 ] * (1 << self.supp_data_offset_bits)
            for _ in range(
                0,
                (self.supp_first_level_index_max + 1) *
                (1 << self.supp_second_level_index_bits)) ]

    def splat(self, value):
        """Set every entry of every BMP and supp data block to 'value'."""
        for block in self.BMP_data:
            for j in range(0, len(block)):
                block[j] = value
        for block in self.supp_data:
            for j in range(0, len(block)):
                block[j] = value

    def set_value(self, cp, value):
        """Store 'value' for code point 'cp'."""
        if cp <= 0xffff:
            data_block_index = \
                self.BMP_lookup[self.get_BMP_first_level_index(cp)]
            self.BMP_data[data_block_index][self.get_BMP_data_offset(cp)] = \
                value
        else:
            second_lookup_index = \
                self.supp_lookup1[self.get_supp_first_level_index(cp)]
            data_block_index = self.supp_lookup2[second_lookup_index][
                self.get_supp_second_level_index(cp)]
            self.supp_data[data_block_index][self.get_supp_data_offset(cp)] = \
                value

    def get_value(self, cp):
        """Return the value stored for code point 'cp'."""
        if cp <= 0xffff:
            data_block_index = \
                self.BMP_lookup[self.get_BMP_first_level_index(cp)]
            return self.BMP_data[data_block_index][
                self.get_BMP_data_offset(cp)]
        else:
            second_lookup_index = \
                self.supp_lookup1[self.get_supp_first_level_index(cp)]
            data_block_index = self.supp_lookup2[second_lookup_index][
                self.get_supp_second_level_index(cp)]
            return self.supp_data[data_block_index][
                self.get_supp_data_offset(cp)]

    def fill_from_unicode_property(self, unicode_property):
        """Populate the trie with values from 'unicode_property'."""
        self.splat(unicode_property.get_default_value())
        for cp in range(0, 0x110000):
            self.set_value(cp, unicode_property.get_value(cp))

    def verify(self, unicode_property):
        """Check that the trie agrees with 'unicode_property' everywhere."""
        for cp in range(0, 0x110000):
            expected_value = unicode_property.get_value(cp)
            actual_value = self.get_value(cp)
            assert(expected_value == actual_value)

    def freeze(self):
        """Compress internal trie representation.

        Don't mutate the trie after calling this method.
        """
        def remap_indexes(indexes, old_idx, new_idx):
            # Redirect references to the removed block ('old_idx') to its
            # surviving duplicate ('new_idx'), and shift down references to
            # everything that followed the removed block.
            def map_index(idx):
                if idx == old_idx:
                    return new_idx
                elif idx > old_idx:
                    return idx - 1
                else:
                    return idx
            # NOTE: Python 2's `map` function returns a list. Where Python 3's
            # `map` function returns an iterator. To work around this the
            # result of the `map` is explicitly converted to a `list`.
            return list(map(map_index, indexes))

        # If self.BMP_data contains identical data blocks, keep the first one,
        # remove duplicates and change the indexes in self.BMP_lookup to point
        # to the first one.
        i = 0
        while i < len(self.BMP_data):
            j = i + 1
            while j < len(self.BMP_data):
                if self.BMP_data[i] == self.BMP_data[j]:
                    self.BMP_data.pop(j)
                    self.BMP_lookup = \
                        remap_indexes(self.BMP_lookup, old_idx=j, new_idx=i)
                else:
                    j += 1
            i += 1

        # For supp tables, perform bottom-up deduplication: first, deduplicate
        # data blocks. The algorithm is the same as above, but operates on
        # self.supp_data/supp_lookup2.
        i = 0
        while i < len(self.supp_data):
            j = i + 1
            while j < len(self.supp_data):
                if self.supp_data[i] == self.supp_data[j]:
                    self.supp_data.pop(j)
                    for k in range(0, len(self.supp_lookup2)):
                        self.supp_lookup2[k] = \
                            remap_indexes(self.supp_lookup2[k], old_idx=j,
                                          new_idx=i)
                else:
                    j += 1
            i += 1

        # Next, deduplicate second-level lookup tables.
        # Same as above, but for supp_lookup1/supp_lookup2.
        i = 0
        while i < len(self.supp_lookup2):
            j = i + 1
            while j < len(self.supp_lookup2):
                if self.supp_lookup2[i] == self.supp_lookup2[j]:
                    self.supp_lookup2.pop(j)
                    self.supp_lookup1 = \
                        remap_indexes(self.supp_lookup1, old_idx=j, new_idx=i)
                else:
                    j += 1
            i += 1

    def _int_to_LE_bytes(self, data, width):
        """Serialize 'data' into a list of 'width' little-endian bytes.

        Raises ValueError for an unsupported width.
        """
        if width == 1:
            assert(data & ~0xff == 0)
            return [ data ]
        if width == 2:
            assert(data & ~0xffff == 0)
            # Note: the high byte must be shifted down into the 0..0xff
            # range; the previous 'data & 0xff00' produced values up to
            # 0xff00, which are not bytes at all.
            return [ data & 0xff, (data >> 8) & 0xff ]
        raise ValueError('unsupported width: %d' % width)

    def _int_list_to_LE_bytes(self, ints, width):
        """Serialize every element of 'ints' as 'width' little-endian bytes."""
        return [
            byte
            for elt in ints
            for byte in self._int_to_LE_bytes(elt, width) ]

    def serialize(self, unicode_property):
        """Flatten the trie into a byte array, 'self.trie_bytes'.

        'unicode_property.to_numeric_value()' is used to encode data entries.
        The offset of each sub-table within 'self.trie_bytes' is recorded in
        the corresponding 'self.*_bytes_offset' attribute.
        """
        # Lookup table entries only need two bytes when they must index 256
        # or more blocks; data entries always fit in one byte.
        self.BMP_lookup_bytes_per_entry = 1 if len(self.BMP_data) < 256 else 2
        self.BMP_data_bytes_per_entry = 1
        self.supp_lookup1_bytes_per_entry = \
            1 if len(self.supp_lookup2) < 256 else 2
        self.supp_lookup2_bytes_per_entry = \
            1 if len(self.supp_data) < 256 else 2
        self.supp_data_bytes_per_entry = 1

        BMP_lookup_words = [ elt for elt in self.BMP_lookup ]
        BMP_data_words = [
            unicode_property.to_numeric_value(elt)
            for block in self.BMP_data
            for elt in block ]
        supp_lookup1_words = [ elt for elt in self.supp_lookup1 ]
        supp_lookup2_words = [
            elt for block in self.supp_lookup2 for elt in block ]
        supp_data_words = [
            unicode_property.to_numeric_value(elt)
            for block in self.supp_data
            for elt in block ]

        BMP_lookup_bytes = self._int_list_to_LE_bytes(
            BMP_lookup_words, self.BMP_lookup_bytes_per_entry)
        BMP_data_bytes = self._int_list_to_LE_bytes(
            BMP_data_words, self.BMP_data_bytes_per_entry)
        supp_lookup1_bytes = self._int_list_to_LE_bytes(
            supp_lookup1_words, self.supp_lookup1_bytes_per_entry)
        supp_lookup2_bytes = self._int_list_to_LE_bytes(
            supp_lookup2_words, self.supp_lookup2_bytes_per_entry)
        supp_data_bytes = self._int_list_to_LE_bytes(
            supp_data_words, self.supp_data_bytes_per_entry)

        self.trie_bytes = []
        self.BMP_lookup_bytes_offset = 0
        self.trie_bytes += BMP_lookup_bytes
        self.BMP_data_bytes_offset = len(self.trie_bytes)
        self.trie_bytes += BMP_data_bytes
        self.supp_lookup1_bytes_offset = len(self.trie_bytes)
        self.trie_bytes += supp_lookup1_bytes
        self.supp_lookup2_bytes_offset = len(self.trie_bytes)
        self.trie_bytes += supp_lookup2_bytes
        self.supp_data_bytes_offset = len(self.trie_bytes)
        self.trie_bytes += supp_data_bytes
def get_extended_grapheme_cluster_rules_matrix(grapheme_cluster_break_property_table):
    """Return the grapheme cluster boundary rules as a packed bit matrix.

    The result is a list with one integer per Grapheme_Cluster_Break value
    (the "first" code point of a pair); bit 'i' of that integer is set when
    there is NO extended grapheme cluster boundary between the first value
    and the i-th symbolic value (the "second" code point of the pair).
    """
    all_values = \
        grapheme_cluster_break_property_table.symbolic_values

    # Rules to determine extended grapheme cluster boundaries, as defined in
    # 'Grapheme Break Chart',
    # http://www.unicode.org/Public/6.3.0/ucd/auxiliary/GraphemeBreakTest.html,
    # Unicode 6.3.0.
    #
    # The Unicode 7.0.0 draft does not change these rules.
    #
    # As in the referenced document, the rules are specified in order of
    # decreasing priority.
    rules = [
        ( [ 'CR' ], 'no_boundary', [ 'LF' ] ),
        ( [ 'Control', 'CR', 'LF' ], 'boundary', all_values ),
        ( all_values, 'boundary', [ 'Control', 'CR', 'LF' ] ),
        ( [ 'L' ], 'no_boundary', [ 'L', 'V', 'LV', 'LVT' ] ),
        ( [ 'LV', 'V' ], 'no_boundary', [ 'V', 'T' ] ),
        ( [ 'LVT', 'T' ], 'no_boundary', [ 'T' ] ),
        ( [ 'Regional_Indicator' ], 'no_boundary', [ 'Regional_Indicator' ] ),
        ( all_values, 'no_boundary', [ 'Extend' ] ),
        ( all_values, 'no_boundary', [ 'SpacingMark' ] ),
        ( [ 'Prepend' ], 'no_boundary', all_values ),
        ( all_values, 'boundary', all_values ),
    ]

    # Expand the rules into a matrix.  Applying them from lowest to highest
    # priority means a higher-priority rule simply overwrites the cells of
    # any lower-priority rule it overlaps.
    rules_matrix = { first: dict.fromkeys(all_values, None)
                     for first in all_values }
    for first_list, action, second_list in reversed(rules):
        for first in first_list:
            for second in second_list:
                rules_matrix[first][second] = action

    # Make sure we can pack one row of the matrix into a 'uint16_t'.
    assert(len(all_values) <= 16)

    packed_rows = []
    for first in all_values:
        # Pack the row for this "first" value into an integer, one bit per
        # possible "second" value.
        row = rules_matrix[first]
        packed = 0
        for i, second in enumerate(all_values):
            if row[second] == 'no_boundary':
                packed |= 1 << i
        packed_rows += [ packed ]
    return packed_rows
def get_grapheme_cluster_break_tests_as_UTF8(grapheme_break_test_file_name):
    """Parse GraphemeBreakTest.txt into UTF-8-based test cases.

    Returns a list of (test, boundaries) tuples, where 'test' is a string of
    C-style '\\xNN' escapes spelling out the UTF-8 encoding of the test's
    code points, and 'boundaries' lists the byte offsets at which extended
    grapheme cluster boundaries occur.
    """

    def _convert_line(line):
        # Strip comments.
        line = re.sub('#.*', '', line).strip()
        if line == "":
            # Blank or comment-only line: no test here.
            return None

        escaped_test = ""
        byte_count = 0
        break_offsets = []

        # A test line is a space-separated alternation of boundary markers
        # and code points in hex.
        for token in line.split(" "):
            if token == u"÷":
                # A grapheme cluster boundary at the current byte offset.
                break_offsets += [ byte_count ]
            elif token == u"×":
                # Explicit "no boundary" marker -- nothing to record.
                continue
            else:
                code_point = int(token, 16)

                # Tests from Unicode spec have isolated surrogates in them. Our
                # segmentation algorithm works on UTF-8 sequences, so encoding a
                # surrogate would produce an invalid code unit sequence. Instead
                # of trying to emulate the maximal subpart algorithm for inserting
                # U+FFFD in Python, we just replace every isolated surrogate with
                # U+200B, which also has Grapheme_Cluster_Break equal to 'Control'
                # and test separately that we handle ill-formed UTF-8 sequences.
                if 0xd800 <= code_point <= 0xdfff:
                    code_point = 0x200b

                # Build the character by decoding a '\Uxxxxxxxx' escape --
                # presumably to sidestep chr()/unichr() limits on narrow
                # Python 2 builds (TODO confirm).
                character = (b'\U%(cp)08x' % { b'cp': code_point }).decode('unicode_escape', 'strict')
                utf8_bytes = bytearray(character.encode('utf8', 'strict'))
                escaped_test += ''.join(
                    ['\\x%(byte)02x' % { 'byte': byte }
                     for byte in utf8_bytes])
                byte_count += len(utf8_bytes)

        return (escaped_test, break_offsets)

    # Self-test.
    assert(_convert_line(u'÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [ 0, 5, 8 ]))
    assert(_convert_line(u'÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [ 0, 3 ]))

    tests = []
    with codecs.open(grapheme_break_test_file_name, encoding=sys.getfilesystemencoding(), errors='strict') as f:
        for line in f:
            converted = _convert_line(line)
            if converted:
                tests += [ converted ]
    return tests
def get_grapheme_cluster_break_tests_as_unicode_scalars(grapheme_break_test_file_name):
    """Parse GraphemeBreakTest.txt into Unicode-scalar-based test cases.

    Returns a list of (test, boundaries) tuples, where 'test' is a list of
    code points and 'boundaries' lists the code point indexes at which
    extended grapheme cluster boundaries occur.
    """
    def _convert_line(line):
        # Strip comments.
        line = re.sub('#.*', '', line).strip()
        if line == "":
            return None
        test = []
        curr_code_points = 0
        boundaries = []
        # Match a list of code points.
        for token in line.split(" "):
            if token == u"÷":
                boundaries += [ curr_code_points ]
            elif token == u"×":
                pass
            else:
                code_point = int(token, 16)
                # Tests from Unicode spec have isolated surrogates in them. Our
                # segmentation algorithm works on UTF-16 sequences, so encoding a
                # surrogate would produce an invalid code unit sequence. Instead
                # of trying to emulate the maximal subpart algorithm for inserting
                # U+FFFD in Python, we just replace every isolated surrogate with
                # U+200B, which also has Grapheme_Cluster_Break equal to 'Control'
                # and test separately that we handle ill-formed UTF-8 sequences.
                if code_point >= 0xd800 and code_point <= 0xdfff:
                    code_point = 0x200b
                test += [ code_point ]
                curr_code_points += 1
        return (test, boundaries)

    # Self-test.
    assert(_convert_line(u'÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ([ 0x0903, 0x0308, 0xac01 ], [ 0, 2, 3 ]))
    assert(_convert_line(u'÷ D800 ÷ # abc') == ([ 0x200b ], [ 0, 1 ]))

    result = []
    # Decode the file to text, consistently with
    # 'get_grapheme_cluster_break_tests_as_UTF8'.  The file was previously
    # opened in binary mode ('rb'), which made every line a bytes object:
    # on Python 3, 're.sub' with a str pattern then raises TypeError, and
    # the byte tokens could never compare equal to the str markers above.
    with codecs.open(grapheme_break_test_file_name, encoding=sys.getfilesystemencoding(), errors='strict') as f:
        for line in f:
            test = _convert_line(line)
            if test:
                result += [ test ]
    return result