third_party/rust_crates/vendor/idna/src/make_uts46_mapping_table.py - fuchsia - Git at Google

 # Copyright 2013-2014 The rust-url developers.
 #
 # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
 # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
 # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
 # option. This file may not be copied, modified, or distributed
 # except according to those terms.

 # Run as: python make_uts46_mapping_table.py IdnaMappingTable.txt > uts46_mapping_table.rs
 # You can get the latest idna table from
 # http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt

 from __future__ import print_function
 import collections
 import itertools

 print('''\
 // Copyright 2013-2014 The rust-url developers.
 //
 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.

 // Generated by make_idna_table.py
 ''')

 txt = open("IdnaMappingTable.txt")

 def escape_char(c):
     return "\\u{%x}" % ord(c[0])

 def char(s):
     return unichr(int(s, 16))

 strtab = collections.OrderedDict()
 strtab_offset = 0

 def strtab_slice(s):
     global strtab, strtab_offset

     if s in strtab:
         return strtab[s]
     else:
         utf8_len = len(s.encode('utf8'))
         c = (strtab_offset, utf8_len)
         strtab[s] = c
         strtab_offset += utf8_len
         return c

 def rust_slice(s):
     start = s[0]
     length = s[1]
     start_lo = start & 0xff
     start_hi = start >> 8
     assert length <= 255
     assert start_hi <= 255
     return "(StringTableSlice { byte_start_lo: %d, byte_start_hi: %d, byte_len: %d })" % (start_lo, start_hi, length)

 ranges = []

 for line in txt:
     # remove comments
     line, _, _ = line.partition('#')
     # skip empty lines
     if len(line.strip()) == 0:
         continue
     fields = line.split(';')
     if fields[0].strip() == 'D800..DFFF':
         continue  # Surrogates don't occur in Rust strings.
     first, _, last = fields[0].strip().partition('..')
     if not last:
         last = first
     mapping = fields[1].strip().replace('_', ' ').title().replace(' ', '')
     unicode_str = None
     if len(fields) > 2:
         if fields[2].strip():
             unicode_str = u''.join(char(c) for c in fields[2].strip().split(' '))
         elif mapping == "Deviation":
             unicode_str = u''
     ranges.append((first, last, mapping, unicode_str))

 def mergeable_key(r):
     mapping = r[2]

     # These types have associated data, so we should not merge them.
     if mapping in ('Mapped', 'Deviation', 'DisallowedStd3Mapped'):
         return r
     assert mapping in ('Valid', 'Ignored', 'Disallowed', 'DisallowedStd3Valid')
     return mapping

 grouped_ranges = itertools.groupby(ranges, key=mergeable_key)

 optimized_ranges = []

 for (k, g) in grouped_ranges:
     group = list(g)
     if len(group) == 1:
         optimized_ranges.append(group[0])
         continue
     # Assert that nothing in the group has an associated unicode string.
     for g in group:
         if g[3] is not None and len(g[3]) > 2:
             assert not g[3][2].strip()
     # Assert that consecutive members of the group don't leave gaps in
     # the codepoint space.
     a, b = itertools.tee(group)
     next(b, None)
     for (g1, g2) in itertools.izip(a, b):
         last_char = int(g1[1], 16)
         next_char = int(g2[0], 16)
         if last_char + 1 == next_char:
             continue
         # There's a gap where surrogates would appear, but we don't have to
         # worry about that gap, as surrogates never appear in Rust strings.
         # Assert we're seeing the surrogate case here.
         assert last_char == 0xd7ff
         assert next_char == 0xe000
     first = group[0][0]
     last = group[-1][1]
     mapping = group[0][2]
     unicode_str = group[0][3]
     optimized_ranges.append((first, last, mapping, unicode_str))

 def is_single_char_range(r):
     (first, last, _, _) = r
     return first == last

 # We can reduce the size of the character range table and the index table to about 1/4
 # by merging runs of single character ranges and using character offsets from the start
 # of that range to retrieve the correct `Mapping` value
 def merge_single_char_ranges(ranges):
     current = []
     for r in ranges:
         if not current or is_single_char_range(current[-1]) and is_single_char_range(r):
             current.append(r)
             continue
         if len(current) != 0:
             ret = current
             current = [r]
             yield ret
             continue
         current.append(r)
         ret = current
         current = []
         yield ret
     yield current

 optimized_ranges = list(merge_single_char_ranges(optimized_ranges))


 print("static TABLE: &'static [Range] = &[")

 for ranges in optimized_ranges:
     first = ranges[0][0]
     last = ranges[-1][1]
     print("    Range { from: '%s', to: '%s', }," % (escape_char(char(first)),
                                                             escape_char(char(last))))

 print("];\n")

 print("static INDEX_TABLE: &'static [u16] = &[")

 SINGLE_MARKER = 1 << 15

 offset = 0
 for ranges in optimized_ranges:
     assert offset < SINGLE_MARKER

     block_len = len(ranges)
     single = SINGLE_MARKER if block_len == 1 else 0
     print("    %s," % (offset | single))
     offset += block_len

 print("];\n")

 print("static MAPPING_TABLE: &'static [Mapping] = &[")

 for ranges in optimized_ranges:
     for (first, last, mapping, unicode_str) in ranges:
         if unicode_str is not None:
             mapping += rust_slice(strtab_slice(unicode_str))
         print("    %s," % mapping)

 print("];\n")

 def escape_str(s):
     return [escape_char(c) for c in s]

 print("static STRING_TABLE: &'static str = \"%s\";"
       % '\\\n  '.join(itertools.chain(*[escape_str(s) for s in strtab.iterkeys()])))
	# Copyright 2013-2014 The rust-url developers.
	#
	# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
	# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
	# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
	# option. This file may not be copied, modified, or distributed
	# except according to those terms.

	# Run as: python make_uts46_mapping_table.py IdnaMappingTable.txt > uts46_mapping_table.rs
	# You can get the latest idna table from
	# http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt

	from __future__ import print_function
	import collections
	import itertools

	print('''\
	// Copyright 2013-2014 The rust-url developers.
	//
	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
	// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
	// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
	// option. This file may not be copied, modified, or distributed
	// except according to those terms.

	// Generated by make_idna_table.py
	''')

	txt = open("IdnaMappingTable.txt")

	def escape_char(c):
	return "\\u{%x}" % ord(c[0])

	def char(s):
	return unichr(int(s, 16))

	strtab = collections.OrderedDict()
	strtab_offset = 0

	def strtab_slice(s):
	global strtab, strtab_offset

	if s in strtab:
	return strtab[s]
	else:
	utf8_len = len(s.encode('utf8'))
	c = (strtab_offset, utf8_len)
	strtab[s] = c
	strtab_offset += utf8_len
	return c

	def rust_slice(s):
	start = s[0]
	length = s[1]
	start_lo = start & 0xff
	start_hi = start >> 8
	assert length <= 255
	assert start_hi <= 255
	return "(StringTableSlice { byte_start_lo: %d, byte_start_hi: %d, byte_len: %d })" % (start_lo, start_hi, length)

	ranges = []

	for line in txt:
	# remove comments
	line, _, _ = line.partition('#')
	# skip empty lines
	if len(line.strip()) == 0:
	continue
	fields = line.split(';')
	if fields[0].strip() == 'D800..DFFF':
	continue # Surrogates don't occur in Rust strings.
	first, _, last = fields[0].strip().partition('..')
	if not last:
	last = first
	mapping = fields[1].strip().replace('_', ' ').title().replace(' ', '')
	unicode_str = None
	if len(fields) > 2:
	if fields[2].strip():
	unicode_str = u''.join(char(c) for c in fields[2].strip().split(' '))
	elif mapping == "Deviation":
	unicode_str = u''
	ranges.append((first, last, mapping, unicode_str))

	def mergeable_key(r):
	mapping = r[2]

	# These types have associated data, so we should not merge them.
	if mapping in ('Mapped', 'Deviation', 'DisallowedStd3Mapped'):
	return r
	assert mapping in ('Valid', 'Ignored', 'Disallowed', 'DisallowedStd3Valid')
	return mapping

	grouped_ranges = itertools.groupby(ranges, key=mergeable_key)

	optimized_ranges = []

	for (k, g) in grouped_ranges:
	group = list(g)
	if len(group) == 1:
	optimized_ranges.append(group[0])
	continue
	# Assert that nothing in the group has an associated unicode string.
	for g in group:
	if g[3] is not None and len(g[3]) > 2:
	assert not g[3][2].strip()
	# Assert that consecutive members of the group don't leave gaps in
	# the codepoint space.
	a, b = itertools.tee(group)
	next(b, None)
	for (g1, g2) in itertools.izip(a, b):
	last_char = int(g1[1], 16)
	next_char = int(g2[0], 16)
	if last_char + 1 == next_char:
	continue
	# There's a gap where surrogates would appear, but we don't have to
	# worry about that gap, as surrogates never appear in Rust strings.
	# Assert we're seeing the surrogate case here.
	assert last_char == 0xd7ff
	assert next_char == 0xe000
	first = group[0][0]
	last = group[-1][1]
	mapping = group[0][2]
	unicode_str = group[0][3]
	optimized_ranges.append((first, last, mapping, unicode_str))

	def is_single_char_range(r):
	(first, last, _, _) = r
	return first == last

	# We can reduce the size of the character range table and the index table to about 1/4
	# by merging runs of single character ranges and using character offsets from the start
	# of that range to retrieve the correct `Mapping` value
	def merge_single_char_ranges(ranges):
	current = []
	for r in ranges:
	if not current or is_single_char_range(current[-1]) and is_single_char_range(r):
	current.append(r)
	continue
	if len(current) != 0:
	ret = current
	current = [r]
	yield ret
	continue
	current.append(r)
	ret = current
	current = []
	yield ret
	yield current

	optimized_ranges = list(merge_single_char_ranges(optimized_ranges))


	print("static TABLE: &'static [Range] = &[")

	for ranges in optimized_ranges:
	first = ranges[0][0]
	last = ranges[-1][1]
	print(" Range { from: '%s', to: '%s', }," % (escape_char(char(first)),
	escape_char(char(last))))

	print("];\n")

	print("static INDEX_TABLE: &'static [u16] = &[")

	SINGLE_MARKER = 1 << 15

	offset = 0
	for ranges in optimized_ranges:
	assert offset < SINGLE_MARKER

	block_len = len(ranges)
	single = SINGLE_MARKER if block_len == 1 else 0
	print(" %s," % (offset \| single))
	offset += block_len

	print("];\n")

	print("static MAPPING_TABLE: &'static [Mapping] = &[")

	for ranges in optimized_ranges:
	for (first, last, mapping, unicode_str) in ranges:
	if unicode_str is not None:
	mapping += rust_slice(strtab_slice(unicode_str))
	print(" %s," % mapping)

	print("];\n")

	def escape_str(s):
	return [escape_char(c) for c in s]

	print("static STRING_TABLE: &'static str = \"%s\";"
	% '\\\n '.join(itertools.chain(*[escape_str(s) for s in strtab.iterkeys()])))