src/etc/unicode.py - third_party/rust - Git at Google

 #!/usr/bin/env python

 # This digests UnicodeData.txt and DerivedCoreProperties.txt and emits rust
 # code covering the core properties. Since this is a pretty rare event we
 # just store this out-of-line and check the unicode.rs file into git.
 #
 # The emitted code is "the minimum we think is necessary for libcore", that
 # is, to support basic operations of the compiler and "most nontrivial rust
 # programs". It is not meant to be a complete implementation of unicode.
 # For that we recommend you use a proper binding to libicu.

 import fileinput, re, os, sys


 def fetch(f):
     """Make sure the data file *f* exists in the current directory.

     If *f* is missing, ``curl`` is invoked to download it from the UNIDATA
     directory on www.unicode.org.  If the file is still missing afterwards,
     an error is written to stderr and the process exits with status 1.
     """
     if not os.path.exists(f):
         # The result of the download is judged only by the existence
         # check below, not by curl's exit status.
         os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s"
                   % f)

     if not os.path.exists(f):
         # End the message with a newline, and use sys.exit rather than the
         # ``exit`` builtin, which is injected by the ``site`` module and is
         # absent when the interpreter runs without it (e.g. ``python -S``).
         sys.stderr.write("cannot load %s\n" % f)
         sys.exit(1)


 def load_unicode_data(f):
     """Parse the UnicodeData file *f* into decompositions and category runs.

     *f* is fetched first if it is not present locally.  Lines that do not
     split into exactly 15 ``;``-separated fields are skipped.

     Returns a tuple ``(canon_decomp, compat_decomp, gencats)``:

     * ``canon_decomp`` / ``compat_decomp`` map a code point to the list of
       code points in its decomposition field.  A field starting with ``<``
       goes to ``compat_decomp`` with its first token dropped; every other
       non-empty field goes to ``canon_decomp``.
     * ``gencats`` maps a general-category name to a list of inclusive
       ``(lo, hi)`` code point runs, in input order.
     """
     fetch(f)
     gencats = {}
     canon_decomp = {}
     compat_decomp = {}
     curr_cat = ""
     c_lo = 0
     c_hi = 0
     for line in fileinput.input(f):
         fields = line.split(";")
         if len(fields) != 15:
             continue
         # Only code, gencat and decomp are used; the full unpack documents
         # the field layout of the line.
         [code, name, gencat, combine, bidi,
          decomp, deci, digit, num, mirror,
          old, iso, upcase, lowcase, titlecase] = fields

         code = int(code, 16)

         if decomp != "":
             tokens = decomp.split()
             if decomp.startswith('<'):
                 # tokens[0] is the "<...>" tag; the rest are code points.
                 compat_decomp[code] = [int(t, 16) for t in tokens[1:]]
             else:
                 canon_decomp[code] = [int(t, 16) for t in tokens]

         if curr_cat == "":
             # First usable line: open the initial run.
             curr_cat = gencat
             c_lo = code
             c_hi = code

         if curr_cat == gencat:
             # The run is extended on category equality alone, so code
             # points absent from the input between c_hi and code end up
             # inside the run.
             # NOTE(review): presumably relied upon for the
             # "<..., First>" / "<..., Last>" line pairs -- confirm.
             c_hi = code
         else:
             gencats.setdefault(curr_cat, []).append((c_lo, c_hi))
             curr_cat = gencat
             c_lo = code
             c_hi = code

     # Flush the run that was still open when the input ended; without this
     # the final category run of the file is silently lost.
     if curr_cat != "":
         gencats.setdefault(curr_cat, []).append((c_lo, c_hi))

     return (canon_decomp, compat_decomp, gencats)


 def load_derived_core_properties(f):
     """Parse the DerivedCoreProperties file *f* into per-property ranges.

     *f* is fetched first if it is not present locally.  Two line shapes are
     recognised: ``XXXX ; Prop`` (single code point) and
     ``XXXX..YYYY ; Prop`` (range); every other line is skipped.  Only the
     properties XID_Start, XID_Continue and Alphabetic are kept.

     Returns a dict mapping the property name to a list of inclusive
     ``(lo, hi)`` code point pairs, in input order.
     """
     fetch(f)
     derivedprops = {}
     interestingprops = ["XID_Start", "XID_Continue", "Alphabetic"]
     # Raw strings: "\w" and "\." are not valid escapes in ordinary string
     # literals, and newer Python 3 releases warn about them.
     re1 = re.compile(r"^([0-9A-F]+) +; (\w+)")
     re2 = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+) +; (\w+)")

     for line in fileinput.input(f):
         m = re1.match(line)
         if m:
             # Single code point: both ends of the range are the same.
             d_lo = d_hi = m.group(1)
             prop = m.group(2)
         else:
             m = re2.match(line)
             if not m:
                 continue
             d_lo, d_hi, prop = m.groups()
         if prop not in interestingprops:
             continue
         bounds = (int(d_lo, 16), int(d_hi, 16))
         derivedprops.setdefault(prop, []).append(bounds)
     return derivedprops

 def escape_char(c):
     """Return the quoted char-literal text for code point *c*.

     The narrowest hexadecimal escape that fits is chosen: ``\\xNN`` up to
     0xff, ``\\uNNNN`` up to 0xffff, and ``\\UNNNNNNNN`` above that.
     """
     if c > 0xffff:
         template = "'\\U%8.8x'"
     elif c > 0xff:
         template = "'\\u%4.4x'"
     else:
         template = "'\\x%2.2x'"
     return template % c

 def emit_property_module(f, mod, tbl):
     """Write a module named *mod* of character predicates to *f*.

     *tbl* maps a property/category name to a list of inclusive
     ``(lo, hi)`` code point pairs.  For each name, in sorted order, one
     ``pure fn <name>(c: char) -> bool`` is written whose body is an ``alt``
     over the pairs: a pair with equal ends becomes a single character
     pattern, any other pair becomes ``lo to hi``.  *f* only needs a
     ``write`` method.
     """
     f.write("mod %s {\n" % mod)
     # sorted() works on Python 2 and 3 alike; calling .sort() on the result
     # of dict.keys() fails on Python 3, where keys() is a view.
     for cat in sorted(tbl):
         f.write("    pure fn %s(c: char) -> bool {\n" % cat)
         f.write("        ret alt c {\n")
         # The first pattern gets a blank in the separator column, every
         # later one a '|' so the patterns form a single alternative list.
         prefix = ' '
         for lo, hi in tbl[cat]:
             if lo == hi:
                 f.write("            %c %s\n" %
                         (prefix, escape_char(lo)))
             else:
                 f.write("            %c %s to %s\n" %
                         (prefix,
                          escape_char(lo),
                          escape_char(hi)))
             prefix = '|'
         f.write("              { true }\n")
         f.write("            _ { false }\n")
         f.write("        };\n")
         f.write("    }\n\n")
     f.write("}\n")

 def emit_decomp_module(f, canon, compat):
     """Write the ``decompose`` module to *f*.

     *canon* and *compat* map a code point to the list of code points it
     decomposes into.  The module exports ``canonical`` and
     ``compatibility``, both thin wrappers around a shared function ``d``
     that first matches the canonical table and then, only when its ``k``
     flag is true, the compatibility table.  *f* only needs a ``write``
     method.

     NOTE(review): the emitted match arms contain only ``d(...)`` calls and
     no ``ret``, so the generated ``d`` appears to go on to call ``i(c)``
     after handling a decomposition -- confirm that this is intended.
     """
     f.write("mod decompose {\n\n")
     f.write("    export canonical, compatibility;\n\n")
     f.write("    fn canonical(c: char, i: block(char)) "
             + "{ d(c, i, false); }\n\n")
     f.write("    fn compatibility(c: char, i: block(char)) "
             + "{ d(c, i, true); }\n\n")
     f.write("    fn d(c: char, i: block(char), k: bool) {\n")

     f.write("        if c <= '\\x7f' { i(c); ret; }\n")

     # First check the canonical decompositions
     f.write("        // Canonical decomposition\n")
     _emit_decomp_alt(f, canon)

     # Bottom out if we're not doing compat.
     f.write("        if !k { i(c); ret; }\n\n ")

     # Then check the compatibility decompositions
     f.write("        // Compatibility decomposition\n")
     _emit_decomp_alt(f, compat)

     # Finally bottom out.
     f.write("        i(c);\n")
     f.write("    }\n")
     f.write("}\n\n")


 def _emit_decomp_alt(f, table):
     """Write one ``alt c { ... }`` block with an arm per key of *table*."""
     f.write("        alt c {\n")
     # sorted() instead of keys() + .sort(): the latter fails on Python 3.
     for char in sorted(table):
         f.write("          %s {\n" % escape_char(char))
         for d in table[char]:
             f.write("            d(%s, i, k);\n"
                     % escape_char(d))
         f.write("          }\n")

     f.write("          _ { }\n")
     f.write("        }\n\n")

 # Regenerate unicode.rs: drop any previous output, then write the
 # decomposition module followed by the two property modules.
 r = "unicode.rs"
 if os.path.exists(r):
     os.remove(r)

 # The ``with`` block guarantees the generated file is flushed and closed,
 # even if one of the loaders or emitters raises.
 with open(r, "w") as rf:
     (canon_decomp, compat_decomp, gencats) = load_unicode_data("UnicodeData.txt")
     emit_decomp_module(rf, canon_decomp, compat_decomp)
     emit_property_module(rf, "general_category", gencats)

     derived = load_derived_core_properties("DerivedCoreProperties.txt")
     emit_property_module(rf, "derived_property", derived)
	#!/usr/bin/env python

	# This digests UnicodeData.txt and DerivedCoreProperties.txt and emits rust
	# code covering the core properties. Since this is a pretty rare event we
	# just store this out-of-line and check the unicode.rs file into git.
	#
	# The emitted code is "the minimum we think is necessary for libcore", that
	# is, to support basic operations of the compiler and "most nontrivial rust
	# programs". It is not meant to be a complete implementation of unicode.
	# For that we recommend you use a proper binding to libicu.

	import fileinput, re, os, sys


	def fetch(f):
	    """Make sure the data file *f* exists in the current directory.

	    If *f* is missing, ``curl`` is invoked to download it from the UNIDATA
	    directory on www.unicode.org.  If the file is still missing afterwards,
	    an error is written to stderr and the process exits with status 1.
	    """
	    if not os.path.exists(f):
	        # The result of the download is judged only by the existence
	        # check below, not by curl's exit status.
	        os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s"
	                  % f)

	    if not os.path.exists(f):
	        # End the message with a newline, and use sys.exit rather than the
	        # ``exit`` builtin, which is injected by the ``site`` module and is
	        # absent when the interpreter runs without it (e.g. ``python -S``).
	        sys.stderr.write("cannot load %s\n" % f)
	        sys.exit(1)


	def load_unicode_data(f):
	    """Parse the UnicodeData file *f* into decompositions and category runs.

	    *f* is fetched first if it is not present locally.  Lines that do not
	    split into exactly 15 ``;``-separated fields are skipped.

	    Returns a tuple ``(canon_decomp, compat_decomp, gencats)``:

	    * ``canon_decomp`` / ``compat_decomp`` map a code point to the list of
	      code points in its decomposition field.  A field starting with ``<``
	      goes to ``compat_decomp`` with its first token dropped; every other
	      non-empty field goes to ``canon_decomp``.
	    * ``gencats`` maps a general-category name to a list of inclusive
	      ``(lo, hi)`` code point runs, in input order.
	    """
	    fetch(f)
	    gencats = {}
	    canon_decomp = {}
	    compat_decomp = {}
	    curr_cat = ""
	    c_lo = 0
	    c_hi = 0
	    for line in fileinput.input(f):
	        fields = line.split(";")
	        if len(fields) != 15:
	            continue
	        # Only code, gencat and decomp are used; the full unpack documents
	        # the field layout of the line.
	        [code, name, gencat, combine, bidi,
	         decomp, deci, digit, num, mirror,
	         old, iso, upcase, lowcase, titlecase] = fields

	        code = int(code, 16)

	        if decomp != "":
	            tokens = decomp.split()
	            if decomp.startswith('<'):
	                # tokens[0] is the "<...>" tag; the rest are code points.
	                compat_decomp[code] = [int(t, 16) for t in tokens[1:]]
	            else:
	                canon_decomp[code] = [int(t, 16) for t in tokens]

	        if curr_cat == "":
	            # First usable line: open the initial run.
	            curr_cat = gencat
	            c_lo = code
	            c_hi = code

	        if curr_cat == gencat:
	            # The run is extended on category equality alone, so code
	            # points absent from the input between c_hi and code end up
	            # inside the run.
	            # NOTE(review): presumably relied upon for the
	            # "<..., First>" / "<..., Last>" line pairs -- confirm.
	            c_hi = code
	        else:
	            gencats.setdefault(curr_cat, []).append((c_lo, c_hi))
	            curr_cat = gencat
	            c_lo = code
	            c_hi = code

	    # Flush the run that was still open when the input ended; without this
	    # the final category run of the file is silently lost.
	    if curr_cat != "":
	        gencats.setdefault(curr_cat, []).append((c_lo, c_hi))

	    return (canon_decomp, compat_decomp, gencats)


	def load_derived_core_properties(f):
	    """Parse the DerivedCoreProperties file *f* into per-property ranges.

	    *f* is fetched first if it is not present locally.  Two line shapes are
	    recognised: ``XXXX ; Prop`` (single code point) and
	    ``XXXX..YYYY ; Prop`` (range); every other line is skipped.  Only the
	    properties XID_Start, XID_Continue and Alphabetic are kept.

	    Returns a dict mapping the property name to a list of inclusive
	    ``(lo, hi)`` code point pairs, in input order.
	    """
	    fetch(f)
	    derivedprops = {}
	    interestingprops = ["XID_Start", "XID_Continue", "Alphabetic"]
	    # Raw strings: "\w" and "\." are not valid escapes in ordinary string
	    # literals, and newer Python 3 releases warn about them.
	    re1 = re.compile(r"^([0-9A-F]+) +; (\w+)")
	    re2 = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+) +; (\w+)")

	    for line in fileinput.input(f):
	        m = re1.match(line)
	        if m:
	            # Single code point: both ends of the range are the same.
	            d_lo = d_hi = m.group(1)
	            prop = m.group(2)
	        else:
	            m = re2.match(line)
	            if not m:
	                continue
	            d_lo, d_hi, prop = m.groups()
	        if prop not in interestingprops:
	            continue
	        bounds = (int(d_lo, 16), int(d_hi, 16))
	        derivedprops.setdefault(prop, []).append(bounds)
	    return derivedprops

	def escape_char(c):
	    """Return the quoted char-literal text for code point *c*.

	    The narrowest hexadecimal escape that fits is chosen: ``\\xNN`` up to
	    0xff, ``\\uNNNN`` up to 0xffff, and ``\\UNNNNNNNN`` above that.
	    """
	    if c <= 0xff:
	        return "'\\x%2.2x'" % c
	    if c <= 0xffff:
	        return "'\\u%4.4x'" % c
	    return "'\\U%8.8x'" % c

	def emit_property_module(f, mod, tbl):
	    """Write a module named *mod* of character predicates to *f*.

	    *tbl* maps a property/category name to a list of inclusive
	    ``(lo, hi)`` code point pairs.  For each name, in sorted order, one
	    ``pure fn <name>(c: char) -> bool`` is written whose body is an ``alt``
	    over the pairs: a pair with equal ends becomes a single character
	    pattern, any other pair becomes ``lo to hi``.  *f* only needs a
	    ``write`` method.
	    """
	    f.write("mod %s {\n" % mod)
	    # sorted() works on Python 2 and 3 alike; calling .sort() on the result
	    # of dict.keys() fails on Python 3, where keys() is a view.
	    for cat in sorted(tbl):
	        f.write(" pure fn %s(c: char) -> bool {\n" % cat)
	        f.write(" ret alt c {\n")
	        # The first pattern gets a blank in the separator column, every
	        # later one a '|' so the patterns form a single alternative list.
	        prefix = ' '
	        for lo, hi in tbl[cat]:
	            if lo == hi:
	                f.write(" %c %s\n" %
	                        (prefix, escape_char(lo)))
	            else:
	                f.write(" %c %s to %s\n" %
	                        (prefix,
	                         escape_char(lo),
	                         escape_char(hi)))
	            # Must be exactly one character: "%c" rejects longer strings,
	            # so a backslash-escaped pipe would raise TypeError.
	            prefix = '|'
	        f.write(" { true }\n")
	        f.write(" _ { false }\n")
	        f.write(" };\n")
	        f.write(" }\n\n")
	    f.write("}\n")

	def emit_decomp_module(f, canon, compat):
	    """Write the ``decompose`` module to *f*.

	    *canon* and *compat* map a code point to the list of code points it
	    decomposes into.  The module exports ``canonical`` and
	    ``compatibility``, both thin wrappers around a shared function ``d``
	    that first matches the canonical table and then, only when its ``k``
	    flag is true, the compatibility table.  *f* only needs a ``write``
	    method.

	    NOTE(review): the emitted match arms contain only ``d(...)`` calls and
	    no ``ret``, so the generated ``d`` appears to go on to call ``i(c)``
	    after handling a decomposition -- confirm that this is intended.
	    """
	    f.write("mod decompose {\n\n")
	    f.write(" export canonical, compatibility;\n\n")
	    f.write(" fn canonical(c: char, i: block(char)) "
	            + "{ d(c, i, false); }\n\n")
	    f.write(" fn compatibility(c: char, i: block(char)) "
	            + "{ d(c, i, true); }\n\n")
	    f.write(" fn d(c: char, i: block(char), k: bool) {\n")

	    f.write(" if c <= '\\x7f' { i(c); ret; }\n")

	    # First check the canonical decompositions
	    f.write(" // Canonical decomposition\n")
	    _emit_decomp_alt(f, canon)

	    # Bottom out if we're not doing compat.
	    f.write(" if !k { i(c); ret; }\n\n ")

	    # Then check the compatibility decompositions
	    f.write(" // Compatibility decomposition\n")
	    _emit_decomp_alt(f, compat)

	    # Finally bottom out.
	    f.write(" i(c);\n")
	    f.write(" }\n")
	    f.write("}\n\n")


	def _emit_decomp_alt(f, table):
	    """Write one ``alt c { ... }`` block with an arm per key of *table*."""
	    f.write(" alt c {\n")
	    # sorted() instead of keys() + .sort(): the latter fails on Python 3.
	    for char in sorted(table):
	        f.write(" %s {\n" % escape_char(char))
	        for d in table[char]:
	            f.write(" d(%s, i, k);\n"
	                    % escape_char(d))
	        f.write(" }\n")

	    f.write(" _ { }\n")
	    f.write(" }\n\n")

	# Regenerate unicode.rs: drop any previous output, then write the
	# decomposition module followed by the two property modules.
	r = "unicode.rs"
	if os.path.exists(r):
	    os.remove(r)

	# The ``with`` block guarantees the generated file is flushed and closed,
	# even if one of the loaders or emitters raises.
	with open(r, "w") as rf:
	    (canon_decomp, compat_decomp, gencats) = load_unicode_data("UnicodeData.txt")
	    emit_decomp_module(rf, canon_decomp, compat_decomp)
	    emit_property_module(rf, "general_category", gencats)

	    derived = load_derived_core_properties("DerivedCoreProperties.txt")
	    emit_property_module(rf, "derived_property", derived)