src/libcore/unicode/printable.py - third_party/rust - Git at Google

 #!/usr/bin/env python

 # This script uses the following Unicode tables:
 # - UnicodeData.txt


 from collections import namedtuple
 import csv
 import os
 import subprocess

 # Exclusive upper bound of the Unicode code space (the last code point is
 # U+10FFFF); get_codepoints() pads its output up to this value.
 NUM_CODEPOINTS=0x110000

 def to_ranges(iter):
     """Collapse a stream of integers into half-open ``(start, end)`` ranges.

     Each run of consecutive values becomes one tuple whose ``end`` is
     exclusive.  A run is always cut at 0x10000 and at 0x20000, so no yielded
     range extends across either of those values.
     """
     start = end = None
     for value in iter:
         extends_run = (
             start is not None
             and value == end
             and value not in (0x10000, 0x20000)
         )
         if extends_run:
             end += 1
             continue
         # Flush the finished run (if any) before opening a new one.
         if start is not None:
             yield (start, end)
         start, end = value, value + 1
     if start is not None:
         yield (start, end)

 def get_escaped(codepoints):
     """Yield the ``value`` of every code point that should be escaped.

     ``codepoints`` is an iterable of objects with ``value`` and ``class_``
     attributes.  A code point is selected when its class is one of
     Cc, Cf, Cs, Co, Cn, Zl, Zp or Zs; a missing or empty class counts as Cn.
     The ASCII space is never selected even though its class is Zs.
     """
     escaped_classes = ("Cc", "Cf", "Cs", "Co", "Cn", "Zl", "Zp", "Zs")
     space = ord(' ')
     for cp in codepoints:
         category = cp.class_ or "Cn"
         if category in escaped_classes and cp.value != space:
             yield cp.value

 def get_file(f):
     """Open the local copy of the file named by URL ``f``, fetching it if absent.

     The file is looked up in the current working directory under the last
     path component of ``f``.  If it does not exist, ``curl -O`` is run on the
     URL and the same local name is opened afterwards.

     Returns an open text file object; the caller is responsible for closing
     it.  Raises ``subprocess.CalledProcessError`` if curl exits with a
     non-zero status.
     """
     local_name = os.path.basename(f)

     def open_local():
         # Explicit encoding so decoding does not depend on the locale, and
         # newline="" because the csv module asks for it on files it parses.
         return open(local_name, encoding="utf-8", newline="")

     try:
         return open_local()
     except FileNotFoundError:
         subprocess.run(["curl", "-O", f], check=True)
         return open_local()

 # One code point: `value` is its integer value, `class_` is the third field of
 # its UnicodeData.txt row (None for code points that get_codepoints() fills
 # in outside any row or First/Last range).
 Codepoint = namedtuple('Codepoint', 'value class_')

 def get_codepoints(f):
     """Yield a ``Codepoint`` for every value from 0 up to ``NUM_CODEPOINTS``.

     ``f`` is an iterable of lines in UnicodeData.txt layout: semicolon
     separated fields, of which the first three are used (hexadecimal code
     point, name, class).

     Code points lying between a row whose name ends in ``First>`` and the
     following ``Last>`` row are yielded with the class of the First row.
     Every other code point that has no row of its own is yielded with
     ``class_`` set to ``None``.

     Raises ``ValueError`` if a First row is not directly followed by a Last
     row.
     """
     reader = csv.reader(f, delimiter=";")
     # Start one below zero so that code point 0 is still yielded (as a gap)
     # when the input is empty or does not begin with the row for 0000.
     prev_codepoint = -1
     class_first = None
     for row in reader:
         if not row:
             # csv.reader produces an empty list for a blank line.
             continue
         codepoint = int(row[0], 16)
         name = row[1]
         class_ = row[2]

         if class_first is not None and not name.endswith("Last>"):
             raise ValueError("Missing Last after First")

         # Fill the gap since the previous row; inside a First/Last pair the
         # gap inherits the First row's class, otherwise class_first is None.
         for c in range(prev_codepoint + 1, codepoint):
             yield Codepoint(c, class_first)

         class_first = class_ if name.endswith("First>") else None

         yield Codepoint(codepoint, class_)
         prev_codepoint = codepoint

     if class_first is not None:
         raise ValueError("Missing Last after First")

     # Pad the tail of the code space with class-less entries.
     for c in range(prev_codepoint + 1, NUM_CODEPOINTS):
         yield Codepoint(c, None)

 def compress_singletons(singletons):
     """Split values into a run-length table of high parts plus a list of low bytes.

     Returns ``(uppers, lowers)``.  ``lowers`` holds ``value & 0xff`` for every
     input value, in input order.  ``uppers`` holds ``(upper, count)`` pairs
     where ``upper`` is ``value >> 8`` and ``count`` is how many consecutive
     entries of ``lowers`` share that upper part.
     """
     uppers = []
     lowers = []
     for value in singletons:
         high, low = value >> 8, value & 0xff
         lowers.append(low)
         if uppers and uppers[-1][0] == high:
             # Same high part as the previous value: extend its run.
             uppers[-1] = (high, uppers[-1][1] + 1)
         else:
             uppers.append((high, 1))
     return uppers, lowers

 def compress_normal(normal):
     """Encode ``(start, count)`` runs as lists of length bytes.

     For each run the result contains one list holding the encoded gap since
     the end of the previous run followed by the encoded length of the run
     itself.  Runs are expected in ascending, non-overlapping order; this is
     not verified.

     Length encoding:
       * 0x00..0x7f   -> one byte:  00, 01, ..., 7e, 7f
       * 0x80..0x7fff -> two bytes: 80 80, 80 81, ..., ff fe, ff ff

     An assertion fails if a gap or a run length is 0x8000 or more.
     """
     def encode(length):
         if length > 0x7f:
             # High bit of the first byte flags the two-byte form.
             return [0x80 | (length >> 8), length & 0xff]
         return [length & 0x7f]

     compressed = []
     cursor = 0  # first value after the previous run
     for start, count in normal:
         gap = start - cursor
         cursor = start + count
         assert gap < 0x8000 and count < 0x8000
         compressed.append(encode(gap) + encode(count))
     return compressed

 def print_singletons(uppers, lowers, uppersname, lowersname):
     """Print the two singleton tables to stdout as Rust constants.

     ``uppers`` is written as ``const <uppersname>: &[(u8, u8)]`` with one
     ``(upper, count)`` pair per line; ``lowers`` is written as
     ``const <lowersname>: &[u8]`` with up to eight bytes per line.
     """
     hex_byte = "{:#04x},".format

     print("const {}: &[(u8, u8)] = &[".format(uppersname))
     for high_byte, run_length in uppers:
         row = "({:#04x}, {}),".format(high_byte, run_length)
         print("    " + row)
     print("];")

     print("const {}: &[u8] = &[".format(lowersname))
     offset = 0
     while offset < len(lowers):
         chunk = lowers[offset:offset + 8]
         print("    " + " ".join(map(hex_byte, chunk)))
         offset += 8
     print("];")

 def print_normal(normal, normalname):
     """Print a compressed run table to stdout as a Rust ``&[u8]`` constant.

     ``normal`` is a sequence of byte lists; each list becomes one line of
     comma-terminated hexadecimal bytes inside ``const <normalname>``.
     """
     print("const {}: &[u8] = &[".format(normalname))
     for entry in normal:
         cells = ["{:#04x},".format(byte) for byte in entry]
         print("    " + " ".join(cells))
     print("];")

 def main():
     """Generate the Rust ``is_printable`` implementation and print it to stdout.

     Reads UnicodeData.txt (``get_file`` downloads it when no local copy
     exists), collects the ranges of code points selected by ``get_escaped``
     and sorts each range into one of three places:

       * ranges starting at or above 0x20000 become explicit range tests
         inside the generated ``is_printable``;
       * runs of one or two code points go into the singleton tables;
       * longer runs go into the run-length ("normal") tables.

     Singleton and normal tables exist twice, once for values below 0x10000
     (suffix 0) and once for 0x10000..0x1ffff (suffix 1, stored with bit 16
     cleared).
     """
     CUTOFF=0x10000
     singletons0 = []
     singletons1 = []
     normal0 = []
     normal1 = []
     extra = []

     # Fetch over HTTPS: this table drives code generation, so it should not
     # be downloaded over an unauthenticated channel.
     with get_file("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt") as unicode_data:
         # get_codepoints() is lazy, so the whole classification loop has to
         # run while the file is still open.
         codepoints = get_codepoints(unicode_data)

         for a, b in to_ranges(get_escaped(codepoints)):
             if a >= 2 * CUTOFF:
                 # Includes a range starting exactly at 0x20000, whatever its
                 # length: its high byte would not fit the u8 singleton table.
                 extra.append((a, b - a))
             elif b - a <= 2:
                 # Runs of one or two code points are stored individually.
                 table = singletons1 if a & CUTOFF else singletons0
                 table.extend(c & ~CUTOFF for c in range(a, b))
             elif a & CUTOFF:
                 normal1.append((a & ~CUTOFF, b - a))
             else:
                 normal0.append((a, b - a))

     singletons0u, singletons0l = compress_singletons(singletons0)
     singletons1u, singletons1l = compress_singletons(singletons1)
     normal0 = compress_normal(normal0)
     normal1 = compress_normal(normal1)

     print("""\
 // NOTE: The following code was generated by "src/libcore/unicode/printable.py",
 //       do not edit directly!

 fn check(x: u16, singletonuppers: &[(u8, u8)], singletonlowers: &[u8],
          normal: &[u8]) -> bool {
     let xupper = (x >> 8) as u8;
     let mut lowerstart = 0;
     for &(upper, lowercount) in singletonuppers {
         let lowerend = lowerstart + lowercount as usize;
         if xupper == upper {
             for &lower in &singletonlowers[lowerstart..lowerend] {
                 if lower == x as u8 {
                     return false;
                 }
             }
         } else if xupper < upper {
             break;
         }
         lowerstart = lowerend;
     }

     let mut x = x as i32;
     let mut normal = normal.iter().cloned();
     let mut current = true;
     while let Some(v) = normal.next() {
         let len = if v & 0x80 != 0 {
             ((v & 0x7f) as i32) << 8 | normal.next().unwrap() as i32
         } else {
             v as i32
         };
         x -= len;
         if x < 0 {
             break;
         }
         current = !current;
     }
     current
 }

 pub(crate) fn is_printable(x: char) -> bool {
     let x = x as u32;
     let lower = x as u16;
     if x < 0x10000 {
         check(lower, SINGLETONS0U, SINGLETONS0L, NORMAL0)
     } else if x < 0x20000 {
         check(lower, SINGLETONS1U, SINGLETONS1L, NORMAL1)
     } else {\
 """)
     for a, b in extra:
         print("        if 0x{:x} <= x && x < 0x{:x} {{".format(a, a + b))
         print("            return false;")
         print("        }")
     print("""\
         true
     }
 }\
 """)
     print()
     print_singletons(singletons0u, singletons0l, 'SINGLETONS0U', 'SINGLETONS0L')
     print_singletons(singletons1u, singletons1l, 'SINGLETONS1U', 'SINGLETONS1L')
     print_normal(normal0, 'NORMAL0')
     print_normal(normal1, 'NORMAL1')

 # Run the generator only when executed as a script, not when imported.
 if __name__ == '__main__':
     main()
	#!/usr/bin/env python

	# This script uses the following Unicode tables:
	# - UnicodeData.txt


	from collections import namedtuple
	import csv
	import os
	import subprocess

	# Exclusive upper bound of the Unicode code space (the last code point is
	# U+10FFFF); get_codepoints() pads its output up to this value.
	NUM_CODEPOINTS=0x110000

	def to_ranges(iter):
	    """Collapse a stream of integers into half-open ``(start, end)`` ranges.

	    Each run of consecutive values becomes one tuple whose ``end`` is
	    exclusive.  A run is always cut at 0x10000 and at 0x20000, so no yielded
	    range extends across either of those values.
	    """
	    current = None
	    for i in iter:
	        if current is None or i != current[1] or i in (0x10000, 0x20000):
	            # Flush the finished run (if any) before opening a new one.
	            if current is not None:
	                yield tuple(current)
	            current = [i, i + 1]
	        else:
	            current[1] += 1
	    if current is not None:
	        yield tuple(current)

	def get_escaped(codepoints):
	    """Yield the ``value`` of every code point that should be escaped.

	    ``codepoints`` is an iterable of objects with ``value`` and ``class_``
	    attributes.  A code point is selected when its class is one of
	    Cc, Cf, Cs, Co, Cn, Zl, Zp or Zs; a missing or empty class counts as Cn.
	    The ASCII space is never selected even though its class is Zs.
	    """
	    for c in codepoints:
	        if (c.class_ or "Cn") in "Cc Cf Cs Co Cn Zl Zp Zs".split() and c.value != ord(' '):
	            yield c.value

	def get_file(f):
	    """Open the local copy of the file named by URL ``f``, fetching it if absent.

	    The file is looked up in the current working directory under the last
	    path component of ``f``.  If it does not exist, ``curl -O`` is run on the
	    URL and the same local name is opened afterwards.

	    Returns an open text file object; the caller is responsible for closing
	    it.  Raises ``subprocess.CalledProcessError`` if curl exits with a
	    non-zero status.
	    """
	    local_name = os.path.basename(f)

	    def open_local():
	        # Explicit encoding so decoding does not depend on the locale, and
	        # newline="" because the csv module asks for it on files it parses.
	        return open(local_name, encoding="utf-8", newline="")

	    try:
	        return open_local()
	    except FileNotFoundError:
	        subprocess.run(["curl", "-O", f], check=True)
	        return open_local()

	# One code point: `value` is its integer value, `class_` is the third field of
	# its UnicodeData.txt row (None for code points that get_codepoints() fills
	# in outside any row or First/Last range).
	Codepoint = namedtuple('Codepoint', 'value class_')

	def get_codepoints(f):
	    """Yield a ``Codepoint`` for every value from 0 up to ``NUM_CODEPOINTS``.

	    ``f`` is an iterable of lines in UnicodeData.txt layout: semicolon
	    separated fields, of which the first three are used (hexadecimal code
	    point, name, class).

	    Code points lying between a row whose name ends in ``First>`` and the
	    following ``Last>`` row are yielded with the class of the First row.
	    Every other code point that has no row of its own is yielded with
	    ``class_`` set to ``None``.

	    Raises ``ValueError`` if a First row is not directly followed by a Last
	    row.
	    """
	    reader = csv.reader(f, delimiter=";")
	    # Start one below zero so that code point 0 is still yielded (as a gap)
	    # when the input is empty or does not begin with the row for 0000.
	    prev_codepoint = -1
	    class_first = None
	    for row in reader:
	        if not row:
	            # csv.reader produces an empty list for a blank line.
	            continue
	        codepoint = int(row[0], 16)
	        name = row[1]
	        class_ = row[2]

	        if class_first is not None and not name.endswith("Last>"):
	            raise ValueError("Missing Last after First")

	        # Fill the gap since the previous row; inside a First/Last pair the
	        # gap inherits the First row's class, otherwise class_first is None.
	        for c in range(prev_codepoint + 1, codepoint):
	            yield Codepoint(c, class_first)

	        class_first = class_ if name.endswith("First>") else None

	        yield Codepoint(codepoint, class_)
	        prev_codepoint = codepoint

	    if class_first is not None:
	        raise ValueError("Missing Last after First")

	    # Pad the tail of the code space with class-less entries.
	    for c in range(prev_codepoint + 1, NUM_CODEPOINTS):
	        yield Codepoint(c, None)

	def compress_singletons(singletons):
	    """Split values into a run-length table of high parts plus a list of low bytes.

	    Returns ``(uppers, lowers)``.  ``lowers`` holds ``value & 0xff`` for every
	    input value, in input order.  ``uppers`` holds ``(upper, count)`` pairs
	    where ``upper`` is ``value >> 8`` and ``count`` is how many consecutive
	    entries of ``lowers`` share that upper part.
	    """
	    uppers = [] # (upper, # items in lowers)
	    lowers = []

	    for i in singletons:
	        upper = i >> 8
	        lower = i & 0xff
	        if not uppers or uppers[-1][0] != upper:
	            uppers.append((upper, 1))
	        else:
	            # Same high part as the previous value: extend its run.
	            upper, count = uppers[-1]
	            uppers[-1] = upper, count + 1
	        lowers.append(lower)

	    return uppers, lowers

	def compress_normal(normal):
	    """Encode ``(start, count)`` runs as lists of length bytes.

	    For each run the result contains one list holding the encoded gap since
	    the end of the previous run followed by the encoded length of the run
	    itself.  Runs are expected in ascending, non-overlapping order; this is
	    not verified.

	    An assertion fails if a gap or a run length is 0x8000 or more.
	    """
	    # lengths 0x00..0x7f are encoded as 00, 01, ..., 7e, 7f
	    # lengths 0x80..0x7fff are encoded as 80 80, 80 81, ..., ff fe, ff ff
	    compressed = [] # [truelen, (truelenaux), falselen, (falselenaux)]

	    prev_start = 0
	    for start, count in normal:
	        truelen = start - prev_start
	        falselen = count
	        prev_start = start + count

	        assert truelen < 0x8000 and falselen < 0x8000
	        entry = []
	        if truelen > 0x7f:
	            # High bit of the first byte flags the two-byte form.
	            entry.append(0x80 | (truelen >> 8))
	            entry.append(truelen & 0xff)
	        else:
	            entry.append(truelen & 0x7f)
	        if falselen > 0x7f:
	            entry.append(0x80 | (falselen >> 8))
	            entry.append(falselen & 0xff)
	        else:
	            entry.append(falselen & 0x7f)

	        compressed.append(entry)

	    return compressed

	def print_singletons(uppers, lowers, uppersname, lowersname):
	    """Print the two singleton tables to stdout as Rust constants.

	    ``uppers`` is written as ``const <uppersname>: &[(u8, u8)]`` with one
	    ``(upper, count)`` pair per line; ``lowers`` is written as
	    ``const <lowersname>: &[u8]`` with up to eight bytes per line.
	    """
	    print("const {}: &[(u8, u8)] = &[".format(uppersname))
	    for u, c in uppers:
	        print("    ({:#04x}, {}),".format(u, c))
	    print("];")
	    print("const {}: &[u8] = &[".format(lowersname))
	    for i in range(0, len(lowers), 8):
	        print("    {}".format(" ".join("{:#04x},".format(l) for l in lowers[i:i+8])))
	    print("];")

	def print_normal(normal, normalname):
	    """Print a compressed run table to stdout as a Rust ``&[u8]`` constant.

	    ``normal`` is a sequence of byte lists; each list becomes one line of
	    comma-terminated hexadecimal bytes inside ``const <normalname>``.
	    """
	    print("const {}: &[u8] = &[".format(normalname))
	    for v in normal:
	        print("    {}".format(" ".join("{:#04x},".format(i) for i in v)))
	    print("];")

	def main():
	    """Generate the Rust ``is_printable`` implementation and print it to stdout.

	    Reads UnicodeData.txt (``get_file`` downloads it when no local copy
	    exists), collects the ranges of code points selected by ``get_escaped``
	    and sorts each range into one of three places:

	      * ranges starting at or above 0x20000 become explicit range tests
	        inside the generated ``is_printable``;
	      * runs of one or two code points go into the singleton tables;
	      * longer runs go into the run-length ("normal") tables.

	    Singleton and normal tables exist twice, once for values below 0x10000
	    (suffix 0) and once for 0x10000..0x1ffff (suffix 1, stored with bit 16
	    cleared).
	    """
	    CUTOFF=0x10000
	    singletons0 = []
	    singletons1 = []
	    normal0 = []
	    normal1 = []
	    extra = []

	    # Fetch over HTTPS: this table drives code generation, so it should not
	    # be downloaded over an unauthenticated channel.
	    with get_file("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt") as unicode_data:
	        # get_codepoints() is lazy, so the whole classification loop has to
	        # run while the file is still open.
	        codepoints = get_codepoints(unicode_data)

	        for a, b in to_ranges(get_escaped(codepoints)):
	            if a >= 2 * CUTOFF:
	                # Includes a range starting exactly at 0x20000, whatever its
	                # length: its high byte would not fit the u8 singleton table.
	                extra.append((a, b - a))
	            elif b - a <= 2:
	                # Runs of one or two code points are stored individually.
	                table = singletons1 if a & CUTOFF else singletons0
	                table.extend(c & ~CUTOFF for c in range(a, b))
	            elif a & CUTOFF:
	                normal1.append((a & ~CUTOFF, b - a))
	            else:
	                normal0.append((a, b - a))

	    singletons0u, singletons0l = compress_singletons(singletons0)
	    singletons1u, singletons1l = compress_singletons(singletons1)
	    normal0 = compress_normal(normal0)
	    normal1 = compress_normal(normal1)

	    print("""\
	// NOTE: The following code was generated by "src/libcore/unicode/printable.py",
	//       do not edit directly!

	fn check(x: u16, singletonuppers: &[(u8, u8)], singletonlowers: &[u8],
	         normal: &[u8]) -> bool {
	    let xupper = (x >> 8) as u8;
	    let mut lowerstart = 0;
	    for &(upper, lowercount) in singletonuppers {
	        let lowerend = lowerstart + lowercount as usize;
	        if xupper == upper {
	            for &lower in &singletonlowers[lowerstart..lowerend] {
	                if lower == x as u8 {
	                    return false;
	                }
	            }
	        } else if xupper < upper {
	            break;
	        }
	        lowerstart = lowerend;
	    }

	    let mut x = x as i32;
	    let mut normal = normal.iter().cloned();
	    let mut current = true;
	    while let Some(v) = normal.next() {
	        let len = if v & 0x80 != 0 {
	            ((v & 0x7f) as i32) << 8 | normal.next().unwrap() as i32
	        } else {
	            v as i32
	        };
	        x -= len;
	        if x < 0 {
	            break;
	        }
	        current = !current;
	    }
	    current
	}

	pub(crate) fn is_printable(x: char) -> bool {
	    let x = x as u32;
	    let lower = x as u16;
	    if x < 0x10000 {
	        check(lower, SINGLETONS0U, SINGLETONS0L, NORMAL0)
	    } else if x < 0x20000 {
	        check(lower, SINGLETONS1U, SINGLETONS1L, NORMAL1)
	    } else {\
	""")
	    for a, b in extra:
	        print("        if 0x{:x} <= x && x < 0x{:x} {{".format(a, a + b))
	        print("            return false;")
	        print("        }")
	    print("""\
	        true
	    }
	}\
	""")
	    print()
	    print_singletons(singletons0u, singletons0l, 'SINGLETONS0U', 'SINGLETONS0L')
	    print_singletons(singletons1u, singletons1l, 'SINGLETONS1U', 'SINGLETONS1L')
	    print_normal(normal0, 'NORMAL0')
	    print_normal(normal1, 'NORMAL1')

	# Run the generator only when executed as a script, not when imported.
	if __name__ == '__main__':
	    main()