support/printable.py - third_party/github.com/fmtlib/fmt - Git at Google

 #!/usr/bin/env python3

 # This script is based on
 # https://github.com/rust-lang/rust/blob/master/library/core/src/unicode/printable.py
 # distributed under https://github.com/rust-lang/rust/blob/master/LICENSE-MIT.

 # This script uses the following Unicode tables:
 # - UnicodeData.txt


 from collections import namedtuple
 import csv
 import os
 import subprocess

 # Size of the Unicode code space: code points run from 0 to 0x10FFFF, so
 # this is the exclusive upper bound used when enumerating or range-checking.
 NUM_CODEPOINTS=0x110000

 def to_ranges(iter):
     """Collapse a stream of integers into half-open ``(start, end)`` ranges.

     A value that directly continues the current run (it equals the run's
     exclusive end) extends that run; any other value closes the run and
     opens a new one.  The values 0x10000 and 0x20000 always open a new run,
     so no emitted range ever crosses either of those boundaries.

     Yields tuples ``(start, end)`` with ``end`` exclusive.
     """
     forced_breaks = (0x10000, 0x20000)
     start = end = None
     for value in iter:
         continues_run = start is not None and value == end
         if continues_run and value not in forced_breaks:
             end += 1
             continue
         # Close the run in progress (if any) before opening a new one.
         if start is not None:
             yield (start, end)
         start, end = value, value + 1
     if start is not None:
         yield (start, end)

 def get_escaped(codepoints):
     """Yield the values of the code points that have to be escaped.

     A code point qualifies when its class is one of Cc, Cf, Cs, Co, Cn, Zl,
     Zp or Zs; a missing or empty class counts as Cn.  The ASCII space is the
     one exception and is never yielded.

     ``codepoints`` is an iterable of objects exposing ``value`` and
     ``class_`` attributes.
     """
     escaped_classes = "Cc Cf Cs Co Cn Zl Zp Zs".split()
     space = ord(' ')
     for cp in codepoints:
         category = cp.class_ or "Cn"
         if category not in escaped_classes:
             continue
         if cp.value != space:
             yield cp.value

 def get_file(f):
     """Open the local copy of the file named by URL ``f``, fetching it if absent.

     The local name is the last path component of ``f``, looked up relative
     to the current working directory.  If no such file exists, ``curl -O``
     is run to download it (a failing curl raises
     ``subprocess.CalledProcessError``) and the downloaded file is opened.

     Returns the open text-mode file object; closing it is up to the caller.
     """
     local_name = os.path.basename(f)
     try:
         handle = open(local_name)
     except FileNotFoundError:
         # Not cached yet: let curl save it under its remote name, then retry.
         subprocess.run(["curl", "-O", f], check=True)
         handle = open(local_name)
     return handle

 # One record per code point: its integer value and its class string taken
 # from the data file (None where the data file has no entry for it).
 Codepoint = namedtuple('Codepoint', 'value class_')

 def get_codepoints(f):
     """Yield a ``Codepoint`` for each row of ``f`` and for every gap around them.

     ``f`` is read as ``;``-separated records whose first three fields are
     the hexadecimal code point, its name and its class.

     Code points missing between two consecutive rows are filled in: with the
     class of the opening row when the rows form a ``First>``/``Last>`` pair,
     and with ``None`` otherwise.  Gap filling starts after code point 0, and
     everything after the final row up to ``NUM_CODEPOINTS`` (exclusive) is
     yielded with class ``None``.

     Raises:
         ValueError: if a ``First>`` row is not immediately followed by a
             ``Last>`` row, or is the last row of the file.
     """
     reader = csv.reader(f, delimiter=";")
     last_seen = 0
     pending_class = None  # class of a still-open "First>" row, else None
     for fields in reader:
         codepoint = int(fields[0], 16)
         name, class_ = fields[1], fields[2]

         # An open "First>" row must be closed by the very next row.
         if pending_class is not None and not name.endswith("Last>"):
             raise ValueError("Missing Last after First")

         # Fill the hole between the previous row and this one.
         yield from (
             Codepoint(filler, pending_class)
             for filler in range(last_seen + 1, codepoint)
         )

         pending_class = class_ if name.endswith("First>") else None
         yield Codepoint(codepoint, class_)
         last_seen = codepoint

     if pending_class is not None:
         raise ValueError("Missing Last after First")

     # Everything past the last row is unassigned.
     yield from (
         Codepoint(filler, None)
         for filler in range(last_seen + 1, NUM_CODEPOINTS)
     )

 def compress_singletons(singletons):
     """Split singleton values into an upper-part run table and a low-byte list.

     Each value contributes its low 8 bits to ``lowers``.  Adjacent values
     (in input order) that share the same upper part ``value >> 8`` are
     counted together as one ``(upper, count)`` entry in ``uppers``.

     Returns:
         ``(uppers, lowers)`` -- a list of ``(upper, count)`` tuples and the
         flat list of low bytes, in input order.
     """
     uppers = []  # (upper, number of items in lowers)
     lowers = []

     for value in singletons:
         high, low = divmod(value, 0x100)
         if uppers and uppers[-1][0] == high:
             # Same upper part as the previous value: grow its run.
             uppers[-1] = (high, uppers[-1][1] + 1)
         else:
             uppers.append((high, 1))
         lowers.append(low)

     return uppers, lowers

 def compress_normal(normal):
     """Encode ``(start, count)`` ranges as alternating run lengths.

     For each range two lengths are emitted: the gap since the end of the
     previous range (``truelen``) and the size of the range itself
     (``falselen``).  Each length takes one or two bytes:

     - lengths 0x00..0x7f are encoded as 00, 01, ..., 7e, 7f
     - lengths 0x80..0x7fff are encoded as 80 80, 80 81, ..., ff fe, ff ff

     Returns a list with one entry per range, each entry being the list
     ``[truelen, (truelenaux), falselen, (falselenaux)]``.  Both lengths are
     asserted to be below 0x8000.
     """
     def encode(length):
         # Two bytes, high bit set on the first, once it no longer fits in 7 bits.
         if length > 0x7f:
             return [0x80 | (length >> 8), length & 0xff]
         return [length & 0x7f]

     compressed = []
     cursor = 0  # end of the previous range
     for start, count in normal:
         gap = start - cursor
         cursor = start + count

         assert gap < 0x8000 and count < 0x8000
         compressed.append(encode(gap) + encode(count))

     return compressed

 def print_singletons(uppers, lowers, uppersname, lowersname):
     """Print the two C++ arrays that make up one singleton table.

     ``uppers`` is written as a ``singleton`` array named ``uppersname``, one
     ``{upper, count}`` initializer per line.  ``lowers`` is written as an
     ``unsigned char`` array named ``lowersname``, eight values per line.
     """
     print("  static constexpr singleton {}[] = {{".format(uppersname))
     for high, run_length in uppers:
         print("    {{{:#04x}, {}}},".format(high, run_length))
     print("  };")

     print("  static constexpr unsigned char {}[] = {{".format(lowersname))
     per_row = 8
     for offset in range(0, len(lowers), per_row):
         cells = ["{:#04x},".format(byte) for byte in lowers[offset:offset + per_row]]
         print("    {}".format(" ".join(cells)))
     print("  };")

 def print_normal(normal, normalname):
     """Print one run-length table as a C++ ``unsigned char`` array.

     The array is named ``normalname``; every entry of ``normal`` (a sequence
     of byte values) becomes one line of hexadecimal literals.
     """
     print("  static constexpr unsigned char {}[] = {{".format(normalname))
     for entry in normal:
         cells = ["{:#04x},".format(byte) for byte in entry]
         print("    {}".format(" ".join(cells)))
     print("  };")

 def main():
     """Generate the C++ ``is_printable`` function and print it to stdout.

     UnicodeData.txt is read (and downloaded first if it is not present
     locally), the escaped code points are grouped into ranges, and every
     range is filed into one of five tables:

     - ranges starting at or above 0x20000 become explicit range checks
       (``extra``);
     - ranges of one or two code points become singleton entries, split into
       table 0 (below 0x10000) and table 1 (0x10000..0x1ffff, stored with
       the 0x10000 bit cleared);
     - longer ranges become run-length entries in ``normal0``/``normal1``
       using the same split.

     The tables and the lookup code that uses them are then printed.
     """
     CUTOFF=0x10000
     singletons0 = []
     singletons1 = []
     normal0 = []
     normal1 = []
     extra = []

     # The code point stream is lazy, so the file has to stay open until the
     # loop has consumed it; the with-block closes it afterwards.
     with get_file("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt") as file:
         for a, b in to_ranges(get_escaped(get_codepoints(file))):
             if a >= 2 * CUTOFF:
                 # Includes a == 0x20000: such a range must never reach the
                 # singleton tables, whose plane test (a & CUTOFF) would
                 # misfile it under plane 0.
                 extra.append((a, b - a))
             elif b - a <= 2:
                 if a & CUTOFF:
                     singletons1.extend(c & ~CUTOFF for c in range(a, b))
                 else:
                     singletons0.extend(range(a, b))
             elif a & CUTOFF:
                 normal1.append((a & ~CUTOFF, b - a))
             else:
                 normal0.append((a, b - a))

     singletons0u, singletons0l = compress_singletons(singletons0)
     singletons1u, singletons1l = compress_singletons(singletons1)
     normal0 = compress_normal(normal0)
     normal1 = compress_normal(normal1)

     print("""\
 FMT_FUNC auto is_printable(uint32_t cp) -> bool {\
 """)
     print_singletons(singletons0u, singletons0l, 'singletons0', 'singletons0_lower')
     print_singletons(singletons1u, singletons1l, 'singletons1', 'singletons1_lower')
     print_normal(normal0, 'normal0')
     print_normal(normal1, 'normal1')
     print("""\
   auto lower = static_cast<uint16_t>(cp);
   if (cp < 0x10000) {
     return is_printable(lower, singletons0,
                         sizeof(singletons0) / sizeof(*singletons0),
                         singletons0_lower, normal0, sizeof(normal0));
   }
   if (cp < 0x20000) {
     return is_printable(lower, singletons1,
                         sizeof(singletons1) / sizeof(*singletons1),
                         singletons1_lower, normal1, sizeof(normal1));
   }\
 """)
     for a, b in extra:
         print("  if (0x{:x} <= cp && cp < 0x{:x}) return false;".format(a, a + b))
     print("""\
   return cp < 0x{:x};
 }}\
 """.format(NUM_CODEPOINTS))

 # Run the generator only when executed as a script, not when imported.
 if __name__ == '__main__':
     main()
	#!/usr/bin/env python3

	# This script is based on
	# https://github.com/rust-lang/rust/blob/master/library/core/src/unicode/printable.py
	# distributed under https://github.com/rust-lang/rust/blob/master/LICENSE-MIT.

	# This script uses the following Unicode tables:
	# - UnicodeData.txt


	from collections import namedtuple
	import csv
	import os
	import subprocess

	# Size of the Unicode code space: code points run from 0 to 0x10FFFF, so
	# this is the exclusive upper bound used when enumerating or range-checking.
	NUM_CODEPOINTS=0x110000

	def to_ranges(iter):
	    """Collapse a stream of integers into half-open ``(start, end)`` ranges.

	    A value that directly continues the current run (it equals the run's
	    exclusive end) extends that run; any other value closes the run and
	    opens a new one.  The values 0x10000 and 0x20000 always open a new run,
	    so no emitted range ever crosses either of those boundaries.

	    Yields tuples ``(start, end)`` with ``end`` exclusive.
	    """
	    current = None
	    for i in iter:
	        if current is None or i != current[1] or i in (0x10000, 0x20000):
	            # Close the run in progress (if any) and start a new one at i.
	            if current is not None:
	                yield tuple(current)
	            current = [i, i + 1]
	        else:
	            current[1] += 1
	    if current is not None:
	        yield tuple(current)

	def get_escaped(codepoints):
	    """Yield the values of the code points that have to be escaped.

	    A code point qualifies when its class is one of Cc, Cf, Cs, Co, Cn, Zl,
	    Zp or Zs; a missing or empty class counts as Cn.  The ASCII space is the
	    one exception and is never yielded.

	    ``codepoints`` is an iterable of objects exposing ``value`` and
	    ``class_`` attributes.
	    """
	    # Built once instead of re-splitting the string for every code point.
	    escaped_classes = "Cc Cf Cs Co Cn Zl Zp Zs".split()
	    for c in codepoints:
	        if (c.class_ or "Cn") in escaped_classes and c.value != ord(' '):
	            yield c.value

	def get_file(f):
	    """Open the local copy of the file named by URL ``f``, fetching it if absent.

	    The local name is the last path component of ``f``, looked up relative
	    to the current working directory.  If no such file exists, ``curl -O``
	    is run to download it (a failing curl raises
	    ``subprocess.CalledProcessError``) and the downloaded file is opened.

	    Returns the open text-mode file object; closing it is up to the caller.
	    """
	    try:
	        return open(os.path.basename(f))
	    except FileNotFoundError:
	        # Not cached yet: let curl save it under its remote name, then retry.
	        subprocess.run(["curl", "-O", f], check=True)
	        return open(os.path.basename(f))

	# One record per code point: its integer value and its class string taken
	# from the data file (None where the data file has no entry for it).
	Codepoint = namedtuple('Codepoint', 'value class_')

	def get_codepoints(f):
	    """Yield a ``Codepoint`` for each row of ``f`` and for every gap around them.

	    ``f`` is read as ``;``-separated records whose first three fields are
	    the hexadecimal code point, its name and its class.

	    Code points missing between two consecutive rows are filled in: with the
	    class of the opening row when the rows form a ``First>``/``Last>`` pair,
	    and with ``None`` otherwise.  Gap filling starts after code point 0, and
	    everything after the final row up to ``NUM_CODEPOINTS`` (exclusive) is
	    yielded with class ``None``.

	    Raises:
	        ValueError: if a ``First>`` row is not immediately followed by a
	            ``Last>`` row, or is the last row of the file.
	    """
	    r = csv.reader(f, delimiter=";")
	    prev_codepoint = 0
	    class_first = None  # class of a still-open "First>" row, else None
	    for row in r:
	        codepoint = int(row[0], 16)
	        name = row[1]
	        class_ = row[2]

	        # An open "First>" row must be closed by the very next row.
	        if class_first is not None:
	            if not name.endswith("Last>"):
	                raise ValueError("Missing Last after First")

	        # Fill the hole between the previous row and this one.
	        for c in range(prev_codepoint + 1, codepoint):
	            yield Codepoint(c, class_first)

	        class_first = None
	        if name.endswith("First>"):
	            class_first = class_

	        yield Codepoint(codepoint, class_)
	        prev_codepoint = codepoint

	    if class_first is not None:
	        raise ValueError("Missing Last after First")

	    # Everything past the last row is unassigned.
	    for c in range(prev_codepoint + 1, NUM_CODEPOINTS):
	        yield Codepoint(c, None)

	def compress_singletons(singletons):
	    """Split singleton values into an upper-part run table and a low-byte list.

	    Each value contributes its low 8 bits to ``lowers``.  Adjacent values
	    (in input order) that share the same upper part ``value >> 8`` are
	    counted together as one ``(upper, count)`` entry in ``uppers``.

	    Returns:
	        ``(uppers, lowers)`` -- a list of ``(upper, count)`` tuples and the
	        flat list of low bytes, in input order.
	    """
	    uppers = []  # (upper, # items in lowers)
	    lowers = []

	    for i in singletons:
	        upper = i >> 8
	        lower = i & 0xff
	        if not uppers or uppers[-1][0] != upper:
	            uppers.append((upper, 1))
	        else:
	            # Same upper part as the previous value: grow its run.
	            upper, count = uppers[-1]
	            uppers[-1] = upper, count + 1
	        lowers.append(lower)

	    return uppers, lowers

	def compress_normal(normal):
	    """Encode ``(start, count)`` ranges as alternating run lengths.

	    For each range two lengths are emitted: the gap since the end of the
	    previous range (``truelen``) and the size of the range itself
	    (``falselen``).  Each length takes one or two bytes.

	    Returns a list with one entry per range, each entry being the list
	    ``[truelen, (truelenaux), falselen, (falselenaux)]``.  Both lengths are
	    asserted to be below 0x8000.
	    """
	    # lengths 0x00..0x7f are encoded as 00, 01, ..., 7e, 7f
	    # lengths 0x80..0x7fff are encoded as 80 80, 80 81, ..., ff fe, ff ff
	    compressed = []  # [truelen, (truelenaux), falselen, (falselenaux)]

	    prev_start = 0  # end of the previous range
	    for start, count in normal:
	        truelen = start - prev_start
	        falselen = count
	        prev_start = start + count

	        assert truelen < 0x8000 and falselen < 0x8000
	        entry = []
	        if truelen > 0x7f:
	            entry.append(0x80 | (truelen >> 8))
	            entry.append(truelen & 0xff)
	        else:
	            entry.append(truelen & 0x7f)
	        if falselen > 0x7f:
	            entry.append(0x80 | (falselen >> 8))
	            entry.append(falselen & 0xff)
	        else:
	            entry.append(falselen & 0x7f)

	        compressed.append(entry)

	    return compressed

	def print_singletons(uppers, lowers, uppersname, lowersname):
	    """Print the two C++ arrays that make up one singleton table.

	    ``uppers`` is written as a ``singleton`` array named ``uppersname``, one
	    ``{upper, count}`` initializer per line.  ``lowers`` is written as an
	    ``unsigned char`` array named ``lowersname``, eight values per line.
	    Declarations are indented by two spaces and their contents by four.
	    """
	    print("  static constexpr singleton {}[] = {{".format(uppersname))
	    for u, c in uppers:
	        print("    {{{:#04x}, {}}},".format(u, c))
	    print("  };")
	    print("  static constexpr unsigned char {}[] = {{".format(lowersname))
	    for i in range(0, len(lowers), 8):
	        print("    {}".format(" ".join("{:#04x},".format(l) for l in lowers[i:i+8])))
	    print("  };")

	def print_normal(normal, normalname):
	    """Print one run-length table as a C++ ``unsigned char`` array.

	    The array is named ``normalname``; every entry of ``normal`` (a sequence
	    of byte values) becomes one line of hexadecimal literals.  The
	    declaration is indented by two spaces and its contents by four.
	    """
	    print("  static constexpr unsigned char {}[] = {{".format(normalname))
	    for v in normal:
	        print("    {}".format(" ".join("{:#04x},".format(i) for i in v)))
	    print("  };")

	def main():
	    """Generate the C++ ``is_printable`` function and print it to stdout.

	    UnicodeData.txt is read (and downloaded first if it is not present
	    locally), the escaped code points are grouped into ranges, and every
	    range is filed into one of five tables:

	    - ranges starting at or above 0x20000 become explicit range checks
	      (``extra``);
	    - ranges of one or two code points become singleton entries, split into
	      table 0 (below 0x10000) and table 1 (0x10000..0x1ffff, stored with
	      the 0x10000 bit cleared);
	    - longer ranges become run-length entries in ``normal0``/``normal1``
	      using the same split.

	    The tables and the lookup code that uses them are then printed.
	    """
	    CUTOFF=0x10000
	    singletons0 = []
	    singletons1 = []
	    normal0 = []
	    normal1 = []
	    extra = []

	    # The code point stream is lazy, so the file has to stay open until the
	    # loop has consumed it; the with-block closes it afterwards.
	    with get_file("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt") as file:
	        for a, b in to_ranges(get_escaped(get_codepoints(file))):
	            if a >= 2 * CUTOFF:
	                # Includes a == 0x20000: such a range must never reach the
	                # singleton tables, whose plane test (a & CUTOFF) would
	                # misfile it under plane 0.
	                extra.append((a, b - a))
	            elif a == b - 1:
	                if a & CUTOFF:
	                    singletons1.append(a & ~CUTOFF)
	                else:
	                    singletons0.append(a)
	            elif a == b - 2:
	                if a & CUTOFF:
	                    singletons1.append(a & ~CUTOFF)
	                    singletons1.append((a + 1) & ~CUTOFF)
	                else:
	                    singletons0.append(a)
	                    singletons0.append(a + 1)
	            elif a & CUTOFF:
	                normal1.append((a & ~CUTOFF, b - a))
	            else:
	                normal0.append((a, b - a))

	    singletons0u, singletons0l = compress_singletons(singletons0)
	    singletons1u, singletons1l = compress_singletons(singletons1)
	    normal0 = compress_normal(normal0)
	    normal1 = compress_normal(normal1)

	    print("""\
	FMT_FUNC auto is_printable(uint32_t cp) -> bool {\
	""")
	    print_singletons(singletons0u, singletons0l, 'singletons0', 'singletons0_lower')
	    print_singletons(singletons1u, singletons1l, 'singletons1', 'singletons1_lower')
	    print_normal(normal0, 'normal0')
	    print_normal(normal1, 'normal1')
	    print("""\
	  auto lower = static_cast<uint16_t>(cp);
	  if (cp < 0x10000) {
	    return is_printable(lower, singletons0,
	                        sizeof(singletons0) / sizeof(*singletons0),
	                        singletons0_lower, normal0, sizeof(normal0));
	  }
	  if (cp < 0x20000) {
	    return is_printable(lower, singletons1,
	                        sizeof(singletons1) / sizeof(*singletons1),
	                        singletons1_lower, normal1, sizeof(normal1));
	  }\
	""")
	    for a, b in extra:
	        print("  if (0x{:x} <= cp && cp < 0x{:x}) return false;".format(a, a + b))
	    print("""\
	  return cp < 0x{:x};
	}}\
	""".format(NUM_CODEPOINTS))

	# Run the generator only when executed as a script, not when imported.
	if __name__ == '__main__':
	    main()