| #!/usr/bin/env python3 |
| # |
| # Original script modified in November 2003 to take advantage of |
| # the character-validation range routines, and updated to the |
| # current Unicode information (Version 4.0.1) |
| # |
| # NOTE: there is an 'alias' facility for blocks which are not present in |
| # the current release, but are needed for ABI compatibility. This |
| # must be accomplished MANUALLY! Please see the comments below under |
| # 'blockAliases' |
| # |
| import sys |
| import rangetab |
| |
| # |
# blockAliases is a small hack - it is used for mapping block names which
# were used in the 3.1 release, but are missing or changed in the current
# release. The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
| blockAliases = [] |
| blockAliases.append("CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols") |
| blockAliases.append("Greek:GreekandCoptic") |
| blockAliases.append("PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A," + |
| "SupplementaryPrivateUseArea-B") |
| |
# minTableSize gives the minimum number of ranges which must be present
# before a range table is produced. If there are fewer than this
# number, inline comparisons are generated instead
| minTableSize = 8 |
| |
| blockfile = "Blocks-4.0.1.txt" |
| catfile = "UnicodeData-4.0.1.txt" |
| |
| |
| # |
# Now process the "blocks" file, reducing it to a dictionary
# indexed by blockname, each entry containing a list of the
# applicable block ranges
| # |
| BlockNames = {} |
| try: |
| blocks = open(blockfile, "r") |
except OSError:
| print("Missing %s, aborting ..." % blockfile) |
| sys.exit(1) |
| |
| for line in blocks.readlines(): |
| if line[0] == '#': |
| continue |
| line = line.strip() |
| if line == '': |
| continue |
    try:
        fields = line.split(';')
        rangestr = fields[0].strip()
        (start, end) = rangestr.split("..")
        name = fields[1].strip()
        name = name.replace(' ', '')
        start = int(start, 16)
        end = int(end, 16)
    except (IndexError, ValueError):
        print("Failed to process line: %s" % (line))
        continue
    BlockNames.setdefault(name, []).append((start, end))
| blocks.close() |
print("Parsed %d block descriptions" % (len(BlockNames.keys())))
| |
| for block in blockAliases: |
| alias = block.split(':') |
| alist = alias[1].split(',') |
| for comp in alist: |
| if comp in BlockNames: |
| if alias[0] not in BlockNames: |
| BlockNames[alias[0]] = [] |
| for r in BlockNames[comp]: |
| BlockNames[alias[0]].append(r) |
        else:
            print("Alias %s: %s not in Blocks" % (alias[0], comp))
| |
| # |
# Next process the Categories file. This is more complex, since
# the file is in code sequence, and we need to invert it. We use
# a dictionary indexed by category name, with each entry containing
# all the codepoints of that category (reduced to ranges further
# below). Note that category names comprise two parts - the general
# category, and the "subclass" within that category. Therefore, both
# the "general category" (which is the first character of the
# 2-character category name) and the full (2-character) name are
# entered into this dictionary.
| # |
| try: |
| data = open(catfile, "r") |
except OSError:
| print("Missing %s, aborting ..." % catfile) |
| sys.exit(1) |
| |
nbchar = 0
| Categories = {} |
| for line in data.readlines(): |
| if line[0] == '#': |
| continue |
| line = line.strip() |
| if line == '': |
| continue |
    try:
        fields = line.split(';')
        point = fields[0].strip()
        value = int(point, 16)
        name = fields[2]
    except (IndexError, ValueError):
        print("Failed to process line: %s" % (line))
        continue
| |
    nbchar += 1
    # update entry for the full (2-character) category name
    Categories.setdefault(name, []).append(value)
    # update entry for the "general category" (1-character) name
    Categories.setdefault(name[0], []).append(value)
| |
| data.close() |
print("Parsed %d characters, generating %d categories" % (nbchar, len(Categories.keys())))
| |
| # |
| # The data is now all read. Time to process it into a more useful form. |
| # |
# reduce each category's codepoint list into a list of ranges
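# e.g. the codepoint list [0x41, 0x42, 0x43, 0x47] becomes the range
# list [(0x41, 0x43), (0x47, 0x47)]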
for cat in Categories.keys():
    points = Categories[cat]
    start = -1
    prev = -1
    ranges = []
    for val in points:
| if start == -1: |
| start = val |
| prev = val |
| continue |
| elif val == prev + 1: |
| prev = val |
| continue |
| elif prev == start: |
| ranges.append((prev, prev)) |
| start = val |
| prev = val |
| continue |
| else: |
| ranges.append((start, prev)) |
| start = val |
| prev = val |
| continue |
| if prev == start: |
| ranges.append((prev, prev)) |
| else: |
| ranges.append((start, prev)) |
| Categories[cat] = ranges |
| |
| # |
# Ensure all data is in alphabetic order, since we will be doing binary
# searches on the tables.
| # |
| bkeys = sorted(BlockNames.keys()) |
| |
| ckeys = sorted(Categories.keys()) |
| |
| # |
| # Generate the resulting files |
| # |
| try: |
| output = open("codegen/unicode.inc", "w") |
except OSError:
| print("Failed to open codegen/unicode.inc") |
| sys.exit(1) |
| |
| # |
| # For any categories with more than minTableSize ranges we generate |
| # a range table suitable for xmlCharInRange |
| # |
| for name in ckeys: |
| if len(Categories[name]) <= minTableSize or name == 'Cs': |
| continue |
| ranges = Categories[name] |
| group = rangetab.gen_range_tables(output, 'xml' + name, 'S', 'L', ranges) |
| output.write("static const xmlChRangeGroup xml%sG = %s;\n\n" % |
| (name, group)) |
| |
| for name in ckeys: |
| if name == 'Cs': |
| continue |
| ranges = Categories[name] |
| output.write("static int\nxmlUCSIsCat%s(int code) {\n" % name) |
| if len(Categories[name]) > minTableSize: |
| output.write(" return(xmlCharInRange((unsigned int)code, &xml%sG)" |
| % name) |
| else: |
| start = 1 |
        for (begin, end) in ranges:
            if start:
                output.write(" return(")
                start = 0
            else:
                output.write(" ||\n ")
| if (begin == end): |
| output.write("(code == %s)" % (hex(begin))) |
| else: |
| output.write("((code >= %s) && (code <= %s))" % ( |
| hex(begin), hex(end))) |
| output.write(");\n}\n\n") |
| |
| # |
| # Range tables for blocks |
| # |
| |
| blockGroups = '' |
| for block in bkeys: |
| name = block.replace('-', '') |
| ranges = BlockNames[block] |
| group = rangetab.gen_range_tables(output, 'xml' + name, 'S', 'L', ranges) |
| output.write("\n") |
| if blockGroups != '': |
| blockGroups += ",\n" |
| blockGroups += ' {"%s",\n %s}' % (block, group) |
| |
| output.write("static const xmlUnicodeRange xmlUnicodeBlocks[] = {\n") |
| output.write(blockGroups) |
| output.write("\n};\n\n") |
| |
| output.close() |