| #!/usr/bin/env python3 |
| # |
| # Original script modified in November 2003 to take advantage of |
| # the character-validation range routines, and updated to the |
| # current Unicode information (Version 4.0.1) |
| # |
| # NOTE: there is an 'alias' facility for blocks which are not present in |
| # the current release, but are needed for ABI compatibility. This |
| # must be accomplished MANUALLY! Please see the comments below under |
| # 'blockAliases' |
| # |
| import sys |
| import rangetab |
| |
| # |
# blockAliases is a small hack - it is used for mapping block names which
# were used in the 3.1 release, but are missing or changed in the current
# release. The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
| blockAliases = [] |
| blockAliases.append("CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols") |
| blockAliases.append("Greek:GreekandCoptic") |
| blockAliases.append("PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A," + |
| "SupplementaryPrivateUseArea-B") |
| |
# minTableSize gives the minimum number of ranges which must be present
# before a range table is produced. If there are fewer than this
# number, inline comparisons are generated instead
| minTableSize = 8 |
| |
| blockfile = "Blocks-4.0.1.txt" |
| catfile = "UnicodeData-4.0.1.txt" |
| |
| |
| # |
# Now process the "blocks" file, reducing it to a dictionary
# indexed by blockname, each entry containing a list of the
# applicable block ranges
| # |
| BlockNames = {} |
| try: |
| blocks = open(blockfile, "r") |
except OSError:
| print("Missing %s, aborting ..." % blockfile) |
| sys.exit(1) |
| |
| for line in blocks.readlines(): |
| if line[0] == '#': |
| continue |
| line = line.strip() |
| if line == '': |
| continue |
    try:
        fields = line.split(';')
        rangestr = fields[0].strip()
        (start, end) = rangestr.split("..")
        name = fields[1].strip()
        name = name.replace(' ', '')
        start = int(start, 16)
        end = int(end, 16)
    except (IndexError, ValueError):
        print("Failed to process line: %s" % (line))
        continue
    BlockNames.setdefault(name, []).append((start, end))
| blocks.close() |
print("Parsed %d block descriptions" % (len(BlockNames.keys())))
| |
| for block in blockAliases: |
| alias = block.split(':') |
| alist = alias[1].split(',') |
| for comp in alist: |
| if comp in BlockNames: |
| if alias[0] not in BlockNames: |
| BlockNames[alias[0]] = [] |
| for r in BlockNames[comp]: |
| BlockNames[alias[0]].append(r) |
        else:
            print("Alias %s: %s not in Blocks" % (alias[0], comp))
| |
| # |
# Next process the Categories file. This is more complex, since
# the file is in code sequence, and we need to invert it. We use
# a dictionary indexed by category name, with each entry containing
# all the codepoints of that category (reduced to ranges further
# below). Note that category names comprise two parts - the general
# category, and the "subclass" within that category. Therefore, both
# the "general category" (which is the first character of the
# 2-character category name) and the full (2-character) name are
# entered into this dictionary.
| # |
| try: |
| data = open(catfile, "r") |
except OSError:
| print("Missing %s, aborting ..." % catfile) |
| sys.exit(1) |
| |
nbchar = 0
| Categories = {} |
| for line in data.readlines(): |
| if line[0] == '#': |
| continue |
| line = line.strip() |
| if line == '': |
| continue |
    try:
        fields = line.split(';')
        point = fields[0].strip()
        value = int(point, 16)
        name = fields[2]
    except (IndexError, ValueError):
        print("Failed to process line: %s" % (line))
        continue
| |
    nbchar += 1
    # update entry for the full (2-character) category name
    Categories.setdefault(name, []).append(value)
    # update entry for the "general category" (1-character) name
    Categories.setdefault(name[0], []).append(value)
| |
| data.close() |
print("Parsed %d characters, generating %d categories" % (nbchar, len(Categories.keys())))
| |
| # |
| # The data is now all read. Time to process it into a more useful form. |
| # |
# reduce each category's codepoint list into a list of ranges
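# e.g. the codepoint list [0x41, 0x42, 0x43, 0x47] becomes the range
# list [(0x41, 0x43), (0x47, 0x47)]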
for cat in Categories.keys():
    points = Categories[cat]
    start = -1
    prev = -1
    ranges = []
    for val in points:
| if start == -1: |
| start = val |
| prev = val |
| continue |
| elif val == prev + 1: |
| prev = val |
| continue |
| elif prev == start: |
| ranges.append((prev, prev)) |
| start = val |
| prev = val |
| continue |
| else: |
| ranges.append((start, prev)) |
| start = val |
| prev = val |
| continue |
| if prev == start: |
| ranges.append((prev, prev)) |
| else: |
| ranges.append((start, prev)) |
| Categories[cat] = ranges |
| |
| # |
# Ensure all data is in alphabetic order, since we will be doing binary
# searches on the tables.
| # |
| bkeys = sorted(BlockNames.keys()) |
| |
| ckeys = sorted(Categories.keys()) |
| |
| # |
| # Generate the resulting files |
| # |
| try: |
| output = open("codegen/unicode.inc", "w") |
except OSError:
| print("Failed to open codegen/unicode.inc") |
| sys.exit(1) |
| |
| # |
| # For any categories with more than minTableSize ranges we generate |
| # a range table suitable for xmlCharInRange |
| # |
| for name in ckeys: |
| if len(Categories[name]) <= minTableSize or name == 'Cs': |
| continue |
| ranges = Categories[name] |
| group = rangetab.gen_range_tables(output, 'xml' + name, 'S', 'L', ranges) |
| output.write("static const xmlChRangeGroup xml%sG = %s;\n\n" % |
| (name, group)) |
| |
| for name in ckeys: |
| if name == 'Cs': |
| continue |
| ranges = Categories[name] |
| output.write("static int\nxmlUCSIsCat%s(int code) {\n" % name) |
| if len(Categories[name]) > minTableSize: |
| output.write(" return(xmlCharInRange((unsigned int)code, &xml%sG)" |
| % name) |
| else: |
| start = 1 |
        for (begin, end) in ranges:
            if start:
                output.write(" return(")
                start = 0
            else:
                output.write(" ||\n ")
| if (begin == end): |
| output.write("(code == %s)" % (hex(begin))) |
| else: |
| output.write("((code >= %s) && (code <= %s))" % ( |
| hex(begin), hex(end))) |
| output.write(");\n}\n\n") |
| |
| # |
| # Range tables for blocks |
| # |
| |
| blockGroups = '' |
| for block in bkeys: |
| name = block.replace('-', '') |
| ranges = BlockNames[block] |
| group = rangetab.gen_range_tables(output, 'xml' + name, 'S', 'L', ranges) |
| output.write("\n") |
| if blockGroups != '': |
| blockGroups += ",\n" |
| blockGroups += ' {"%s",\n %s}' % (block, group) |
| |
| output.write("static const xmlUnicodeRange xmlUnicodeBlocks[] = {\n") |
| output.write(blockGroups) |
| output.write("\n};\n\n") |
| |
| output.close() |