| #!/usr/bin/env python3 | 
 | # Copyright (C) 1998, 1999 Tom Tromey | 
 | # Copyright (C) 2001 Red Hat Software | 
 | # | 
 | # This program is free software; you can redistribute it and/or modify | 
 | # it under the terms of the GNU General Public License as published by | 
 | # the Free Software Foundation; either version 2, or (at your option) | 
 | # any later version. | 
 | # | 
 | # This program is distributed in the hope that it will be useful, | 
 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | 
 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
 | # GNU General Public License for more details. | 
 | # | 
 | # You should have received a copy of the GNU General Public License | 
 | # along with this program; if not, see <http://www.gnu.org/licenses/>. | 
 |  | 
 | """ | 
 | gen-casemap-txt.py - Generate test cases for case mapping from Unicode data. | 
 | See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html | 
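
Usage:
    gen-casemap-txt.py UNICODE-VERSION UnicodeData.txt SpecialCasing.txt

    The generated test cases are written to standard output.

License of the output:
    I consider the output of this program to be unrestricted.
    Use it as you will.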
 | """ | 
 |  | 
 | import sys | 
 | import argparse | 
 |  | 
 |  | 
 | def main(argv): | 
 |     parser = argparse.ArgumentParser( | 
 |         description="Generate test cases for case mapping from Unicode data") | 
 |     parser.add_argument("UNICODE-VERSION") | 
 |     parser.add_argument("UnicodeData.txt") | 
 |     parser.add_argument("SpecialCasing.txt") | 
 |     args = parser.parse_args(argv[1:]) | 
 |     version = getattr(args, "UNICODE-VERSION") | 
 |     filename_udata = getattr(args, "UnicodeData.txt") | 
 |     filename_casing = getattr(args, "SpecialCasing.txt") | 
 |  | 
 |     # Names of fields in Unicode data table. | 
 |     CODE, NAME, CATEGORY, COMBINING_CLASSES, BIDI_CATEGORY, DECOMPOSITION, \ | 
 |         DECIMAL_VALUE, DIGIT_VALUE, NUMERIC_VALUE, MIRRORED, OLD_NAME, \ | 
 |         COMMENT, UPPER, LOWER, TITLE = range(15) | 
 |  | 
 |     # Names of fields in the SpecialCasing table | 
 |     CASE_CODE, CASE_LOWER, CASE_TITLE, CASE_UPPER, CASE_CONDITION = range(5) | 
 |  | 
 |     upper = {} | 
 |     title = {} | 
 |     lower = {} | 
 |  | 
 |     def make_hex(codes): | 
 |         """Converts a string of white space separated code points encoded as | 
 |         hex values to a Unicode string. Any extra white space is ignored. | 
 |         """ | 
 |         return "".join([chr(int(c, 16)) for c in codes.split()]) | 
 |  | 
 |     def process_one(code, fields): | 
 |         type_ = fields[CATEGORY] | 
 |         if type_ == "Ll": | 
 |             upper[code] = make_hex(fields[UPPER]) | 
 |             lower[code] = chr(code) | 
 |             title[code] = make_hex(fields[TITLE]) | 
 |         elif type_ == "Lu": | 
 |             lower[code] = make_hex(fields[LOWER]) | 
 |             upper[code] = chr(code) | 
 |             title[code] = make_hex(fields[TITLE]) | 
 |         elif type_ == "Lt": | 
 |             upper[code] = make_hex(fields[UPPER]) | 
 |             lower[code] = make_hex(fields[LOWER]) | 
 |             title[code] = make_hex(fields[LOWER]) | 
 |  | 
 |     with open(filename_udata, encoding="utf-8") as fileobj: | 
 |         last_code = -1 | 
 |         for line in fileobj: | 
 |             line = line.strip() | 
 |             fields = [f.strip() for f in line.split(";")] | 
 |             if len(fields) != 15: | 
 |                 raise SystemExit( | 
 |                     "Entry for %s has wrong number of fields (%d)" % ( | 
 |                         fields[CODE], len(fields))) | 
 |  | 
 |             code = int(fields[CODE], 16) | 
 |  | 
 |             if code > last_code + 1: | 
 |                 # Found a gap | 
 |                 if fields[NAME].endswith("Last>"): | 
 |                     # Fill the gap with the last character read, | 
 |                     # since this was a range specified in the char database | 
 |                     gfields = fields | 
 |                 else: | 
 |                     # The gap represents undefined characters.  Only the type | 
 |                     # matters. | 
 |                     gfields = ['', '', 'Cn', '0', '', '', '', '', '', '', '', | 
 |                                '', '', '', ''] | 
 |  | 
 |                 last_code += 1 | 
 |                 while last_code < code: | 
 |                     gfields[CODE] = "%04x" % last_code | 
 |                     process_one(last_code, gfields) | 
 |                     last_code += 1 | 
 |  | 
 |             process_one(code, fields) | 
 |             last_code = code | 
 |  | 
 |     with open(filename_casing, encoding="utf-8") as fileobj: | 
 |         last_code = -1 | 
 |         for line in fileobj: | 
 |             # strip comments and skip empty lines | 
 |             line = line.split("#", 1)[0].strip() | 
 |             if not line: | 
 |                 continue | 
 |  | 
 |             # all lines end with ";" so just remove it | 
 |             line = line.rstrip(";").rstrip() | 
 |             fields = [f.strip() for f in line.split(";")] | 
 |             if len(fields) not in (4, 5): | 
 |                 raise SystemExit( | 
 |                     "Entry for %s has wrong number of fields (%d)" % ( | 
 |                         fields[CASE_CODE], len(fields))) | 
 |  | 
 |             if len(fields) == 5: | 
 |                 # Ignore conditional special cases - we'll handle them manually | 
 |                 continue | 
 |  | 
 |             code = int(fields[CASE_CODE], 16) | 
 |  | 
 |             upper[code] = make_hex(fields[CASE_UPPER]) | 
 |             lower[code] = make_hex(fields[CASE_LOWER]) | 
 |             title[code] = make_hex(fields[CASE_TITLE]) | 
 |  | 
 |     print_tests(version, upper, title, lower) | 
 |  | 
 |  | 
 | def print_tests(version, upper, title, lower): | 
 |     print("""\ | 
 | # Test cases generated from Unicode {} data | 
 | # by gen-casemap-txt.py. Do not edit. | 
 | # | 
 | # Some special hand crafted tests | 
 | # | 
 | tr_TR\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE | 
 | tr_TR\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I | 
 | tr_TR\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I | 
 | tr_TR.UTF-8\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE | 
 | tr_TR.UTF-8\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I | 
 | tr_TR.UTF-8\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I | 
 | # Test reordering of YPOGEGRAMMENI across other accents | 
 | \t\u03b1\u0345\u0314\t\u03b1\u0345\u0314\t\u0391\u0345\u0314\t\u0391\u0314\u0399\t | 
 | \t\u03b1\u0314\u0345\t\u03b1\u0314\u0345\t\u0391\u0314\u0345\t\u0391\u0314\u0399\t | 
 | # Handling of final and nonfinal sigma | 
 | \tΜΆΙΟΣ 	μάιος 	Μάιος 	ΜΆΙΟΣ 	 | 
 | \tΜΆΙΟΣ	μάιος	Μάιος	ΜΆΙΟΣ	 | 
 | \tΣΙΓΜΑ	σιγμα	Σιγμα	ΣΙΓΜΑ	 | 
 | # Lithuanian rule of i followed by letter with dot. Not at all sure | 
 | # about the titlecase part here | 
 | lt_LT\ti\u0117\ti\u0117\tIe\tIE\t | 
 | lt_LT\tie\u0307\tie\u0307\tIe\tIE\t | 
 | lt_LT\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE | 
 | lt_LT\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE | 
 | lt_LT\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE | 
 | lt_LT\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent) | 
 | lt_LT\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent) | 
 | lt_LT\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above) | 
 | lt_LT\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent) | 
 | lt_LT\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent) | 
 | lt_LT\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent) | 
 | lt_LT.UTF-8\ti\u0117\ti\u0117\tIe\tIE\t | 
 | lt_LT.UTF-8\tie\u0307\tie\u0307\tIe\tIE\t | 
 | lt_LT.UTF-8\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE | 
 | lt_LT.UTF-8\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE | 
 | lt_LT.UTF-8\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE | 
 | lt_LT.UTF-8\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent) | 
 | lt_LT.UTF-8\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent) | 
 | lt_LT.UTF-8\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above) | 
 | lt_LT.UTF-8\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent) | 
 | lt_LT.UTF-8\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent) | 
 | lt_LT.UTF-8\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent) | 
 | # Special case not at initial position | 
 | \ta\ufb04\ta\ufb04\tAffl\tAFFL\t# FB04 | 
 | # | 
 | # Now the automatic tests | 
 | #""".format(version)) | 
 |  | 
 |     for i in range(0x10ffff): | 
 |         if i == 0x3A3: | 
 |             # Greek sigma needs special tests | 
 |             continue | 
 |  | 
 |         up = upper.get(i, "") | 
 |         lo = lower.get(i, "") | 
 |         ti = title.get(i, "") | 
 |  | 
 |         if any([up, lo, ti]): | 
 |             print("\t%s\t%s\t%s\t%s\t# %4X" % (chr(i), lo, ti, up, i)) | 
 |  | 
 |  | 
 | if __name__ == "__main__": | 
 |     sys.exit(main(sys.argv)) |