| #!/usr/bin/env python3 |
| # Copyright (C) 1998, 1999 Tom Tromey |
| # Copyright (C) 2001 Red Hat Software |
| # |
| # This program is free software; you can redistribute it and/or modify |
| # it under the terms of the GNU General Public License as published by |
| # the Free Software Foundation; either version 2, or (at your option) |
| # any later version. |
| # |
| # This program is distributed in the hope that it will be useful, |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| # GNU General Public License for more details. |
| # |
| # You should have received a copy of the GNU General Public License |
| # along with this program; if not, see <http://www.gnu.org/licenses/>. |
| |
| """ |
| gen-casemap-txt.py - Generate test cases for case mapping from Unicode data. |
| See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html |
| Usage: gen-casemap-txt.py UNICODE-VERSION UnicodeData.txt SpecialCasing.txt |
| I consider the output of this program to be unrestricted. |
| Use it as you will. |
| """ |
| |
| import sys |
| import argparse |
| |
| |
def main(argv):
    """Read the Unicode data files named on the command line and print tests.

    Simple case mappings are collected from UnicodeData.txt; unconditional
    entries of SpecialCasing.txt then override them.  The resulting tables
    are handed to print_tests(), which writes to standard output.

    Args:
        argv: Complete argument vector; argv[0] (the program name) is
            skipped.  The remaining items are the Unicode version string,
            the path of UnicodeData.txt and the path of SpecialCasing.txt.

    Raises:
        SystemExit: If a data line has an unexpected number of fields
            (argparse also exits on an invalid command line).
    """
    parser = argparse.ArgumentParser(
        description="Generate test cases for case mapping from Unicode data")
    # metavar keeps the names shown in the help text while the parsed values
    # get attribute names that are valid Python identifiers.
    parser.add_argument("version", metavar="UNICODE-VERSION")
    parser.add_argument("filename_udata", metavar="UnicodeData.txt")
    parser.add_argument("filename_casing", metavar="SpecialCasing.txt")
    args = parser.parse_args(argv[1:])
    version = args.version
    filename_udata = args.filename_udata
    filename_casing = args.filename_casing

    # Names of fields in Unicode data table.
    CODE, NAME, CATEGORY, COMBINING_CLASSES, BIDI_CATEGORY, DECOMPOSITION, \
        DECIMAL_VALUE, DIGIT_VALUE, NUMERIC_VALUE, MIRRORED, OLD_NAME, \
        COMMENT, UPPER, LOWER, TITLE = range(15)

    # Names of fields in the SpecialCasing table
    CASE_CODE, CASE_LOWER, CASE_TITLE, CASE_UPPER, CASE_CONDITION = range(5)

    # Code point (int) -> mapped string, one table per case.
    upper = {}
    title = {}
    lower = {}

    def make_hex(codes):
        """Converts a string of white space separated code points encoded as
        hex values to a Unicode string. Any extra white space is ignored.
        """
        return "".join(chr(int(c, 16)) for c in codes.split())

    def process_one(code, fields):
        """Record the mappings of one UnicodeData row for code point *code*.

        Only the categories Ll, Lu and Lt contribute; any other category
        leaves the tables untouched.
        """
        type_ = fields[CATEGORY]
        if type_ == "Ll":
            upper[code] = make_hex(fields[UPPER])
            lower[code] = chr(code)
            title[code] = make_hex(fields[TITLE])
        elif type_ == "Lu":
            lower[code] = make_hex(fields[LOWER])
            upper[code] = chr(code)
            title[code] = make_hex(fields[TITLE])
        elif type_ == "Lt":
            upper[code] = make_hex(fields[UPPER])
            lower[code] = make_hex(fields[LOWER])
            # NOTE(review): the titlecase mapping is taken from the LOWER
            # field here, unlike the Ll/Lu branches which use TITLE --
            # confirm this is intended.
            title[code] = make_hex(fields[LOWER])

    with open(filename_udata, encoding="utf-8") as fileobj:
        last_code = -1
        for line in fileobj:
            line = line.strip()
            if not line:
                # A blank line carries no entry; do not report it as a
                # malformed one.
                continue

            fields = [f.strip() for f in line.split(";")]
            if len(fields) != 15:
                raise SystemExit(
                    "Entry for %s has wrong number of fields (%d)" % (
                        fields[CODE], len(fields)))

            code = int(fields[CODE], 16)

            if code > last_code + 1:
                # Found a gap
                if fields[NAME].endswith("Last>"):
                    # Fill the gap with the last character read,
                    # since this was a range specified in the char database.
                    # Work on a copy so the parsed row itself is not
                    # modified while its CODE field is rewritten below.
                    gfields = list(fields)
                else:
                    # The gap represents undefined characters. Only the type
                    # matters.
                    gfields = ["", "", "Cn", "0"] + [""] * 11

                for gap_code in range(last_code + 1, code):
                    gfields[CODE] = "%04x" % gap_code
                    process_one(gap_code, gfields)

            process_one(code, fields)
            last_code = code

    with open(filename_casing, encoding="utf-8") as fileobj:
        for line in fileobj:
            # strip comments and skip empty lines
            line = line.split("#", 1)[0].strip()
            if not line:
                continue

            # all lines end with ";" so just remove it
            line = line.rstrip(";").rstrip()
            fields = [f.strip() for f in line.split(";")]
            if len(fields) not in (4, 5):
                raise SystemExit(
                    "Entry for %s has wrong number of fields (%d)" % (
                        fields[CASE_CODE], len(fields)))

            if len(fields) == 5:
                # Ignore conditional special cases - we'll handle them manually
                continue

            code = int(fields[CASE_CODE], 16)

            # Unconditional special casings replace whatever UnicodeData.txt
            # supplied for this code point.
            upper[code] = make_hex(fields[CASE_UPPER])
            lower[code] = make_hex(fields[CASE_LOWER])
            title[code] = make_hex(fields[CASE_TITLE])

    print_tests(version, upper, title, lower)
| |
| |
def print_tests(version, upper, title, lower):
    """Print the case-mapping test file to standard output.

    A fixed header of hand-written test rows is printed first, followed by
    one generated row per code point that has at least one non-empty
    mapping.  A generated row has the form
    ``<TAB>char<TAB>lower<TAB>title<TAB>upper<TAB># HEX``.

    Args:
        version: Unicode version, substituted into the header comment.
        upper: Dict mapping a code point (int) to its uppercase string.
        title: Dict mapping a code point (int) to its titlecase string.
        lower: Dict mapping a code point (int) to its lowercase string.
    """
    # NOTE(review): the three sigma rows in the header below separate their
    # columns with spaces while every other row uses \t -- confirm that is
    # intended before relying on them.
    print("""\
# Test cases generated from Unicode {} data
# by gen-casemap-txt.py. Do not edit.
#
# Some special hand crafted tests
#
tr_TR\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
tr_TR\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
tr_TR\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
tr_TR.UTF-8\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
tr_TR.UTF-8\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
tr_TR.UTF-8\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
# Test reordering of YPOGEGRAMMENI across other accents
\t\u03b1\u0345\u0314\t\u03b1\u0345\u0314\t\u0391\u0345\u0314\t\u0391\u0314\u0399\t
\t\u03b1\u0314\u0345\t\u03b1\u0314\u0345\t\u0391\u0314\u0345\t\u0391\u0314\u0399\t
# Handling of final and nonfinal sigma
\tΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ
\tΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ
\tΣΙΓΜΑ σιγμα Σιγμα ΣΙΓΜΑ
# Lithuanian rule of i followed by letter with dot. Not at all sure
# about the titlecase part here
lt_LT\ti\u0117\ti\u0117\tIe\tIE\t
lt_LT\tie\u0307\tie\u0307\tIe\tIE\t
lt_LT\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
lt_LT\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
lt_LT\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
lt_LT\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent)
lt_LT\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent)
lt_LT\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above)
lt_LT\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
lt_LT\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
lt_LT\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
lt_LT.UTF-8\ti\u0117\ti\u0117\tIe\tIE\t
lt_LT.UTF-8\tie\u0307\tie\u0307\tIe\tIE\t
lt_LT.UTF-8\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
lt_LT.UTF-8\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
lt_LT.UTF-8\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
lt_LT.UTF-8\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent)
lt_LT.UTF-8\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent)
lt_LT.UTF-8\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above)
lt_LT.UTF-8\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
lt_LT.UTF-8\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
lt_LT.UTF-8\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
# Special case not at initial position
\ta\ufb04\ta\ufb04\tAffl\tAFFL\t# FB04
#
# Now the automatic tests
#""".format(version))

    # Only code points present in at least one table can yield a row, so
    # visit just those, in ascending order, instead of probing every code
    # point.  The inclusive upper bound covers U+10FFFF, the last valid code
    # point; keys outside the Unicode range are ignored.
    code_points = sorted(
        cp for cp in set(upper) | set(title) | set(lower)
        if 0 <= cp <= 0x10FFFF)

    for i in code_points:
        if i == 0x3A3:
            # Greek sigma needs special tests
            continue

        up = upper.get(i, "")
        lo = lower.get(i, "")
        ti = title.get(i, "")

        if up or lo or ti:
            print("\t%s\t%s\t%s\t%s\t# %4X" % (chr(i), lo, ti, up, i))
| |
| |
# Run the generator only when executed as a script; whatever main() returns
# becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main(sys.argv))