| from collections import Counter |
| import unicodedata |
| from google.protobuf import text_format |
| from gflanguages import languages_public_pb2 |
| |
| ATTRIBUTES = "base auxiliary marks punctuation index".split(" ") |
| |
| |
| def main(args=None): |
| for path in args: |
| with open(path, encoding="utf-8") as fp: |
| language = text_format.Parse( |
| fp.read(), languages_public_pb2.LanguageProto() |
| ) |
| changed = False |
| exemplar_values = {} |
| bases = language.exemplar_chars.base.split(" ") |
| marks = language.exemplar_chars.marks.split(" ") |
| if not len(bases) or bases == [""]: |
| continue |
| new_marks = [] |
| new_bases = [] |
| for chars in marks: |
| if not chars: |
| continue |
| if chars[0] != "\u25CC": |
| chars = "\u25CC" + chars |
| if chars not in new_marks: |
| new_marks.append(chars) |
| |
| for chars in bases: |
| if not chars: |
| continue |
| if chars[0] == "\u25CC": |
| chars = chars[1:] |
| cat = unicodedata.category(chars[0]) |
| if cat in ["Mn", "Mc"]: |
| if chars[0] != "\u25CC": |
| chars = "\u25CC" + chars |
| if chars not in new_marks: |
| new_marks.append(chars) |
| else: |
| new_bases.append(chars) |
| |
| language.exemplar_chars.base = " ".join(new_bases) |
| language.exemplar_chars.marks = " ".join(new_marks) |
| |
| with open(path, "w", encoding="utf-8") as fp: |
| fp.write(text_format.MessageToString(language, as_utf8=True)) |
| fp.close() |
| |
| |
| if __name__ == "__main__": |
| import sys |
| |
| main(args=sys.argv[1:]) |