| #!/usr/bin/env python3 |
| # |
| # Copyright 2016 Google Inc. All rights reserved. |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| """Compare emoji image file namings against unicode property data. |
| The intent of this script is to check if the resulting font will pass |
| the Android linter: |
| https://android.googlesource.com/platform/frameworks/base/+/master/tools/fonts/fontchain_linter.py |
| """ |
| |
| import argparse |
| import collections |
| import os |
| from os import path |
| import re |
| import sys |
| |
| from nototools import unicode_data |
| import add_aliases |
| |
| ZWJ = 0x200d |
| EMOJI_VS = 0xfe0f |
| |
| END_TAG = 0xe007f |
| |
| def _make_tag_set(): |
| tag_set = set() |
| tag_set |= set(range(0xe0030, 0xe003a)) # 0-9 |
| tag_set |= set(range(0xe0061, 0xe007b)) # a-z |
| tag_set.add(END_TAG) |
| return tag_set |
| |
| TAG_SET = _make_tag_set() |
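# For example, the England flag is the tag sequence
#   1f3f4 e0067 e0062 e0065 e006e e0067 e007f
# (BLACK FLAG, the tag characters for 'gbeng', CANCEL TAG); under the default
# naming scheme its image would be
# emoji_u1f3f4_e0067_e0062_e0065_e006e_e0067_e007f.png.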
| |
| _namedata = None |
| |
| def seq_name(seq): |
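    """Return the name of a codepoint sequence, or None if unknown.

    For example, seq_name((0x1f1fa, 0x1f1f8)) should return the flag
    sequence name from the nototools data (roughly 'flag: United States');
    single-codepoint sequences fall back to unicode_data.name.
    """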
| global _namedata |
| |
| if not _namedata: |
| def strip_vs_map(seq_map): |
| return { |
| unicode_data.strip_emoji_vs(k): v |
| for k, v in seq_map.items()} |
| _namedata = [ |
| strip_vs_map(unicode_data.get_emoji_combining_sequences()), |
| strip_vs_map(unicode_data.get_emoji_flag_sequences()), |
| strip_vs_map(unicode_data.get_emoji_modifier_sequences()), |
| strip_vs_map(unicode_data.get_emoji_zwj_sequences()), |
| ] |
| |
| if len(seq) == 1: |
| return unicode_data.name(seq[0], None) |
| |
| for data in _namedata: |
| if seq in data: |
| return data[seq] |
| if EMOJI_VS in seq: |
| non_vs_seq = unicode_data.strip_emoji_vs(seq) |
| for data in _namedata: |
| if non_vs_seq in data: |
| return data[non_vs_seq] |
| |
| return None |
| |
| |
| def _check_no_vs(sorted_seq_to_filepath): |
| """Our image data does not use emoji presentation variation selectors.""" |
| for seq, fp in sorted_seq_to_filepath.items(): |
| if EMOJI_VS in seq: |
            print(f'check no VS: emoji presentation selector fe0f in path: {fp}')
| |
| |
| def _check_valid_emoji_cps(sorted_seq_to_filepath, unicode_version): |
| """Ensure all cps in these sequences are valid emoji cps or specific cps |
| used in forming emoji sequences. This is a 'pre-check' that reports |
| this specific problem.""" |
| |
| coverage_pass = True |
| |
| valid_cps = set(unicode_data.get_emoji()) |
| if unicode_version is None or unicode_version >= unicode_data.PROPOSED_EMOJI_AGE: |
| valid_cps |= unicode_data.proposed_emoji_cps() |
| else: |
| valid_cps = set( |
| cp for cp in valid_cps if unicode_data.age(cp) <= unicode_version) |
| valid_cps.add(0x200d) # ZWJ |
| valid_cps.add(0x20e3) # combining enclosing keycap |
| valid_cps.add(0xfe0f) # variation selector (emoji presentation) |
| valid_cps.add(0xfe82b) # PUA value for unknown flag |
| valid_cps |= TAG_SET # used in subregion tag sequences |
| |
| not_emoji = {} |
| for seq, fp in sorted_seq_to_filepath.items(): |
| for cp in seq: |
            if cp not in valid_cps:
                not_emoji.setdefault(cp, []).append(fp)
| |
    if not_emoji:
        print(
            f'check valid emoji cps: {len(not_emoji)} non-emoji cp found',
            file=sys.stderr)
        for cp in sorted(not_emoji):
            fps = not_emoji[cp]
            print(
                f'check the following cp: {cp:04x} - {fps[0]} (in {len(fps)} sequences)',
                file=sys.stderr)
        coverage_pass = False
| |
| if not coverage_pass: |
| exit("Please fix the problems mentioned above or run: make BYPASS_SEQUENCE_CHECK='True'") |
| |
| |
| def _check_zwj(sorted_seq_to_filepath): |
| """Ensure zwj is only between two appropriate emoji. This is a 'pre-check' |
| that reports this specific problem.""" |
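    # For example, in the man-astronaut sequence (0x1f468, ZWJ, 0x1f680) both
    # neighbors of the ZWJ are emoji, so it passes; a ZWJ at either end of a
    # sequence, or adjacent to a non-emoji codepoint, is reported.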
| |
| for seq, fp in sorted_seq_to_filepath.items(): |
| if ZWJ not in seq: |
| continue |
| if seq[0] == ZWJ: |
| print(f'check zwj: zwj at head of sequence in {fp}', file=sys.stderr) |
| if len(seq) == 1: |
| continue |
| if seq[-1] == ZWJ: |
| print(f'check zwj: zwj at end of sequence in {fp}', file=sys.stderr) |
| for i, cp in enumerate(seq): |
| if cp == ZWJ: |
| if i > 0: |
| pcp = seq[i-1] |
| if pcp != EMOJI_VS and not unicode_data.is_emoji(pcp): |
| print( |
| f'check zwj: non-emoji {pcp} precedes ZWJ in {fp}', |
| file=sys.stderr) |
| if i < len(seq) - 1: |
| fcp = seq[i+1] |
| if not unicode_data.is_emoji(fcp): |
| print( |
| f'check zwj: non-emoji {fcp} follows ZWJ in {fp}', |
| file=sys.stderr) |
| |
| |
| def _check_flags(sorted_seq_to_filepath): |
| """Ensure regional indicators are only in sequences of one or two, and |
| never mixed.""" |
| for seq, fp in sorted_seq_to_filepath.items(): |
| have_reg = None |
| for cp in seq: |
| is_reg = unicode_data.is_regional_indicator(cp) |
            if have_reg is None:
| have_reg = is_reg |
| elif have_reg != is_reg: |
| print( |
| f'check flags: mix of regional and non-regional in {fp}', |
| file=sys.stderr) |
        if have_reg and len(seq) > 2:
            # We provide dummy glyphs for regional indicators, so sequences
            # with a single regional indicator symbol do occur; the length
            # check allows those while flagging longer runs.
| print( |
| f'check flags: regional indicator sequence length != 2 in {fp}', |
| file=sys.stderr) |
| |
| def _check_tags(sorted_seq_to_filepath): |
| """Ensure tag sequences (for subregion flags) conform to the spec. We don't |
| validate against CLDR, just that there's a sequence of 2 or more tags starting |
| and ending with the appropriate codepoints.""" |
| |
| BLACK_FLAG = 0x1f3f4 |
    BLACK_FLAG_SET = {BLACK_FLAG}
| for seq, fp in sorted_seq_to_filepath.items(): |
        seq_set = set(seq)
| overlap_set = seq_set & TAG_SET |
| if not overlap_set: |
| continue |
| if seq[0] != BLACK_FLAG: |
| print(f'check tags: bad start tag in {fp}') |
| elif seq[-1] != END_TAG: |
| print(f'check tags: bad end tag in {fp}') |
| elif len(seq) < 4: |
| print(f'check tags: sequence too short in {fp}') |
| elif seq_set - TAG_SET != BLACK_FLAG_SET: |
| print(f'check tags: non-tag items in {fp}') |
| |
| |
| def _check_skintone(sorted_seq_to_filepath): |
| """Ensure skin tone modifiers are not applied to emoji that are not defined |
| to take them. May appear standalone, though. Also check that emoji that take |
| skin tone modifiers have a complete set.""" |
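    # For example, (0x1f44d, 0x1f3fd) is THUMBS UP SIGN with a medium skin
    # tone; the five modifiers are 1f3fb through 1f3ff, so every modifier
    # base seen with any modifier should appear with all five.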
| base_to_modifiers = collections.defaultdict(set) |
| for seq, fp in sorted_seq_to_filepath.items(): |
| for i, cp in enumerate(seq): |
| if unicode_data.is_skintone_modifier(cp): |
| if i == 0: |
| if len(seq) > 1: |
| print( |
| f'check skintone: skin color selector first in sequence {fp}', |
| file=sys.stderr) |
| # standalone are ok |
| continue |
| pcp = seq[i-1] |
| if not unicode_data.is_emoji_modifier_base(pcp): |
| print( |
| f'check skintone: emoji skintone modifier applied to non-base at {i}: {fp}', |
| file=sys.stderr) |
                else:
                    # base_to_modifiers is a defaultdict(set); no membership
                    # check is needed before adding.
                    base_to_modifiers[pcp].add(cp)
| |
    for cp, modifiers in sorted(base_to_modifiers.items()):
        if len(modifiers) != 5:
            modifier_list = ', '.join(f'{m:04x}' for m in sorted(modifiers))
            print(
                f'check skintone: base {cp:04x} has {len(modifiers)} modifiers '
                f'defined ({modifier_list})',
                file=sys.stderr)
| |
| |
| def _check_zwj_sequences(sorted_seq_to_filepath, unicode_version): |
| """Verify that zwj sequences are valid for the given unicode version.""" |
| for seq, fp in sorted_seq_to_filepath.items(): |
| if ZWJ not in seq: |
| continue |
| age = unicode_data.get_emoji_sequence_age(seq) |
        if age is None or (unicode_version is not None and age > unicode_version):
| print(f'check zwj sequences: undefined sequence {fp}') |
| |
| |
| def _check_no_alias_sources(sorted_seq_to_filepath): |
| """Check that we don't have sequences that we expect to be aliased to |
| some other sequence.""" |
| aliases = add_aliases.read_default_emoji_aliases() |
| for seq, fp in sorted_seq_to_filepath.items(): |
| if seq in aliases: |
| print(f'check no alias sources: aliased sequence {fp}') |
| |
| |
| def _check_coverage(seq_to_filepath, unicode_version): |
| """Ensure we have all and only the cps and sequences that we need for the |
| font as of this version.""" |
| |
| coverage_pass = True |
| |
| non_vs_to_canonical = {} |
| for k in seq_to_filepath: |
| if EMOJI_VS in k: |
| non_vs = unicode_data.strip_emoji_vs(k) |
| non_vs_to_canonical[non_vs] = k |
| |
| aliases = add_aliases.read_default_emoji_aliases() |
| for k, v in sorted(aliases.items()): |
| if v not in seq_to_filepath and v not in non_vs_to_canonical: |
| alias_str = unicode_data.seq_to_string(k) |
| target_str = unicode_data.seq_to_string(v) |
| print(f'coverage: alias {alias_str} missing target {target_str}') |
| coverage_pass = False |
| continue |
| if k in seq_to_filepath or k in non_vs_to_canonical: |
| alias_str = unicode_data.seq_to_string(k) |
| target_str = unicode_data.seq_to_string(v) |
| print(f'coverage: alias {alias_str} already exists as {target_str} ({seq_name(v)})') |
| coverage_pass = False |
| continue |
| filename = seq_to_filepath.get(v) or seq_to_filepath[non_vs_to_canonical[v]] |
| seq_to_filepath[k] = 'alias:' + filename |
| |
    # check single emoji; this includes most of the special chars
| emoji = sorted(unicode_data.get_emoji()) |
| for cp in emoji: |
        if (cp,) not in seq_to_filepath:
            print(
                f'coverage: missing single {cp:04x} ({unicode_data.name(cp)})')
| coverage_pass = False |
| |
| # special characters |
| # all but combining enclosing keycap are currently marked as emoji |
    for cp in [ord('*'), ord('#'), ord('\u20e3')] + list(range(0x30, 0x3a)):
        if cp not in emoji and (cp,) not in seq_to_filepath:
            print(f'coverage: missing special {cp:04x} ({unicode_data.name(cp)})')
| coverage_pass = False |
| |
    # combining sequences (read from nototools' private _emoji_sequence_data)
| comb_seq_to_name = sorted( |
| unicode_data._emoji_sequence_data.items()) |
| for seq, name in comb_seq_to_name: |
| if seq not in seq_to_filepath: |
| # strip vs and try again |
| non_vs_seq = unicode_data.strip_emoji_vs(seq) |
| if non_vs_seq not in seq_to_filepath: |
| print(f'coverage: missing combining sequence {unicode_data.seq_to_string(seq)} ({name})') |
| coverage_pass = False |
| |
| # check for 'unknown flag' |
| # this is either emoji_ufe82b or 'unknown_flag', but we filter out things that |
| # don't start with our prefix so 'unknown_flag' would be excluded by default. |
    if (0xfe82b,) not in seq_to_filepath:
| print('coverage: missing unknown flag PUA fe82b') |
| coverage_pass = False |
| |
| if not coverage_pass: |
| exit("Please fix the problems mentioned above or run: make BYPASS_SEQUENCE_CHECK='True'") |
| |
| |
| def check_sequence_to_filepath(seq_to_filepath, unicode_version, coverage): |
| sorted_seq_to_filepath = collections.OrderedDict( |
| sorted(seq_to_filepath.items())) |
| _check_no_vs(sorted_seq_to_filepath) |
| _check_valid_emoji_cps(sorted_seq_to_filepath, unicode_version) |
| _check_zwj(sorted_seq_to_filepath) |
| _check_flags(sorted_seq_to_filepath) |
| _check_tags(sorted_seq_to_filepath) |
| _check_skintone(sorted_seq_to_filepath) |
| _check_zwj_sequences(sorted_seq_to_filepath, unicode_version) |
| _check_no_alias_sources(sorted_seq_to_filepath) |
| if coverage: |
| _check_coverage(sorted_seq_to_filepath, unicode_version) |
| |
| |
| def create_sequence_to_filepath(name_to_dirpath, prefix, suffix): |
| """Check names, and convert name to sequences for names that are ok, |
| returning a sequence to file path mapping. Reports bad segments |
| of a name to stderr.""" |
| segment_re = re.compile(r'^[0-9a-f]{4,6}$') |
| result = {} |
| for name, dirname in name_to_dirpath.items(): |
| if not name.startswith(prefix): |
            print(f'expected prefix "{prefix}" for "{name}"', file=sys.stderr)
| continue |
| |
| segments = name[len(prefix): -len(suffix)].split('_') |
| segfail = False |
| seq = [] |
| for s in segments: |
| if not segment_re.match(s): |
                print(f'bad codepoint name "{s}" in {dirname}/{name}', file=sys.stderr)
| segfail = True |
| continue |
| n = int(s, 16) |
| if n > 0x10ffff: |
                print(f'codepoint "{s}" out of range in {dirname}/{name}', file=sys.stderr)
| segfail = True |
| continue |
| seq.append(n) |
| if not segfail: |
| result[tuple(seq)] = path.join(dirname, name) |
| return result |
| |
| |
| def collect_name_to_dirpath(directory, prefix, suffix, exclude=None): |
| """Return a mapping from filename to path rooted at directory, ignoring files |
| that don't match suffix, and subtrees with names in exclude. Report when a |
| filename appears in more than one subdir; the first path found is kept.""" |
| result = {} |
| for dirname, dirs, files in os.walk(directory, topdown=True): |
| if exclude: |
| dirs[:] = [d for d in dirs if d not in exclude] |
| |
        # os.walk already yields dirnames rooted at 'directory', so paths in
        # subdirectories are preserved as-is.
| for f in files: |
| if not f.endswith(suffix): |
| continue |
| if f in result: |
| print('duplicate file "%s" in %s and %s ' % ( |
| f, dirname, result[f]), file=sys.stderr) |
| continue |
| result[f] = dirname |
| return result |
| |
| |
| def collect_name_to_dirpath_with_override(dirs, prefix, suffix, exclude=None): |
| """Return a mapping from filename to a directory path rooted at a directory |
| in dirs, using collect_name_to_filepath. The last directory is retained. This |
| does not report an error if a file appears under more than one root directory, |
| so lets later root directories override earlier ones. Use 'exclude' to |
| name subdirectories (of any root) whose subtree you wish to skip.""" |
| result = {} |
| for d in dirs: |
| result.update(collect_name_to_dirpath(d, prefix, suffix, exclude)) |
| return result |
| |
| |
| def run_check(dirs, names, prefix, suffix, exclude, unicode_version, coverage): |
| msg = '' |
| if unicode_version: |
| msg = ' (%3.1f)' % unicode_version |
| |
    if names and dirs:
        sys.exit("Please only provide a directory or a list of names")
    elif names:
        name_to_dirpath = {name: '' for name in names}
    elif dirs:
        print(f'Checking files with prefix "{prefix}" and suffix "{suffix}"{msg} in: {dirs}')
        name_to_dirpath = collect_name_to_dirpath_with_override(
            dirs, prefix=prefix, suffix=suffix, exclude=exclude)
    else:
        sys.exit("Please provide a directory (-d) or a list of names (-n)")
| |
| print(f'checking {len(name_to_dirpath)} names') |
| seq_to_filepath = create_sequence_to_filepath(name_to_dirpath, prefix, suffix) |
| print(f'checking {len(seq_to_filepath)} sequences') |
| check_sequence_to_filepath(seq_to_filepath, unicode_version, coverage) |
| print('Done running checks') |
| |
| |
| def main(): |
| parser = argparse.ArgumentParser() |
| parser.add_argument( |
| '-d', '--dirs', help='directory roots containing emoji images', |
| metavar='dir', nargs='+') |
    parser.add_argument(
        '-n', '--names', help='list of expected emoji file names',
        metavar='name', nargs='+')
| parser.add_argument( |
| '-e', '--exclude', help='names of source subdirs to exclude', |
| metavar='dir', nargs='+') |
| parser.add_argument( |
| '-c', '--coverage', help='test for complete coverage', |
| action='store_true') |
| parser.add_argument( |
| '-p', '--prefix', help='prefix to match, default "emoji_u"', |
| metavar='pfx', default='emoji_u') |
| parser.add_argument( |
| '-s', '--suffix', help='suffix to match, default ".png"', metavar='sfx', |
| default='.png') |
| parser.add_argument( |
| '-u', '--unicode_version', help='limit to this unicode version or before', |
| metavar='version', type=float) |
| args = parser.parse_args() |
| run_check( |
| args.dirs, args.names, args.prefix, args.suffix, args.exclude, args.unicode_version, |
| args.coverage) |
| |
| |
| if __name__ == '__main__': |
| main() |