| #!/usr/bin/env python3 |
| # -*- coding: utf-8 -*-# |
| # |
| # Copyright 2015 Google Inc. All rights reserved. |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| """Generate name data for emoji resources. Currently in json format.""" |
| from __future__ import print_function |
| |
| import argparse |
| import collections |
| import glob |
| import json |
| import os |
| from os import path |
| import re |
| import sys |
| |
| import generate_emoji_html |
| |
| from nototools import tool_utils |
| from nototools import unicode_data |
| |
| def _create_custom_gendered_seq_names(): |
| """The names have detail that is adequately represented by the image.""" |
| |
| BOY = 0x1f466 |
| GIRL = 0x1f467 |
| MAN = 0x1f468 |
| WOMAN = 0x1f469 |
| HEART = 0x2764 # Heavy Black Heart |
| KISS_MARK = 0x1f48b |
| return { |
| (MAN, HEART, KISS_MARK, MAN): 'Kiss', |
| (WOMAN, HEART, KISS_MARK, WOMAN): 'Kiss', |
| (WOMAN, HEART, KISS_MARK, MAN): 'Kiss', |
| (WOMAN, HEART, MAN): 'Couple with Heart', |
| (MAN, HEART, MAN): 'Couple with Heart', |
| (WOMAN, HEART, WOMAN): 'Couple with Heart', |
| (MAN, GIRL): 'Family', |
| (MAN, GIRL, GIRL): 'Family', |
| (MAN, GIRL, BOY): 'Family', |
| (MAN, BOY): 'Family', |
| (MAN, BOY, BOY): 'Family', |
| (MAN, WOMAN, GIRL): 'Family', |
| (MAN, WOMAN, GIRL, GIRL): 'Family', |
| (MAN, WOMAN, GIRL, BOY): 'Family', |
| (MAN, WOMAN, BOY): 'Family', |
| (MAN, WOMAN, BOY, BOY): 'Family', |
| (MAN, MAN, GIRL): 'Family', |
| (MAN, MAN, GIRL, GIRL): 'Family', |
| (MAN, MAN, GIRL, BOY): 'Family', |
| (MAN, MAN, BOY): 'Family', |
| (MAN, MAN, BOY, BOY): 'Family', |
| (WOMAN, GIRL): 'Family', |
| (WOMAN, GIRL, GIRL): 'Family', |
| (WOMAN, GIRL, BOY): 'Family', |
| (WOMAN, BOY): 'Family', |
| (WOMAN, BOY, BOY): 'Family', |
| (WOMAN, WOMAN, GIRL): 'Family', |
| (WOMAN, WOMAN, GIRL, GIRL): 'Family', |
| (WOMAN, WOMAN, GIRL, BOY): 'Family', |
| (WOMAN, WOMAN, BOY): 'Family', |
| (WOMAN, WOMAN, BOY, BOY): 'Family' } |
| |
| def _create_custom_seq_names(): |
| """These have names that often are of the form 'Person xyz-ing' or 'Man Xyz.' |
| We opt to simplify the former to an activity name or action, and the latter to |
| drop the gender. This also generally makes the names shorter.""" |
| |
| EYE = 0x1f441 |
| SPEECH = 0x1f5e8 |
| WHITE_FLAG = 0x1f3f3 |
| RAINBOW = 0x1f308 |
| return { |
| (EYE, SPEECH): 'I Witness', |
| (WHITE_FLAG, RAINBOW): 'Rainbow Flag', |
| (0x2695,): 'Health Worker', |
| (0x2696,): 'Judge', |
| (0x26f7,): 'Skiing', |
| (0x26f9,): 'Bouncing a Ball', |
| (0x2708,): 'Pilot', |
| (0x1f33e,): 'Farmer', |
| (0x1f373,): 'Cook', |
| (0x1f393,): 'Student', |
| (0x1f3a4,): 'Singer', |
| (0x1f3a8,): 'Artist', |
| (0x1f3c2,): 'Snowboarding', |
| (0x1f3c3,): 'Running', |
| (0x1f3c4,): 'Surfing', |
| (0x1f3ca,): 'Swimming', |
| (0x1f3cb,): 'Weight Lifting', |
| (0x1f3cc,): 'Golfing', |
| (0x1f3eb,): 'Teacher', |
| (0x1f3ed,): 'Factory Worker', |
| (0x1f46e,): 'Police Officer', |
| (0x1f46f,): 'Partying', |
| (0x1f471,): 'Person with Blond Hair', |
| (0x1f473,): 'Person Wearing Turban', |
| (0x1f477,): 'Construction Worker', |
| (0x1f481,): 'Tipping Hand', |
| (0x1f482,): 'Guard', |
| (0x1f486,): 'Face Massage', |
| (0x1f487,): 'Haircut', |
| (0x1f4bb,): 'Technologist', |
| (0x1f4bc,): 'Office Worker', |
| (0x1f527,): 'Mechanic', |
| (0x1f52c,): 'Scientist', |
| (0x1f575,): 'Detective', |
| (0x1f645,): 'No Good Gesture', |
| (0x1f646,): 'OK Gesture', |
| (0x1f647,): 'Bowing Deeply', |
| (0x1f64b,): 'Raising Hand', |
| (0x1f64d,): 'Frowning', |
| (0x1f64e,): 'Pouting', |
| (0x1f680,): 'Astronaut', |
| (0x1f692,): 'Firefighter', |
| (0x1f6a3,): 'Rowing', |
| (0x1f6b4,): 'Bicycling', |
| (0x1f6b5,): 'Mountain Biking', |
| (0x1f6b6,): 'Walking', |
| (0x1f926,): 'Face Palm', |
| (0x1f937,): 'Shrug', |
| (0x1f938,): 'Doing a Cartwheel', |
| (0x1f939,): 'Juggling', |
| (0x1f93c,): 'Wrestling', |
| (0x1f93d,): 'Water Polo', |
| (0x1f93e,): 'Playing Handball', |
| (0x1f9d6,): 'Person in Steamy Room', |
| (0x1f9d7,): 'Climbing', |
| (0x1f9d8,): 'Person in Lotus Position', |
| (0x1f9d9,): 'Mage', |
| (0x1f9da,): 'Fairy', |
| (0x1f9db,): 'Vampire', |
| (0x1f9dd,): 'Elf', |
| (0x1f9de,): 'Genie', |
| (0x1f9df,): 'Zombie', |
| } |
| |
# Name lookup tables, built once at import time and read by _custom_name.
# Both map a tuple of code points to a display name.
_CUSTOM_GENDERED_SEQ_NAMES = _create_custom_gendered_seq_names()
_CUSTOM_SEQ_NAMES = _create_custom_seq_names()
| |
# Fixes for unusual capitalization or cases we don't care to handle in code.
# Also prevents titlecasing 'S' after apostrophe in possessives. Note we _do_
# want titlecasing after apostrophe in some cases, e.g. O'Clock.
#
# Keys are single-element tuples of code points; _custom_name checks this
# table first.  The trailing comments appear to show what the algorithmic
# title-casing would otherwise have produced.
_CUSTOM_CAPS_NAMES = {
    (0x26d1,): 'Rescue Worker’s Helmet',
    (0x1f170,): 'A Button (blood type)', # a Button (Blood Type)
    (0x1f171,): 'B Button (blood type)', # B Button (Blood Type)
    (0x1f17e,): 'O Button (blood type)', # O Button (Blood Type)
    (0x1f18e,): 'AB Button (blood type)', # Ab Button (Blood Type)
    (0x1f191,): 'CL Button', # Cl Button
    (0x1f192,): 'COOL Button', # Cool Button
    (0x1f193,): 'FREE Button', # Free Button
    (0x1f194,): 'ID Button', # Id Button
    (0x1f195,): 'NEW Button', # New Button
    (0x1f196,): 'NG Button', # Ng Button
    (0x1f197,): 'OK Button', # Ok Button
    (0x1f198,): 'SOS Button', # Sos Button
    (0x1f199,): 'UP! Button', # Up! Button
    (0x1f19a,): 'VS Button', # Vs Button
    (0x1f3e7,): 'ATM Sign', # Atm Sign
    (0x1f44C,): 'OK Hand', # Ok Hand
    (0x1f452,): 'Woman’s Hat',
    (0x1f45a,): 'Woman’s Clothes',
    (0x1f45e,): 'Man’s Shoe',
    (0x1f461,): 'Woman’s Sandal',
    (0x1f462,): 'Woman’s Boot',
    (0x1f519,): 'BACK Arrow', # Back Arrow
    (0x1f51a,): 'END Arrow', # End Arrow
    (0x1f51b,): 'ON! Arrow', # On! Arrow
    (0x1f51c,): 'SOON Arrow', # Soon Arrow
    (0x1f51d,): 'TOP Arrow', # Top Arrow
    (0x1f6b9,): 'Men’s Room',
    (0x1f6ba,): 'Women’s Room',
}
| |
# For the custom sequences we ignore ZWJ, the emoji variation selector
# and skin tone modifiers.  We can't always ignore gender because
# the gendered sequences match against them, but we ignore gender in other
# cases so we define a separate set of gendered emoji to remove.

# Code points removed before every custom-name lookup: the emoji variation
# selector (U+FE0F), ZWJ (U+200D) and the skin tone modifier range.
# range() is wrapped in list() because on Python 3 a range object cannot be
# concatenated to a list with '+' (it raises TypeError).
_NON_GENDER_CPS_TO_STRIP = frozenset(
    [0xfe0f, 0x200d] +
    list(range(unicode_data._FITZ_START, unicode_data._FITZ_END + 1)))

# Gender-carrying code points removed only for the non-gendered lookup:
# female sign, male sign, man, woman.
_GENDER_CPS_TO_STRIP = frozenset([0x2640, 0x2642, 0x1f468, 0x1f469])
| |
def _custom_name(seq):
    """Return a custom name for seq, or None when no custom name applies.

    Three tables are tried in order: the capitalization fixes, the gendered
    couple/family names, and finally the gender-neutral activity names.
    """
    stripped = tuple(
        cp for cp in seq if cp not in _NON_GENDER_CPS_TO_STRIP)

    caps_name = _CUSTOM_CAPS_NAMES.get(stripped)
    if caps_name:
        return caps_name

    # Single characters that participate in sequences (e.g. fire truck in
    # the firefighter sequences) should not get converted.  Single
    # characters appear in the custom caps table but must not match the
    # other tables.
    if len(stripped) == 1:
        return None

    gendered_name = _CUSTOM_GENDERED_SEQ_NAMES.get(stripped)
    if gendered_name:
        return gendered_name

    # Gender is irrelevant for the remaining table, so drop it too.
    ungendered = tuple(
        cp for cp in stripped if cp not in _GENDER_CPS_TO_STRIP)
    return _CUSTOM_SEQ_NAMES.get(ungendered)
| |
| |
def _standard_name(seq):
    """Use the standard emoji name, with some algorithmic modifications.

    We want to ignore skin-tone modifiers (but of course if the sequence _is_
    the skin-tone modifier itself we keep that).  So we strip these so we can
    start with the generic name ignoring skin tone.

    Non-emoji that are turned into emoji using the emoji VS have '(emoji) '
    prepended to them, so strip that.

    Regional indicator symbol names are a bit long, so shorten them.

    Regional sequences are assumed to be ok as-is in terms of capitalization
    and punctuation, so no modifications are applied to them.

    After title-casing we make some English articles/prepositions lower-case
    again.  We also replace '&' with 'and'; Unicode seems rather fond of
    ampersand.

    Args:
      seq: a non-empty sequence of code points (seq[0] is inspected).

    Returns:
      The display name as a string.
    """

    if not unicode_data.is_skintone_modifier(seq[0]):
        seq = tuple(
            [cp for cp in seq if not unicode_data.is_skintone_modifier(cp)])
    name = unicode_data.get_emoji_sequence_name(seq)

    emoji_prefix = '(emoji) '
    if name.startswith(emoji_prefix):
        name = name[len(emoji_prefix):]

    if len(seq) == 1 and unicode_data.is_regional_indicator(seq[0]):
        return (
            'Regional Symbol ' +
            unicode_data.regional_indicator_to_ascii(seq[0]))

    if (unicode_data.is_regional_indicator_seq(seq) or
            unicode_data.is_regional_tag_seq(seq)):
        return name

    name = name.title()
    # Require space delimiting just in case...
    name = re.sub(r'\s&\s', ' and ', name)
    name = re.sub(
        # Not \b at start because we retain capital at start of phrase.
        # The inner group is non-capturing, '(?:...)'; the previous '(:?...)'
        # was a typo that matched an optional literal colon before 'A'.
        r'(\s(?:A|And|From|In|Of|With|For))\b',
        lambda m: m.group(1).lower(),
        name)

    return name
| |
| |
def _name_data(seq, seq_file):
    """Return the (file name, character references, display name) for seq.

    The custom name is preferred; the standard name is the fallback.  The
    sequence is rendered as hexadecimal HTML numeric character references.
    """
    display_name = _custom_name(seq)
    if not display_name:
        display_name = _standard_name(seq)

    # We don't need canonical sequences, so the emoji variation selector
    # is left out of the rendered form.
    char_refs = ['&#x%x;' % cp for cp in seq if cp != 0xfe0f]

    return path.basename(seq_file), ''.join(char_refs), display_name
| |
| |
def generate_names(
        src_dir, dst_dir, skip_limit=20, omit_groups=None, pretty_print=False,
        verbose=False):
    """Write 'data.json', the per-group emoji name data, into dst_dir.

    Args:
      src_dir: directory containing the emoji images.
      dst_dir: destination directory; it is created if necessary.
      skip_limit: how many emoji without an image are tolerated before an
        exception is raised.  A negative value disables the limit.
      omit_groups: names of emoji groups to leave out; None or empty keeps
        all groups.
      pretty_print: if true, indent the json output; otherwise write it
        in compact form.
      verbose: if true, print each emoji that was skipped for lack of an
        image.

    Returns:
      None.  Returns early, after printing to stderr, if src_dir is not a
      directory or if omit_groups contains an unrecognized group.

    Raises:
      Exception: if more than skip_limit emoji have no image.
    """
    srcdir = tool_utils.resolve_path(src_dir)
    if not path.isdir(srcdir):
        print('%s is not a directory' % src_dir, file=sys.stderr)
        return

    if not omit_groups:
        # might be None
        print('keeping all groups')
        omit_groups = []
    else:
        unknown_groups = (
            set(omit_groups) - set(unicode_data.get_emoji_groups()))
        if unknown_groups:
            unknown_plural = '' if len(unknown_groups) == 1 else 's'
            # Report in the order the caller gave them.
            unknown_quoted = ', '.join(
                '"%s"' % g for g in omit_groups if g in unknown_groups)
            print(
                'did not recognize %d group%s: %s' % (
                    len(unknown_groups), unknown_plural, unknown_quoted),
                file=sys.stderr)
            valid_listing = '\n '.join(
                g for g in unicode_data.get_emoji_groups())
            print('valid groups are:\n %s' % valid_listing, file=sys.stderr)
            return
        omit_plural = '' if len(omit_groups) == 1 else 's'
        omit_quoted = ', '.join('"%s"' % g for g in omit_groups)
        print('omitting %d group%s: %s' % (
            len(omit_groups), omit_plural, omit_quoted))

    # make sure the destination exists
    resolved_dst = tool_utils.resolve_path(dst_dir)
    dstdir = tool_utils.ensure_dir_exists(resolved_dst)

    # _get_image_data returns canonical cp sequences
    print('src dir:', srcdir)
    seq_to_file = generate_emoji_html._get_image_data(
        srcdir, 'png', 'emoji_u')
    print('seq to file has %d sequences' % len(seq_to_file))

    # Aliases add non-gendered versions using gendered images for the most
    # part.  But when we display the images, we don't distinguish genders in
    # the naming, we rely on the images-- so these look redundant.  So we
    # intentionally don't generate images for these.
    # However, the alias file also includes the flag aliases, which we do
    # want, and it also fails to exclude the unknown flag pua (since it
    # doesn't map to anything), so we need to adjust for this.
    canonical_aliases = generate_emoji_html._get_canonical_aliases()

    non_flag_aliases = set()
    for alias_seq in canonical_aliases.keys():
        if not unicode_data.is_regional_indicator_seq(alias_seq):
            non_flag_aliases.add(alias_seq)
    non_flag_aliases.add((0xfe82b,))  # unknown flag PUA
    excluded = (
        non_flag_aliases | generate_emoji_html._get_canonical_excluded())

    # The flag aliases have distinct names, so we _do_ want to show them
    # multiple times.  Collect first, then update, so the mapping is not
    # modified while it is being consulted.
    flag_alias_files = {}
    for alias_seq in canonical_aliases:
        if not unicode_data.is_regional_indicator_seq(alias_seq):
            continue
        target_seq = canonical_aliases[alias_seq]
        if alias_seq in seq_to_file:
            print('warning, alias %s has file %s' % (
                unicode_data.regional_indicator_seq_to_string(alias_seq),
                seq_to_file[alias_seq]))
            continue
        target_file = seq_to_file.get(target_seq)
        if target_file:
            flag_alias_files[alias_seq] = target_file
    seq_to_file.update(flag_alias_files)

    data = []
    skipcount = 0
    last_skipped_group = None
    for group in unicode_data.get_emoji_groups():
        if group in omit_groups:
            continue
        group_entries = []
        for seq in unicode_data.get_emoji_in_group(group):
            if seq in excluded:
                continue
            seq_file = seq_to_file.get(seq)
            if seq_file is not None:
                group_entries.append(_name_data(seq, seq_file))
                continue

            # No image for this sequence: count it, and optionally report.
            skipcount += 1
            if verbose:
                if group != last_skipped_group:
                    # Print the group heading once per run of skips.
                    print('group %s' % group)
                    last_skipped_group = group
                seq_text = unicode_data.seq_to_string(seq)
                cp_names = ', '.join(
                    unicode_data.name(cp, 'x') for cp in seq)
                print(' %s (%s)' % (seq_text, cp_names))
            if skip_limit >= 0 and skipcount > skip_limit:
                raise Exception('skipped too many items')
        data.append({'category': group, 'emojis': group_entries})

    outfile = path.join(dstdir, 'data.json')
    if pretty_print:
        indent, separators = 2, None
    else:
        indent, separators = None, (',', ':')
    with open(outfile, 'w') as f:
        json.dump(data, f, indent=indent, separators=separators)
    print('wrote %s' % outfile)
| |
| |
def main():
    """Parse the command line and generate the emoji name data file."""
    DEFAULT_DSTDIR = '[emoji]/emoji'
    DEFAULT_IMAGEDIR = '[emoji]/build/compressed_pngs'

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-s', '--srcdir', help='directory containing images (default %s)' %
        DEFAULT_IMAGEDIR, metavar='dir', default=DEFAULT_IMAGEDIR)
    parser.add_argument(
        '-d', '--dstdir', help='name of destination directory (default %s)' %
        DEFAULT_DSTDIR, metavar='fname', default=DEFAULT_DSTDIR)
    parser.add_argument(
        '-p', '--pretty_print', help='pretty-print json file',
        action='store_true')
    # type=int is required: without it a value given on the command line
    # arrives as a string, and generate_names compares it numerically,
    # which raises TypeError on Python 3.
    parser.add_argument(
        '-m', '--missing_limit', help='number of missing images before failure '
        '(default 20), use -1 for no limit', metavar='n', default=20,
        type=int)
    parser.add_argument(
        '--omit_groups', help='names of groups to omit (default "Misc, Flags")',
        metavar='name', default=['Misc', 'Flags'], nargs='*')
    parser.add_argument(
        '-v', '--verbose', help='print progress information to stdout',
        action='store_true')
    args = parser.parse_args()
    generate_names(
        args.srcdir, args.dstdir, args.missing_limit, args.omit_groups,
        pretty_print=args.pretty_print, verbose=args.verbose)


if __name__ == "__main__":
    main()