generate_emoji_name_data.py - third_party/github.com/googlefonts/noto-emoji - Git at Google

 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-#
 #
 # Copyright 2015 Google Inc. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 """Generate name data for emoji resources. Currently in json format."""
 from __future__ import print_function

 import argparse
 import collections
 import glob
 import json
 import os
 from os import path
 import re
 import sys

 import generate_emoji_html

 from nototools import tool_utils
 from nototools import unicode_data

 def _create_custom_gendered_seq_names():
   """The names have detail that is adequately represented by the image."""

   BOY = 0x1f466
   GIRL = 0x1f467
   MAN = 0x1f468
   WOMAN = 0x1f469
   HEART = 0x2764  # Heavy Black Heart
   KISS_MARK = 0x1f48b
   return {
       (MAN, HEART, KISS_MARK, MAN): 'Kiss',
       (WOMAN, HEART, KISS_MARK, WOMAN): 'Kiss',
       (WOMAN, HEART, KISS_MARK, MAN): 'Kiss',
       (WOMAN, HEART, MAN): 'Couple with Heart',
       (MAN, HEART, MAN): 'Couple with Heart',
       (WOMAN, HEART, WOMAN): 'Couple with Heart',
       (MAN, GIRL): 'Family',
       (MAN, GIRL, GIRL): 'Family',
       (MAN, GIRL, BOY): 'Family',
       (MAN, BOY): 'Family',
       (MAN, BOY, BOY): 'Family',
       (MAN, WOMAN, GIRL): 'Family',
       (MAN, WOMAN, GIRL, GIRL): 'Family',
       (MAN, WOMAN, GIRL, BOY): 'Family',
       (MAN, WOMAN, BOY): 'Family',
       (MAN, WOMAN, BOY, BOY): 'Family',
       (MAN, MAN, GIRL): 'Family',
       (MAN, MAN, GIRL, GIRL): 'Family',
       (MAN, MAN, GIRL, BOY): 'Family',
       (MAN, MAN, BOY): 'Family',
       (MAN, MAN, BOY, BOY): 'Family',
       (WOMAN, GIRL): 'Family',
       (WOMAN, GIRL, GIRL): 'Family',
       (WOMAN, GIRL, BOY): 'Family',
       (WOMAN, BOY): 'Family',
       (WOMAN, BOY, BOY): 'Family',
       (WOMAN, WOMAN, GIRL): 'Family',
       (WOMAN, WOMAN, GIRL, GIRL): 'Family',
       (WOMAN, WOMAN, GIRL, BOY): 'Family',
       (WOMAN, WOMAN, BOY): 'Family',
       (WOMAN, WOMAN, BOY, BOY): 'Family' }

 def _create_custom_seq_names():
   """These have names that often are of the form 'Person xyz-ing' or 'Man Xyz.'
   We opt to simplify the former to an activity name or action, and the latter to
   drop the gender.  This also generally makes the names shorter."""

   EYE = 0x1f441
   SPEECH = 0x1f5e8
   WHITE_FLAG = 0x1f3f3
   RAINBOW = 0x1f308
   return {
       (EYE, SPEECH): 'I Witness',
       (WHITE_FLAG, RAINBOW): 'Rainbow Flag',
       (0x2695,): 'Health Worker',
       (0x2696,): 'Judge',
       (0x26f7,): 'Skiing',
       (0x26f9,): 'Bouncing a Ball',
       (0x2708,): 'Pilot',
       (0x1f33e,): 'Farmer',
       (0x1f373,): 'Cook',
       (0x1f393,): 'Student',
       (0x1f3a4,): 'Singer',
       (0x1f3a8,): 'Artist',
       (0x1f3c2,): 'Snowboarding',
       (0x1f3c3,): 'Running',
       (0x1f3c4,): 'Surfing',
       (0x1f3ca,): 'Swimming',
       (0x1f3cb,): 'Weight Lifting',
       (0x1f3cc,): 'Golfing',
       (0x1f3eb,): 'Teacher',
       (0x1f3ed,): 'Factory Worker',
       (0x1f46e,): 'Police Officer',
       (0x1f46f,): 'Partying',
       (0x1f471,): 'Person with Blond Hair',
       (0x1f473,): 'Person Wearing Turban',
       (0x1f477,): 'Construction Worker',
       (0x1f481,): 'Tipping Hand',
       (0x1f482,): 'Guard',
       (0x1f486,): 'Face Massage',
       (0x1f487,): 'Haircut',
       (0x1f4bb,): 'Technologist',
       (0x1f4bc,): 'Office Worker',
       (0x1f527,): 'Mechanic',
       (0x1f52c,): 'Scientist',
       (0x1f575,): 'Detective',
       (0x1f645,): 'No Good Gesture',
       (0x1f646,): 'OK Gesture',
       (0x1f647,): 'Bowing Deeply',
       (0x1f64b,): 'Raising Hand',
       (0x1f64d,): 'Frowning',
       (0x1f64e,): 'Pouting',
       (0x1f680,): 'Astronaut',
       (0x1f692,): 'Firefighter',
       (0x1f6a3,): 'Rowing',
       (0x1f6b4,): 'Bicycling',
       (0x1f6b5,): 'Mountain Biking',
       (0x1f6b6,): 'Walking',
       (0x1f926,): 'Face Palm',
       (0x1f937,): 'Shrug',
       (0x1f938,): 'Doing a Cartwheel',
       (0x1f939,): 'Juggling',
       (0x1f93c,): 'Wrestling',
       (0x1f93d,): 'Water Polo',
       (0x1f93e,): 'Playing Handball',
       (0x1f9d6,): 'Person in Steamy Room',
       (0x1f9d7,): 'Climbing',
       (0x1f9d8,): 'Person in Lotus Position',
       (0x1f9d9,): 'Mage',
       (0x1f9da,): 'Fairy',
       (0x1f9db,): 'Vampire',
       (0x1f9dd,): 'Elf',
       (0x1f9de,): 'Genie',
       (0x1f9df,): 'Zombie',
   }

 # Build both lookup tables once, at import time; _custom_name() reads them.
 _CUSTOM_GENDERED_SEQ_NAMES = _create_custom_gendered_seq_names()
 _CUSTOM_SEQ_NAMES = _create_custom_seq_names()

 # Fixes for unusual capitalization or cases we don't care to handle in code.
 # Also prevents titlecasing 'S' after apostrophe in possessives.  Note we _do_
 # want titlecasing after apostrophe in some cases, e.g. O'Clock.
 # Every key is a one-element tuple; _custom_name() consults this table first.
 _CUSTOM_CAPS_NAMES = {
     (0x26d1,): 'Rescue Worker’s Helmet',
     (0x1f170,): 'A Button (blood type)',  # a Button (Blood Type)
     (0x1f171,): 'B Button (blood type)',  # B Button (Blood Type)
     (0x1f17e,): 'O Button (blood type)',  # O Button (Blood Type)
     (0x1f18e,): 'AB Button (blood type)',  # Ab Button (Blood Type)
     (0x1f191,): 'CL Button',  # Cl Button
     (0x1f192,): 'COOL Button',  # Cool Button
     (0x1f193,): 'FREE Button',  # Free Button
     (0x1f194,): 'ID Button',  # Id Button
     (0x1f195,): 'NEW Button',  # New Button
     (0x1f196,): 'NG Button',  # Ng Button
     (0x1f197,): 'OK Button',  # Ok Button
     (0x1f198,): 'SOS Button',  # Sos Button
     (0x1f199,): 'UP! Button',  # Up! Button
     (0x1f19a,): 'VS Button',  # Vs Button
     (0x1f3e7,): 'ATM Sign',  # Atm Sign
     (0x1f44C,): 'OK Hand',  # Ok Hand
     (0x1f452,): 'Woman’s Hat',
     (0x1f45a,): 'Woman’s Clothes',
     (0x1f45e,): 'Man’s Shoe',
     (0x1f461,): 'Woman’s Sandal',
     (0x1f462,): 'Woman’s Boot',
     (0x1f519,): 'BACK Arrow',  # Back Arrow
     (0x1f51a,): 'END Arrow',  # End Arrow
     (0x1f51b,): 'ON! Arrow',  # On! Arrow
     (0x1f51c,): 'SOON Arrow',  # Soon Arrow
     (0x1f51d,): 'TOP Arrow',  # Top Arrow
     (0x1f6b9,): 'Men’s Room',
     (0x1f6ba,): 'Women’s Room',
 }

 # For the custom sequences we ignore ZWJ, the emoji variation selector
 # and skin tone modifiers.  We can't always ignore gender because
 # the gendered sequences match against them, but we ignore gender in other
 # cases so we define a separate set of gendered emoji to remove.

 # On Python 3 range() is not a list and cannot be concatenated to one with
 # '+', so it is materialized with list() first (a no-op copy on Python 2).
 _NON_GENDER_CPS_TO_STRIP = frozenset(
     [0xfe0f, 0x200d] +
     list(range(unicode_data._FITZ_START, unicode_data._FITZ_END + 1)))

 # Female sign, male sign, man and woman.
 _GENDER_CPS_TO_STRIP = frozenset([0x2640, 0x2642, 0x1f468, 0x1f469])

 def _custom_name(seq):
   """Return the custom name for seq, or None when no custom name applies.

   Three tables are consulted in turn: the custom capitalization names, the
   gendered sequence names, and finally the gender-neutral sequence names.
   """
   stripped = tuple(cp for cp in seq if cp not in _NON_GENDER_CPS_TO_STRIP)

   caps_name = _CUSTOM_CAPS_NAMES.get(stripped)
   if caps_name:
     return caps_name

   # A single character that also participates in sequences (e.g. fire truck
   # in the firefighter sequences) must keep its own name.  Single characters
   # only get custom names from the caps table checked above.
   if len(stripped) == 1:
     return None

   gendered_name = _CUSTOM_GENDERED_SEQ_NAMES.get(stripped)
   if gendered_name:
     return gendered_name

   # Nothing matched with gender present, so retry without the gender cps.
   neutral = tuple(cp for cp in stripped if cp not in _GENDER_CPS_TO_STRIP)
   return _CUSTOM_SEQ_NAMES.get(neutral)


 def _standard_name(seq):
   """Use the standard emoji name, with some algorithmic modifications.

   We want to ignore skin-tone modifiers (but of course if the sequence _is_
   the skin-tone modifier itself we keep that).  So we strip these so we can
   start with the generic name ignoring skin tone.

   Non-emoji that are turned into emoji using the emoji VS have '(emoji) '
   prepended to them, so strip that.

   Regional indicator symbol names are a bit long, so shorten them.

   Regional sequences are assumed to be ok as-is in terms of capitalization and
   punctuation, so no modifications are applied to them.

   After title-casing we make some English articles/prepositions lower-case
   again.  We also replace '&' with 'and'; Unicode seems rather fond of
   ampersand.

   Args:
     seq: non-empty tuple of code points.

   Returns:
     The display name as a string.
   """

   if not unicode_data.is_skintone_modifier(seq[0]):
     seq = tuple([cp for cp in seq if not unicode_data.is_skintone_modifier(cp)])
   name = unicode_data.get_emoji_sequence_name(seq)

   emoji_prefix = '(emoji) '
   if name.startswith(emoji_prefix):
     name = name[len(emoji_prefix):]

   if len(seq) == 1 and unicode_data.is_regional_indicator(seq[0]):
     return 'Regional Symbol ' + unicode_data.regional_indicator_to_ascii(seq[0])

   if (unicode_data.is_regional_indicator_seq(seq) or
       unicode_data.is_regional_tag_seq(seq)):
     return name

   name = name.title()
   # Require space delimiting just in case...
   name = re.sub(r'\s&\s', ' and ', name)
   name = re.sub(
       # not \b at start because we retain capital at start of phrase.
       # The inner group is non-capturing, '(?:...)'; it used to be spelled
       # '(:?...)', which is a capturing group with an optional colon.
       r'(\s(?:A|And|From|In|Of|With|For))\b', lambda m: m.group(1).lower(),
       name)

   return name


 def _name_data(seq, seq_file):
   """Return (image file basename, character references, name) for seq.

   The name is the custom name when there is one, else the standard name.  The
   second element spells seq as hexadecimal HTML character references.
   """
   display_name = _custom_name(seq)
   if not display_name:
     display_name = _standard_name(seq)

   # we don't need canonical sequences, so the emoji variation selector is
   # left out.
   char_refs = ['&#x%x;' % cp for cp in seq if cp != 0xfe0f]
   encoded = ''.join(char_refs)

   return path.basename(seq_file), encoded, display_name


 def generate_names(
     src_dir, dst_dir, skip_limit=20, omit_groups=None, pretty_print=False,
     verbose=False):
   """Write name data for the emoji images in src_dir to dst_dir/data.json.

   The output is a list with one {'category': group, 'emojis': [...]} entry per
   emoji group that is kept; each emoji is the tuple returned by _name_data.

   Args:
     src_dir: directory containing the images, resolved via tool_utils.
     dst_dir: directory that receives data.json.
     skip_limit: how many emoji without an image are tolerated.  Once the count
       exceeds it an Exception is raised; a negative value disables the check.
     omit_groups: names of the emoji groups to leave out; None or empty keeps
       all groups.
     pretty_print: if true indent the json, otherwise write it compactly.
     verbose: if true print every emoji that has no image.

   Returns None.  Returns early, after printing to stderr, when src_dir is not
   a directory or omit_groups names a group that is not recognized.
   """
   srcdir = tool_utils.resolve_path(src_dir)
   if not path.isdir(srcdir):
     print('%s is not a directory' % src_dir, file=sys.stderr)
     return

   def plural(count):
     return '' if count == 1 else 's'

   if not omit_groups:
     # might be None
     print('keeping all groups')
     omit_groups = []
   else:
     unknown_groups = set(omit_groups) - set(unicode_data.get_emoji_groups())
     if unknown_groups:
       # Report the unknown names in the order the caller gave them.
       unknown_names = ', '.join(
           '"%s"' % g for g in omit_groups if g in unknown_groups)
       print('did not recognize %d group%s: %s' % (
           len(unknown_groups), plural(len(unknown_groups)), unknown_names),
           file=sys.stderr)
       print('valid groups are:\n  %s' % (
           '\n  '.join(g for g in unicode_data.get_emoji_groups())),
           file=sys.stderr)
       return
     print('omitting %d group%s: %s' % (
         len(omit_groups), plural(len(omit_groups)),
         ', '.join('"%s"' % g for g in omit_groups)))

   # make sure the destination exists
   dstdir = tool_utils.ensure_dir_exists(tool_utils.resolve_path(dst_dir))

   # _get_image_data returns canonical cp sequences
   print('src dir:', srcdir)
   seq_to_file = generate_emoji_html._get_image_data(srcdir, 'png', 'emoji_u')
   print('seq to file has %d sequences' % len(seq_to_file))

   # Aliases add non-gendered versions using gendered images for the most part.
   # But when we display the images, we don't distinguish genders in the
   # naming, we rely on the images-- so these look redundant. So we
   # intentionally don't generate images for these.
   # However, the alias file also includes the flag aliases, which we do want,
   # and it also fails to exclude the unknown flag pua (since it doesn't
   # map to anything), so we need to adjust for this.
   canonical_aliases = generate_emoji_html._get_canonical_aliases()

   aliases = {
       cps for cps in canonical_aliases
       if not unicode_data.is_regional_indicator_seq(cps)}
   aliases.add((0xfe82b,))  # unknown flag PUA
   excluded = aliases | generate_emoji_html._get_canonical_excluded()

   # The flag aliases have distinct names, so we _do_ want to show them
   # multiple times, each one reusing the image of the flag it aliases.
   flag_alias_files = {}
   for seq, replace_seq in canonical_aliases.items():
     if not unicode_data.is_regional_indicator_seq(seq):
       continue
     if seq in seq_to_file:
       print('warning, alias %s has file %s' % (
           unicode_data.regional_indicator_seq_to_string(seq),
           seq_to_file[seq]))
       continue
     replace_file = seq_to_file.get(replace_seq)
     if replace_file:
       flag_alias_files[seq] = replace_file
   seq_to_file.update(flag_alias_files)

   data = []
   last_skipped_group = None
   skipcount = 0
   for group in unicode_data.get_emoji_groups():
     if group in omit_groups:
       continue
     name_data = []
     for seq in unicode_data.get_emoji_in_group(group):
       if seq in excluded:
         continue
       seq_file = seq_to_file.get(seq, None)
       if seq_file is not None:
         name_data.append(_name_data(seq, seq_file))
         continue

       # No image for this sequence: count it, and report it when verbose.
       skipcount += 1
       if verbose:
         if group != last_skipped_group:
           print('group %s' % group)
           last_skipped_group = group
         print('  %s (%s)' % (
             unicode_data.seq_to_string(seq),
             ', '.join(unicode_data.name(cp, 'x') for cp in seq)))
       if skip_limit >= 0 and skipcount > skip_limit:
         raise Exception('skipped too many items')
     data.append({'category': group, 'emojis': name_data})

   if pretty_print:
     dump_args = {'indent': 2, 'separators': None}
   else:
     dump_args = {'indent': None, 'separators': (',', ':')}
   outfile = path.join(dstdir, 'data.json')
   with open(outfile, 'w') as f:
     json.dump(data, f, **dump_args)
   print('wrote %s' % outfile)


 def main():
   """Parse the command line and generate the emoji name data file."""
   DEFAULT_DSTDIR = '[emoji]/emoji'
   DEFAULT_IMAGEDIR = '[emoji]/build/compressed_pngs'

   parser = argparse.ArgumentParser()
   parser.add_argument(
       '-s', '--srcdir', help='directory containing images (default %s)' %
       DEFAULT_IMAGEDIR,  metavar='dir', default=DEFAULT_IMAGEDIR)
   parser.add_argument(
       '-d', '--dstdir', help='name of destination directory (default %s)' %
       DEFAULT_DSTDIR, metavar='fname', default=DEFAULT_DSTDIR)
   parser.add_argument(
       '-p', '--pretty_print', help='pretty-print json file',
       action='store_true')
   # type=int is required: without it a limit given on the command line stays
   # a string, and generate_names compares the limit with integers.
   parser.add_argument(
       '-m', '--missing_limit', help='number of missing images before failure '
       '(default 20), use -1 for no limit', metavar='n', default=20, type=int)
   parser.add_argument(
       '--omit_groups', help='names of groups to omit (default "Misc, Flags")',
       metavar='name', default=['Misc', 'Flags'], nargs='*')
   parser.add_argument(
       '-v', '--verbose', help='print progress information to stdout',
       action='store_true')
   args = parser.parse_args()
   generate_names(
       args.srcdir, args.dstdir, args.missing_limit, args.omit_groups,
       pretty_print=args.pretty_print, verbose=args.verbose)


 # Run the command-line entry point only when executed as a script.
 if __name__ == "__main__":
     main()
	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-#
	#
	# Copyright 2015 Google Inc. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""Generate name data for emoji resources. Currently in json format."""
	from __future__ import print_function

	import argparse
	import collections
	import glob
	import json
	import os
	from os import path
	import re
	import sys

	import generate_emoji_html

	from nototools import tool_utils
	from nototools import unicode_data

	def _create_custom_gendered_seq_names():
	"""The names have detail that is adequately represented by the image."""

	BOY = 0x1f466
	GIRL = 0x1f467
	MAN = 0x1f468
	WOMAN = 0x1f469
	HEART = 0x2764 # Heavy Black Heart
	KISS_MARK = 0x1f48b
	return {
	(MAN, HEART, KISS_MARK, MAN): 'Kiss',
	(WOMAN, HEART, KISS_MARK, WOMAN): 'Kiss',
	(WOMAN, HEART, KISS_MARK, MAN): 'Kiss',
	(WOMAN, HEART, MAN): 'Couple with Heart',
	(MAN, HEART, MAN): 'Couple with Heart',
	(WOMAN, HEART, WOMAN): 'Couple with Heart',
	(MAN, GIRL): 'Family',
	(MAN, GIRL, GIRL): 'Family',
	(MAN, GIRL, BOY): 'Family',
	(MAN, BOY): 'Family',
	(MAN, BOY, BOY): 'Family',
	(MAN, WOMAN, GIRL): 'Family',
	(MAN, WOMAN, GIRL, GIRL): 'Family',
	(MAN, WOMAN, GIRL, BOY): 'Family',
	(MAN, WOMAN, BOY): 'Family',
	(MAN, WOMAN, BOY, BOY): 'Family',
	(MAN, MAN, GIRL): 'Family',
	(MAN, MAN, GIRL, GIRL): 'Family',
	(MAN, MAN, GIRL, BOY): 'Family',
	(MAN, MAN, BOY): 'Family',
	(MAN, MAN, BOY, BOY): 'Family',
	(WOMAN, GIRL): 'Family',
	(WOMAN, GIRL, GIRL): 'Family',
	(WOMAN, GIRL, BOY): 'Family',
	(WOMAN, BOY): 'Family',
	(WOMAN, BOY, BOY): 'Family',
	(WOMAN, WOMAN, GIRL): 'Family',
	(WOMAN, WOMAN, GIRL, GIRL): 'Family',
	(WOMAN, WOMAN, GIRL, BOY): 'Family',
	(WOMAN, WOMAN, BOY): 'Family',
	(WOMAN, WOMAN, BOY, BOY): 'Family' }

	def _create_custom_seq_names():
	"""These have names that often are of the form 'Person xyz-ing' or 'Man Xyz.'
	We opt to simplify the former to an activity name or action, and the latter to
	drop the gender. This also generally makes the names shorter."""

	EYE = 0x1f441
	SPEECH = 0x1f5e8
	WHITE_FLAG = 0x1f3f3
	RAINBOW = 0x1f308
	return {
	(EYE, SPEECH): 'I Witness',
	(WHITE_FLAG, RAINBOW): 'Rainbow Flag',
	(0x2695,): 'Health Worker',
	(0x2696,): 'Judge',
	(0x26f7,): 'Skiing',
	(0x26f9,): 'Bouncing a Ball',
	(0x2708,): 'Pilot',
	(0x1f33e,): 'Farmer',
	(0x1f373,): 'Cook',
	(0x1f393,): 'Student',
	(0x1f3a4,): 'Singer',
	(0x1f3a8,): 'Artist',
	(0x1f3c2,): 'Snowboarding',
	(0x1f3c3,): 'Running',
	(0x1f3c4,): 'Surfing',
	(0x1f3ca,): 'Swimming',
	(0x1f3cb,): 'Weight Lifting',
	(0x1f3cc,): 'Golfing',
	(0x1f3eb,): 'Teacher',
	(0x1f3ed,): 'Factory Worker',
	(0x1f46e,): 'Police Officer',
	(0x1f46f,): 'Partying',
	(0x1f471,): 'Person with Blond Hair',
	(0x1f473,): 'Person Wearing Turban',
	(0x1f477,): 'Construction Worker',
	(0x1f481,): 'Tipping Hand',
	(0x1f482,): 'Guard',
	(0x1f486,): 'Face Massage',
	(0x1f487,): 'Haircut',
	(0x1f4bb,): 'Technologist',
	(0x1f4bc,): 'Office Worker',
	(0x1f527,): 'Mechanic',
	(0x1f52c,): 'Scientist',
	(0x1f575,): 'Detective',
	(0x1f645,): 'No Good Gesture',
	(0x1f646,): 'OK Gesture',
	(0x1f647,): 'Bowing Deeply',
	(0x1f64b,): 'Raising Hand',
	(0x1f64d,): 'Frowning',
	(0x1f64e,): 'Pouting',
	(0x1f680,): 'Astronaut',
	(0x1f692,): 'Firefighter',
	(0x1f6a3,): 'Rowing',
	(0x1f6b4,): 'Bicycling',
	(0x1f6b5,): 'Mountain Biking',
	(0x1f6b6,): 'Walking',
	(0x1f926,): 'Face Palm',
	(0x1f937,): 'Shrug',
	(0x1f938,): 'Doing a Cartwheel',
	(0x1f939,): 'Juggling',
	(0x1f93c,): 'Wrestling',
	(0x1f93d,): 'Water Polo',
	(0x1f93e,): 'Playing Handball',
	(0x1f9d6,): 'Person in Steamy Room',
	(0x1f9d7,): 'Climbing',
	(0x1f9d8,): 'Person in Lotus Position',
	(0x1f9d9,): 'Mage',
	(0x1f9da,): 'Fairy',
	(0x1f9db,): 'Vampire',
	(0x1f9dd,): 'Elf',
	(0x1f9de,): 'Genie',
	(0x1f9df,): 'Zombie',
	}

	# Build both lookup tables once, at import time; _custom_name() reads them.
	_CUSTOM_GENDERED_SEQ_NAMES = _create_custom_gendered_seq_names()
	_CUSTOM_SEQ_NAMES = _create_custom_seq_names()

	# Fixes for unusual capitalization or cases we don't care to handle in code.
	# Also prevents titlecasing 'S' after apostrophe in possessives. Note we _do_
	# want titlecasing after apostrophe in some cases, e.g. O'Clock.
	# Every key is a one-element tuple; _custom_name() consults this table first.
	_CUSTOM_CAPS_NAMES = {
	(0x26d1,): 'Rescue Worker’s Helmet',
	(0x1f170,): 'A Button (blood type)', # a Button (Blood Type)
	(0x1f171,): 'B Button (blood type)', # B Button (Blood Type)
	(0x1f17e,): 'O Button (blood type)', # O Button (Blood Type)
	(0x1f18e,): 'AB Button (blood type)', # Ab Button (Blood Type)
	(0x1f191,): 'CL Button', # Cl Button
	(0x1f192,): 'COOL Button', # Cool Button
	(0x1f193,): 'FREE Button', # Free Button
	(0x1f194,): 'ID Button', # Id Button
	(0x1f195,): 'NEW Button', # New Button
	(0x1f196,): 'NG Button', # Ng Button
	(0x1f197,): 'OK Button', # Ok Button
	(0x1f198,): 'SOS Button', # Sos Button
	(0x1f199,): 'UP! Button', # Up! Button
	(0x1f19a,): 'VS Button', # Vs Button
	(0x1f3e7,): 'ATM Sign', # Atm Sign
	(0x1f44C,): 'OK Hand', # Ok Hand
	(0x1f452,): 'Woman’s Hat',
	(0x1f45a,): 'Woman’s Clothes',
	(0x1f45e,): 'Man’s Shoe',
	(0x1f461,): 'Woman’s Sandal',
	(0x1f462,): 'Woman’s Boot',
	(0x1f519,): 'BACK Arrow', # Back Arrow
	(0x1f51a,): 'END Arrow', # End Arrow
	(0x1f51b,): 'ON! Arrow', # On! Arrow
	(0x1f51c,): 'SOON Arrow', # Soon Arrow
	(0x1f51d,): 'TOP Arrow', # Top Arrow
	(0x1f6b9,): 'Men’s Room',
	(0x1f6ba,): 'Women’s Room',
	}

	# For the custom sequences we ignore ZWJ, the emoji variation selector
	# and skin tone modifiers. We can't always ignore gender because
	# the gendered sequences match against them, but we ignore gender in other
	# cases so we define a separate set of gendered emoji to remove.

	# On Python 3 range() is not a list and cannot be concatenated to one with
	# '+', so it is materialized with list() first (a no-op copy on Python 2).
	_NON_GENDER_CPS_TO_STRIP = frozenset(
	    [0xfe0f, 0x200d] +
	    list(range(unicode_data._FITZ_START, unicode_data._FITZ_END + 1)))

	# Female sign, male sign, man and woman.
	_GENDER_CPS_TO_STRIP = frozenset([0x2640, 0x2642, 0x1f468, 0x1f469])

	def _custom_name(seq):
	  """Apply three kinds of custom names, based on the sequence.

	  Returns the custom name for seq, or None when no custom name applies.
	  """

	  seq = tuple([cp for cp in seq if cp not in _NON_GENDER_CPS_TO_STRIP])
	  name = _CUSTOM_CAPS_NAMES.get(seq)
	  if name:
	    return name

	  # Single characters that participate in sequences (e.g. fire truck in the
	  # firefighter sequences) should not get converted. Single characters
	  # are in the custom caps names set but not the other sets.
	  if len(seq) == 1:
	    return None

	  name = _CUSTOM_GENDERED_SEQ_NAMES.get(seq)
	  if name:
	    return name

	  # Nothing matched with gender present, so retry without the gender cps.
	  seq = tuple([cp for cp in seq if cp not in _GENDER_CPS_TO_STRIP])
	  name = _CUSTOM_SEQ_NAMES.get(seq)

	  return name


	def _standard_name(seq):
	  """Use the standard emoji name, with some algorithmic modifications.

	  We want to ignore skin-tone modifiers (but of course if the sequence _is_
	  the skin-tone modifier itself we keep that). So we strip these so we can
	  start with the generic name ignoring skin tone.

	  Non-emoji that are turned into emoji using the emoji VS have '(emoji) '
	  prepended to them, so strip that.

	  Regional indicator symbol names are a bit long, so shorten them.

	  Regional sequences are assumed to be ok as-is in terms of capitalization and
	  punctuation, so no modifications are applied to them.

	  After title-casing we make some English articles/prepositions lower-case
	  again. We also replace '&' with 'and'; Unicode seems rather fond of
	  ampersand.

	  Args:
	    seq: non-empty tuple of code points.

	  Returns:
	    The display name as a string.
	  """

	  if not unicode_data.is_skintone_modifier(seq[0]):
	    seq = tuple([cp for cp in seq if not unicode_data.is_skintone_modifier(cp)])
	  name = unicode_data.get_emoji_sequence_name(seq)

	  if name.startswith('(emoji) '):
	    name = name[8:]

	  if len(seq) == 1 and unicode_data.is_regional_indicator(seq[0]):
	    return 'Regional Symbol ' + unicode_data.regional_indicator_to_ascii(seq[0])

	  if (unicode_data.is_regional_indicator_seq(seq) or
	      unicode_data.is_regional_tag_seq(seq)):
	    return name

	  name = name.title()
	  # Require space delimiting just in case...
	  name = re.sub(r'\s&\s', ' and ', name)
	  name = re.sub(
	      # not \b at start because we retain capital at start of phrase.
	      # The alternatives are separated by plain '|' inside a non-capturing
	      # '(?:...)' group; an escaped '\|' would match a literal pipe instead.
	      r'(\s(?:A|And|From|In|Of|With|For))\b', lambda s: s.group(1).lower(),
	      name)

	  return name


	def _name_data(seq, seq_file):
	  """Return (image file basename, character references, name) for seq.

	  The name is the custom name when there is one, else the standard name. The
	  second element spells seq as hexadecimal HTML character references.
	  """
	  name = _custom_name(seq) or _standard_name(seq)
	  # we don't need canonical sequences, so the emoji variation selector is
	  # left out.
	  sequence = ''.join('&#x%x;' % cp for cp in seq if cp != 0xfe0f)
	  fname = path.basename(seq_file)
	  return fname, sequence, name


	def generate_names(
	    src_dir, dst_dir, skip_limit=20, omit_groups=None, pretty_print=False,
	    verbose=False):
	  """Write name data for the emoji images in src_dir to dst_dir/data.json.

	  The output is a list with one {'category': group, 'emojis': [...]} entry per
	  emoji group that is kept; each emoji is the tuple returned by _name_data.

	  Args:
	    src_dir: directory containing the images, resolved via tool_utils.
	    dst_dir: directory that receives data.json.
	    skip_limit: how many emoji without an image are tolerated. Once the count
	      exceeds it an Exception is raised; a negative value disables the check.
	    omit_groups: names of the emoji groups to leave out; None or empty keeps
	      all groups.
	    pretty_print: if true indent the json, otherwise write it compactly.
	    verbose: if true print every emoji that has no image.

	  Returns None. Returns early, after printing to stderr, when src_dir is not
	  a directory or omit_groups names a group that is not recognized.
	  """
	  srcdir = tool_utils.resolve_path(src_dir)
	  if not path.isdir(srcdir):
	    print('%s is not a directory' % src_dir, file=sys.stderr)
	    return

	  if omit_groups:
	    unknown_groups = set(omit_groups) - set(unicode_data.get_emoji_groups())
	    if unknown_groups:
	      print('did not recognize %d group%s: %s' % (
	          len(unknown_groups), '' if len(unknown_groups) == 1 else 's',
	          ', '.join('"%s"' % g for g in omit_groups if g in unknown_groups)),
	          file=sys.stderr)
	      print('valid groups are:\n  %s' % (
	          '\n  '.join(g for g in unicode_data.get_emoji_groups())),
	          file=sys.stderr)
	      return
	    print('omitting %d group%s: %s' % (
	        len(omit_groups), '' if len(omit_groups) == 1 else 's',
	        ', '.join('"%s"' % g for g in omit_groups)))
	  else:
	    # might be None
	    print('keeping all groups')
	    omit_groups = []

	  # make sure the destination exists
	  dstdir = tool_utils.ensure_dir_exists(
	      tool_utils.resolve_path(dst_dir))

	  # _get_image_data returns canonical cp sequences
	  print('src dir:', srcdir)
	  seq_to_file = generate_emoji_html._get_image_data(srcdir, 'png', 'emoji_u')
	  print('seq to file has %d sequences' % len(seq_to_file))

	  # Aliases add non-gendered versions using gendered images for the most part.
	  # But when we display the images, we don't distinguish genders in the
	  # naming, we rely on the images-- so these look redundant. So we
	  # intentionally don't generate images for these.
	  # However, the alias file also includes the flag aliases, which we do want,
	  # and it also fails to exclude the unknown flag pua (since it doesn't
	  # map to anything), so we need to adjust for this.
	  canonical_aliases = generate_emoji_html._get_canonical_aliases()

	  aliases = set([
	      cps for cps in canonical_aliases.keys()
	      if not unicode_data.is_regional_indicator_seq(cps)])
	  aliases.add((0xfe82b,))  # unknown flag PUA
	  excluded = aliases | generate_emoji_html._get_canonical_excluded()

	  # The flag aliases have distinct names, so we _do_ want to show them
	  # multiple times, each one reusing the image of the flag it aliases.
	  to_add = {}
	  for seq in canonical_aliases:
	    if unicode_data.is_regional_indicator_seq(seq):
	      replace_seq = canonical_aliases[seq]
	      if seq in seq_to_file:
	        print('warning, alias %s has file %s' % (
	            unicode_data.regional_indicator_seq_to_string(seq),
	            seq_to_file[seq]))
	        continue
	      replace_file = seq_to_file.get(replace_seq)
	      if replace_file:
	        to_add[seq] = replace_file
	  seq_to_file.update(to_add)

	  data = []
	  last_skipped_group = None
	  skipcount = 0
	  for group in unicode_data.get_emoji_groups():
	    if group in omit_groups:
	      continue
	    name_data = []
	    for seq in unicode_data.get_emoji_in_group(group):
	      if seq in excluded:
	        continue
	      seq_file = seq_to_file.get(seq, None)
	      if seq_file is None:
	        # No image for this sequence: count it, and report it when verbose.
	        skipcount += 1
	        if verbose:
	          if group != last_skipped_group:
	            print('group %s' % group)
	            last_skipped_group = group
	          print('  %s (%s)' % (
	              unicode_data.seq_to_string(seq),
	              ', '.join(unicode_data.name(cp, 'x') for cp in seq)))
	        if skip_limit >= 0 and skipcount > skip_limit:
	          raise Exception('skipped too many items')
	      else:
	        name_data.append(_name_data(seq, seq_file))
	    data.append({'category': group, 'emojis': name_data})

	  outfile = path.join(dstdir, 'data.json')
	  with open(outfile, 'w') as f:
	    indent = 2 if pretty_print else None
	    separators = None if pretty_print else (',', ':')
	    json.dump(data, f, indent=indent, separators=separators)
	  print('wrote %s' % outfile)


	def main():
	  """Parse the command line and generate the emoji name data file."""
	  DEFAULT_DSTDIR = '[emoji]/emoji'
	  DEFAULT_IMAGEDIR = '[emoji]/build/compressed_pngs'

	  parser = argparse.ArgumentParser()
	  parser.add_argument(
	      '-s', '--srcdir', help='directory containing images (default %s)' %
	      DEFAULT_IMAGEDIR, metavar='dir', default=DEFAULT_IMAGEDIR)
	  parser.add_argument(
	      '-d', '--dstdir', help='name of destination directory (default %s)' %
	      DEFAULT_DSTDIR, metavar='fname', default=DEFAULT_DSTDIR)
	  parser.add_argument(
	      '-p', '--pretty_print', help='pretty-print json file',
	      action='store_true')
	  # type=int is required: without it a limit given on the command line stays
	  # a string, and generate_names compares the limit with integers.
	  parser.add_argument(
	      '-m', '--missing_limit', help='number of missing images before failure '
	      '(default 20), use -1 for no limit', metavar='n', default=20, type=int)
	  parser.add_argument(
	      '--omit_groups', help='names of groups to omit (default "Misc, Flags")',
	      metavar='name', default=['Misc', 'Flags'], nargs='*')
	  parser.add_argument(
	      '-v', '--verbose', help='print progress information to stdout',
	      action='store_true')
	  args = parser.parse_args()
	  generate_names(
	      args.srcdir, args.dstdir, args.missing_limit, args.omit_groups,
	      pretty_print=args.pretty_print, verbose=args.verbose)


	# Run the command-line entry point only when executed as a script.
	if __name__ == "__main__":
	  main()