Lib/fontTools/unicodedata/__init__.py - third_party/fonttools - Git at Google

 from __future__ import (
     print_function, division, absolute_import, unicode_literals)
 from fontTools.misc.py23 import *

 import re
 from bisect import bisect_right

 try:
     # use unicodedata backport compatible with python2:
     # https://github.com/mikekap/unicodedata2
     from unicodedata2 import *
 except ImportError:  # pragma: no cover
     # fall back to built-in unicodedata (possibly outdated)
     from unicodedata import *

 from . import Blocks, Scripts, ScriptExtensions


 # Public API of this module: the names taken over from the star-imported
 # unicodedata2/unicodedata module, followed by the helpers defined below.
 # Every name is passed through tostr() before being stored.
 # NOTE(review): tostr presumably comes from the py23 star-import and yields
 # a native 'str' despite unicode_literals -- confirm.
 __all__ = [tostr(s) for s in (
     # names from built-in unicodedata module
     "lookup",
     "name",
     "decimal",
     "digit",
     "numeric",
     "category",
     "bidirectional",
     "combining",
     "east_asian_width",
     "mirrored",
     "decomposition",
     "normalize",
     "unidata_version",
     "ucd_3_2_0",
     # additional functions defined in this module
     "block",
     "script",
     "script_extension",
     "script_name",
     "script_code",
 )]


 def script(char):
     """ Return the four-letter script code of the Unicode character 'char'.

     The code point of 'char' is located in the sorted table of range
     starts 'Scripts.RANGES', and the entry of 'Scripts.VALUES' belonging
     to that range is returned as a string.

     >>> script("a")
     'Latn'
     >>> script(",")
     'Zyyy'
     >>> script(unichr(0x10FFFF))
     'Zzzz'
     """
     codepoint = byteord(char)
     # 'Scripts.RANGES' holds only the first code point of each range, in
     # ascending order. 'bisect_right' yields the insertion point to the
     # right of every entry <= codepoint, so the range that contains the
     # code point (the last one starting at or before it) sits one
     # position to the left of that insertion point.
     range_index = bisect_right(Scripts.RANGES, codepoint) - 1
     return Scripts.VALUES[range_index]


 def script_extension(char):
     """ Return the Script Extensions property of the Unicode character
     'char' as a set of four-letter script codes.

     Code points whose 'ScriptExtensions.VALUES' entry is None fall back
     to a one-element set holding their Script property value.

     NOTE(review): a non-None table entry is handed back as-is, not
     copied, so callers should presumably treat the result as read-only.

     >>> script_extension("a") == {'Latn'}
     True
     >>> script_extension(unichr(0x060C)) == {'Arab', 'Syrc', 'Thaa'}
     True
     >>> script_extension(unichr(0x10FFFF)) == {'Zzzz'}
     True
     """
     codepoint = byteord(char)
     range_index = bisect_right(ScriptExtensions.RANGES, codepoint) - 1
     extensions = ScriptExtensions.VALUES[range_index]
     if extensions is not None:
         return extensions
     # Code points that are not explicitly listed for Script Extensions
     # take the value of their Script property instead.
     return {script(char)}


 def script_name(code, default=KeyError):
     """ Translate a four-letter Unicode script code into its long,
     human-readable name (underscores are shown as spaces).

     An unknown 'code' raises KeyError unless 'default' is given: any
     value that is not KeyError (or a subclass of it) is returned as the
     fallback instead, e.g. 'Unknown' or None.
     """
     try:
         raw_name = Scripts.NAMES[code]
     except KeyError:
         # A KeyError class as 'default' is the marker for "no fallback".
         wants_error = (
             isinstance(default, type) and issubclass(default, KeyError))
         if not wants_error:
             return default
         raise
     return str(raw_name.replace("_", " "))


 _normalize_re = re.compile(r"[-_ ]+")


 def _normalize_property_name(string):
     """Remove case, strip space, '-' and '_' for loose matching."""
     return _normalize_re.sub("", string).lower()


 # Reverse lookup table built from Scripts.NAMES: loosely-normalized long
 # script name -> script code.
 _SCRIPT_CODES = {
     _normalize_property_name(long_name): short_code
     for short_code, long_name in Scripts.NAMES.items()
 }


 def script_code(script_name, default=KeyError):
     """Translate a long script name into its four-letter Unicode script code.

     The name is matched loosely: case, spaces, '-' and '_' are ignored.

     An unknown name raises KeyError unless 'default' is given: any value
     that is not KeyError (or a subclass of it) is returned as the
     fallback instead, e.g. 'Zzzz' or None.
     """
     lookup_key = _normalize_property_name(script_name)
     try:
         return _SCRIPT_CODES[lookup_key]
     except KeyError:
         # A KeyError class as 'default' is the marker for "no fallback".
         wants_error = (
             isinstance(default, type) and issubclass(default, KeyError))
         if not wants_error:
             return default
         raise


 def block(char):
     """ Return the name of the Unicode block that contains the character
     'char', as a string.

     >>> block("a")
     'Basic Latin'
     >>> block(unichr(0x060C))
     'Arabic'
     >>> block(unichr(0xEFFFF))
     'No_Block'
     """
     codepoint = byteord(char)
     # The entry just left of the bisect_right insertion point is the last
     # range start that is <= codepoint.
     range_index = bisect_right(Blocks.RANGES, codepoint) - 1
     return Blocks.VALUES[range_index]
	from __future__ import (
	print_function, division, absolute_import, unicode_literals)
	from fontTools.misc.py23 import *

	import re
	from bisect import bisect_right

	try:
	# use unicodedata backport compatible with python2:
	# https://github.com/mikekap/unicodedata2
	from unicodedata2 import *
	except ImportError: # pragma: no cover
	# fall back to built-in unicodedata (possibly outdated)
	from unicodedata import *

	from . import Blocks, Scripts, ScriptExtensions


	# Public API of this module: the names taken over from the star-imported
	# unicodedata2/unicodedata module, followed by the helpers defined below.
	# Every name is passed through tostr() before being stored.
	# NOTE(review): tostr presumably comes from the py23 star-import and yields
	# a native 'str' despite unicode_literals -- confirm.
	__all__ = [tostr(s) for s in (
	# names from built-in unicodedata module
	"lookup",
	"name",
	"decimal",
	"digit",
	"numeric",
	"category",
	"bidirectional",
	"combining",
	"east_asian_width",
	"mirrored",
	"decomposition",
	"normalize",
	"unidata_version",
	"ucd_3_2_0",
	# additional functions defined in this module
	"block",
	"script",
	"script_extension",
	"script_name",
	"script_code",
	)]


	def script(char):
	    """ Return the four-letter script code assigned to the Unicode character
	    'char' as string.

	    >>> script("a")
	    'Latn'
	    >>> script(",")
	    'Zyyy'
	    >>> script(unichr(0x10FFFF))
	    'Zzzz'
	    """
	    code = byteord(char)
	    # 'bisect_right(a, x, lo=0, hi=len(a))' returns an insertion point which
	    # comes after (to the right of) any existing entries of x in a, and it
	    # partitions array a into two halves so that, for the left side
	    # all(val <= x for val in a[lo:i]), and for the right side
	    # all(val > x for val in a[i:hi]).
	    # 'Scripts.RANGES' is a sorted list of ranges (only their starting
	    # breakpoints); we use `bisect_right` to look up the range that
	    # contains the given codepoint, i.e. the last one whose start is less
	    # than or equal to the codepoint. Thus, we subtract 1 from the index
	    # returned.
	    i = bisect_right(Scripts.RANGES, code)
	    return Scripts.VALUES[i - 1]


	def script_extension(char):
	    """ Return the script extension property assigned to the Unicode
	    character 'char' as a set of strings.

	    When the table entry for the character's range is None, the result
	    is a one-element set containing the character's Script property.

	    >>> script_extension("a") == {'Latn'}
	    True
	    >>> script_extension(unichr(0x060C)) == {'Arab', 'Syrc', 'Thaa'}
	    True
	    >>> script_extension(unichr(0x10FFFF)) == {'Zzzz'}
	    True
	    """
	    code = byteord(char)
	    # Same lookup as in script(): the entry left of the insertion point
	    # belongs to the last range starting at or before 'code'.
	    i = bisect_right(ScriptExtensions.RANGES, code)
	    value = ScriptExtensions.VALUES[i - 1]
	    if value is None:
	        # code points not explicitly listed for Script Extensions
	        # have as their value the corresponding Script property value
	        return {script(char)}
	    return value


	def script_name(code, default=KeyError):
	    """ Return the long, human-readable script name given a four-letter
	    Unicode script code.

	    Underscores in the stored name are replaced with spaces.

	    If no matching name is found, a KeyError is raised by default.

	    You can use the 'default' argument to return a fallback value (e.g.
	    'Unknown' or None) instead of throwing an error.
	    """
	    try:
	        return str(Scripts.NAMES[code].replace("_", " "))
	    except KeyError:
	        # Only a KeyError class passed as 'default' means "re-raise";
	        # every other value is handed back as the fallback.
	        if isinstance(default, type) and issubclass(default, KeyError):
	            raise
	        return default


	_normalize_re = re.compile(r"[-_ ]+")


	def _normalize_property_name(string):
	"""Remove case, strip space, '-' and '_' for loose matching."""
	return _normalize_re.sub("", string).lower()


	# Reverse lookup table built from Scripts.NAMES: loosely-normalized long
	# script name -> script code.
	_SCRIPT_CODES = {
	    _normalize_property_name(long_name): short_code
	    for short_code, long_name in Scripts.NAMES.items()
	}


	def script_code(script_name, default=KeyError):
	    """Returns the four-letter Unicode script code from its long name

	    The name is normalized first, so case, spaces, '-' and '_' do not
	    matter.

	    If no matching script code is found, a KeyError is raised by default.

	    You can use the 'default' argument to return a fallback string (e.g.
	    'Zzzz' or None) instead of throwing an error.
	    """
	    normalized_name = _normalize_property_name(script_name)
	    try:
	        return _SCRIPT_CODES[normalized_name]
	    except KeyError:
	        # Only a KeyError class passed as 'default' means "re-raise";
	        # every other value is handed back as the fallback.
	        if isinstance(default, type) and issubclass(default, KeyError):
	            raise
	        return default


	def block(char):
	    """ Return the block property assigned to the Unicode character 'char'
	    as a string.

	    >>> block("a")
	    'Basic Latin'
	    >>> block(unichr(0x060C))
	    'Arabic'
	    >>> block(unichr(0xEFFFF))
	    'No_Block'
	    """
	    code = byteord(char)
	    # bisect_right gives the insertion point after all range starts that
	    # are <= code; the entry just before it is the containing range.
	    i = bisect_right(Blocks.RANGES, code)
	    return Blocks.VALUES[i - 1]