blob: c2bc29e1542daede6cd4206127195e2aee9a551e [file] [log] [blame]
from __future__ import (
print_function, division, absolute_import, unicode_literals)
from fontTools.misc.py23 import *
import re
from bisect import bisect_right
try:
# use unicodedata backport compatible with python2:
# https://github.com/mikekap/unicodedata2
from unicodedata2 import *
except ImportError: # pragma: no cover
# fall back to built-in unicodedata (possibly outdated)
from unicodedata import *
from . import Blocks, Scripts, ScriptExtensions
__all__ = [tostr(s) for s in (
# names from built-in unicodedata module
"lookup",
"name",
"decimal",
"digit",
"numeric",
"category",
"bidirectional",
"combining",
"east_asian_width",
"mirrored",
"decomposition",
"normalize",
"unidata_version",
"ucd_3_2_0",
# additonal functions
"block",
"script",
"script_extension",
"script_name",
"script_code",
)]
def script(char):
""" Return the four-letter script code assigned to the Unicode character
'char' as string.
>>> script("a")
'Latn'
>>> script(",")
'Zyyy'
>>> script(unichr(0x10FFFF))
'Zzzz'
"""
code = byteord(char)
# 'bisect_right(a, x, lo=0, hi=len(a))' returns an insertion point which
# comes after (to the right of) any existing entries of x in a, and it
# partitions array a into two halves so that, for the left side
# all(val <= x for val in a[lo:i]), and for the right side
# all(val > x for val in a[i:hi]).
# Our 'SCRIPT_RANGES' is a sorted list of ranges (only their starting
# breakpoints); we want to use `bisect_right` to look up the range that
# contains the given codepoint: i.e. whose start is less than or equal
# to the codepoint. Thus, we subtract -1 from the index returned.
i = bisect_right(Scripts.RANGES, code)
return Scripts.VALUES[i-1]
def script_extension(char):
""" Return the script extension property assigned to the Unicode character
'char' as a set of string.
>>> script_extension("a") == {'Latn'}
True
>>> script_extension(unichr(0x060C)) == {'Arab', 'Syrc', 'Thaa'}
True
>>> script_extension(unichr(0x10FFFF)) == {'Zzzz'}
True
"""
code = byteord(char)
i = bisect_right(ScriptExtensions.RANGES, code)
value = ScriptExtensions.VALUES[i-1]
if value is None:
# code points not explicitly listed for Script Extensions
# have as their value the corresponding Script property value
return {script(char)}
return value
def script_name(code, default=KeyError):
""" Return the long, human-readable script name given a four-letter
Unicode script code.
If no matching name is found, a KeyError is raised by default.
You can use the 'default' argument to return a fallback value (e.g.
'Unknown' or None) instead of throwing an error.
"""
try:
return str(Scripts.NAMES[code].replace("_", " "))
except KeyError:
if isinstance(default, type) and issubclass(default, KeyError):
raise
return default
_normalize_re = re.compile(r"[-_ ]+")
def _normalize_property_name(string):
"""Remove case, strip space, '-' and '_' for loose matching."""
return _normalize_re.sub("", string).lower()
_SCRIPT_CODES = {_normalize_property_name(v): k
for k, v in Scripts.NAMES.items()}
def script_code(script_name, default=KeyError):
"""Returns the four-letter Unicode script code from its long name
If no matching script code is found, a KeyError is raised by default.
You can use the 'default' argument to return a fallback string (e.g.
'Zzzz' or None) instead of throwing an error.
"""
normalized_name = _normalize_property_name(script_name)
try:
return _SCRIPT_CODES[normalized_name]
except KeyError:
if isinstance(default, type) and issubclass(default, KeyError):
raise
return default
def block(char):
""" Return the block property assigned to the Unicode character 'char'
as a string.
>>> block("a")
'Basic Latin'
>>> block(unichr(0x060C))
'Arabic'
>>> block(unichr(0xEFFFF))
'No_Block'
"""
code = byteord(char)
i = bisect_right(Blocks.RANGES, code)
return Blocks.VALUES[i-1]