Lib/fontTools/encodings/codecs.py - third_party/fonttools - Git at Google

 """Extend the Python codecs module with a few encodings that are used in OpenType (name table)
 but missing from Python.  See https://github.com/fonttools/fonttools/issues/236 for details."""

 from __future__ import print_function, division, absolute_import
 from fontTools.misc.py23 import *
 import codecs
 import encodings

 class ExtendCodec(codecs.Codec):
 	"""Codec that layers a few extra byte <-> character pairs on top of a base codec.

 	Whatever the base codec can handle is delegated to it.  Where the base
 	codec reports an error, `mapping` (bytes -> text) is consulted when
 	decoding, and its inverse (text -> bytes) when encoding.  Only
 	errors='strict' is supported by encode() and decode().
 	"""

 	def __init__(self, name, base_encoding, mapping):
 		"""Build the codec and register its error handler.

 		name: name of this codec; self.error is also registered with
 			codecs.register_error() under this same name.
 		base_encoding: name of the codec that does most of the work.
 		mapping: dict of byte sequence -> text to add on top of the base codec.
 		"""
 		self.name = name
 		self.base_encoding = base_encoding
 		self.mapping = mapping
 		self.reverse = {v:k for k,v in mapping.items()}
 		# Longest text run a single mapping entry can stand for; bounds the
 		# search done in error() when encoding.  An empty mapping gives 0
 		# instead of making max() raise ValueError.
 		self.max_len = max([len(v) for v in mapping.values()] or [0])
 		self.info = codecs.CodecInfo(name=self.name, encode=self.encode, decode=self.decode)
 		codecs.register_error(name, self.error)

 	def encode(self, input, errors='strict'):
 		"""Encode text `input`; return (encoded bytes, len(input)).

 		Raises UnicodeEncodeError if some character is covered by neither
 		the base codec nor the mapping.  The exception names this codec as
 		its encoding and its object/start/end describe the position in
 		`input` as passed by the caller.
 		"""
 		assert errors == 'strict'
 		#return codecs.encode(input, self.base_encoding, self.name), len(input)

 		# The above line could totally be all we needed, relying on the error
 		# handling to replace the unencodable Unicode characters with our extended
 		# byte sequences.
 		#
 		# However, there seems to be a design bug in Python (probably intentional):
 		# the error handler for encoding is supposed to return a **Unicode** character,
 		# that then needs to be encodable itself...  Ugh.
 		#
 		# So we implement what codecs.encode() should have been doing: which is expect
 		# error handler to return bytes() to be added to the output.
 		#
 		# This seems to have been fixed in Python 3.3.  We should try using that and
 		# use fallback only if that failed.
 		# https://docs.python.org/3.3/library/codecs.html#codecs.register_error

 		length = len(input)
 		chunks = []
 		offset = 0 # Index in `input` of the first character not yet encoded
 		while offset < length:
 			remaining = input[offset:]
 			try:
 				chunks.append(codecs.encode(remaining, self.base_encoding))
 				break # All converted
 			except UnicodeEncodeError as e:
 				# Convert the correct part
 				chunks.append(codecs.encode(remaining[:e.start], self.base_encoding))
 				try:
 					replacement, pos = self.error(e)
 				except UnicodeEncodeError:
 					# `e` describes a position inside `remaining`; report it
 					# relative to the caller's full input instead.
 					raise UnicodeEncodeError(self.name, input, offset + e.start, offset + e.end, e.reason)
 				chunks.append(replacement)
 				offset += pos
 		return b''.join(chunks), length

 	def decode(self, input, errors='strict'):
 		"""Decode bytes `input`; return (decoded text, len(input)).

 		The base codec does the decoding, with the error handler registered
 		under this codec's name (self.error) supplying the extra mappings.
 		"""
 		assert errors == 'strict'
 		return codecs.decode(input, self.base_encoding, self.name), len(input)

 	def error(self, e):
 		"""Error handler: resolve a base-codec failure through the extra mappings.

 		For a UnicodeDecodeError, the byte runs e.object[e.start:end] for
 		end = e.start+1 .. e.end are looked up in self.mapping.  For a
 		UnicodeEncodeError, the text runs of 1 .. self.max_len characters
 		starting at e.start are looked up in self.reverse.  The first hit
 		is returned as (replacement, position to resume at).

 		If nothing matches, e.encoding is set to this codec's name and the
 		exception is re-raised.
 		"""
 		if isinstance(e, UnicodeDecodeError):
 			for end in range(e.start + 1, e.end + 1):
 				s = e.object[e.start:end]
 				if s in self.mapping:
 					return self.mapping[s], end
 		elif isinstance(e, UnicodeEncodeError):
 			for end in range(e.start + 1, e.start + self.max_len + 1):
 				s = e.object[e.start:end]
 				if s in self.reverse:
 					return self.reverse[s], end
 		e.encoding = self.name
 		raise e


 # Table of the extended codecs this module provides.  Each key is a codec
 # name; each value is a pair (name of the base encoding, extra mapping from
 # byte sequence to Unicode character).  search_function() hands the pair to
 # ExtendCodec, which consults the extra mapping only where the base codec
 # reports an error.
 _extended_encodings = {
 	"x_mac_japanese_ttx": ("shift_jis", {
 					b"\xFC": unichr(0x007C), # VERTICAL LINE
 					b"\x7E": unichr(0x007E), # TILDE
 					b"\x80": unichr(0x005C), # REVERSE SOLIDUS
 					b"\xA0": unichr(0x00A0), # NO-BREAK SPACE
 					b"\xFD": unichr(0x00A9), # COPYRIGHT SIGN
 					b"\xFE": unichr(0x2122), # TRADE MARK SIGN
 					b"\xFF": unichr(0x2026), # HORIZONTAL ELLIPSIS
 				}),
 	"x_mac_trad_chinese_ttx": ("big5", {
 					b"\x80": unichr(0x005C), # REVERSE SOLIDUS
 					b"\xA0": unichr(0x00A0), # NO-BREAK SPACE
 					b"\xFD": unichr(0x00A9), # COPYRIGHT SIGN
 					b"\xFE": unichr(0x2122), # TRADE MARK SIGN
 					b"\xFF": unichr(0x2026), # HORIZONTAL ELLIPSIS
 				}),
 	"x_mac_korean_ttx": ("euc_kr", {
 					b"\x80": unichr(0x00A0), # NO-BREAK SPACE
 					b"\x81": unichr(0x20A9), # WON SIGN
 					b"\x82": unichr(0x2014), # EM DASH
 					b"\x83": unichr(0x00A9), # COPYRIGHT SIGN
 					b"\xFE": unichr(0x2122), # TRADE MARK SIGN
 					b"\xFF": unichr(0x2026), # HORIZONTAL ELLIPSIS
 				}),
 	"x_mac_simp_chinese_ttx": ("gb2312", {
 					b"\x80": unichr(0x00FC), # LATIN SMALL LETTER U WITH DIAERESIS
 					b"\xA0": unichr(0x00A0), # NO-BREAK SPACE
 					b"\xFD": unichr(0x00A9), # COPYRIGHT SIGN
 					b"\xFE": unichr(0x2122), # TRADE MARK SIGN
 					b"\xFF": unichr(0x2026), # HORIZONTAL ELLIPSIS
 				}),
 }

 # ExtendCodec instances already built by search_function(), keyed by the
 # normalized codec name, so each codec is constructed at most once.
 _cache = {}

 def search_function(name):
 	"""Codec search function for the extended encodings in _extended_encodings.

 	name: encoding name being looked up; it is normalized with
 		encodings.normalize_encoding() before matching.

 	Returns the codecs.CodecInfo of the matching ExtendCodec, building and
 	caching the codec on first use.  Returns None if the name is not one of
 	ours, or if none of the candidate base encodings can be looked up.
 	"""
 	name = encodings.normalize_encoding(name) # Rather undocumented...
 	if name not in _extended_encodings:
 		return None

 	if name not in _cache:
 		base_encoding, mapping = _extended_encodings[name]
 		assert(name[-4:] == "_ttx")
 		# Python 2 didn't have any of the encodings that we are implementing
 		# in this file.  Python 3 added aliases for the East Asian ones, mapping
 		# them "temporarily" to the same base encoding as us, with a comment
 		# suggesting that full implementation will appear some time later.
 		# As such, try the Python version of the x_mac_... first, if that is found,
 		# use *that* as our base encoding.  This would make our encoding upgrade
 		# to the full encoding when and if Python finally implements that.
 		# http://bugs.python.org/issue24041
 		for candidate in (name[:-4], base_encoding):
 			try:
 				codecs.lookup(candidate)
 			except LookupError:
 				continue
 			_cache[name] = ExtendCodec(name, candidate, mapping)
 			break
 		else:
 			# No usable base codec: report "not found" rather than letting
 			# the cache lookup below fail with KeyError.
 			return None

 	return _cache[name].info

 # Install the search function at import time so that codecs.lookup() can
 # resolve the extended encoding names listed in _extended_encodings.
 codecs.register(search_function)
	"""Extend the Python codecs module with a few encodings that are used in OpenType (name table)
	but missing from Python. See https://github.com/fonttools/fonttools/issues/236 for details."""

	from __future__ import print_function, division, absolute_import
	from fontTools.misc.py23 import *
	import codecs
	import encodings

	class ExtendCodec(codecs.Codec):
		"""Codec that layers a few extra byte <-> character pairs on top of a base codec.

		Whatever the base codec can handle is delegated to it.  Where the base
		codec reports an error, `mapping` (bytes -> text) is consulted when
		decoding, and its inverse (text -> bytes) when encoding.  Only
		errors='strict' is supported by encode() and decode().
		"""

		def __init__(self, name, base_encoding, mapping):
			"""Build the codec and register its error handler.

			name: name of this codec; self.error is also registered with
				codecs.register_error() under this same name.
			base_encoding: name of the codec that does most of the work.
			mapping: dict of byte sequence -> text to add on top of the base codec.
			"""
			self.name = name
			self.base_encoding = base_encoding
			self.mapping = mapping
			self.reverse = {v:k for k,v in mapping.items()}
			# Longest text run a single mapping entry can stand for; bounds the
			# search done in error() when encoding.  An empty mapping gives 0
			# instead of making max() raise ValueError.
			self.max_len = max([len(v) for v in mapping.values()] or [0])
			self.info = codecs.CodecInfo(name=self.name, encode=self.encode, decode=self.decode)
			codecs.register_error(name, self.error)

		def encode(self, input, errors='strict'):
			"""Encode text `input`; return (encoded bytes, len(input)).

			Raises UnicodeEncodeError if some character is covered by neither
			the base codec nor the mapping.  The exception names this codec as
			its encoding and its object/start/end describe the position in
			`input` as passed by the caller.
			"""
			assert errors == 'strict'
			#return codecs.encode(input, self.base_encoding, self.name), len(input)

			# The above line could totally be all we needed, relying on the error
			# handling to replace the unencodable Unicode characters with our extended
			# byte sequences.
			#
			# However, there seems to be a design bug in Python (probably intentional):
			# the error handler for encoding is supposed to return a Unicode character,
			# that then needs to be encodable itself... Ugh.
			#
			# So we implement what codecs.encode() should have been doing: which is expect
			# error handler to return bytes() to be added to the output.
			#
			# This seems to have been fixed in Python 3.3. We should try using that and
			# use fallback only if that failed.
			# https://docs.python.org/3.3/library/codecs.html#codecs.register_error

			length = len(input)
			chunks = []
			offset = 0 # Index in `input` of the first character not yet encoded
			while offset < length:
				remaining = input[offset:]
				try:
					chunks.append(codecs.encode(remaining, self.base_encoding))
					break # All converted
				except UnicodeEncodeError as e:
					# Convert the correct part
					chunks.append(codecs.encode(remaining[:e.start], self.base_encoding))
					try:
						replacement, pos = self.error(e)
					except UnicodeEncodeError:
						# `e` describes a position inside `remaining`; report it
						# relative to the caller's full input instead.
						raise UnicodeEncodeError(self.name, input, offset + e.start, offset + e.end, e.reason)
					chunks.append(replacement)
					offset += pos
			return b''.join(chunks), length

		def decode(self, input, errors='strict'):
			"""Decode bytes `input`; return (decoded text, len(input)).

			The base codec does the decoding, with the error handler registered
			under this codec's name (self.error) supplying the extra mappings.
			"""
			assert errors == 'strict'
			return codecs.decode(input, self.base_encoding, self.name), len(input)

		def error(self, e):
			"""Error handler: resolve a base-codec failure through the extra mappings.

			For a UnicodeDecodeError, the byte runs e.object[e.start:end] for
			end = e.start+1 .. e.end are looked up in self.mapping.  For a
			UnicodeEncodeError, the text runs of 1 .. self.max_len characters
			starting at e.start are looked up in self.reverse.  The first hit
			is returned as (replacement, position to resume at).

			If nothing matches, e.encoding is set to this codec's name and the
			exception is re-raised.
			"""
			if isinstance(e, UnicodeDecodeError):
				for end in range(e.start + 1, e.end + 1):
					s = e.object[e.start:end]
					if s in self.mapping:
						return self.mapping[s], end
			elif isinstance(e, UnicodeEncodeError):
				for end in range(e.start + 1, e.start + self.max_len + 1):
					s = e.object[e.start:end]
					if s in self.reverse:
						return self.reverse[s], end
			e.encoding = self.name
			raise e


	# Table of the extended codecs this module provides.  Each key is a codec
	# name; each value is a pair (name of the base encoding, extra mapping from
	# byte sequence to Unicode character).  search_function() hands the pair to
	# ExtendCodec, which consults the extra mapping only where the base codec
	# reports an error.
	_extended_encodings = {
	"x_mac_japanese_ttx": ("shift_jis", {
	b"\xFC": unichr(0x007C), # VERTICAL LINE
	b"\x7E": unichr(0x007E), # TILDE
	b"\x80": unichr(0x005C), # REVERSE SOLIDUS
	b"\xA0": unichr(0x00A0), # NO-BREAK SPACE
	b"\xFD": unichr(0x00A9), # COPYRIGHT SIGN
	b"\xFE": unichr(0x2122), # TRADE MARK SIGN
	b"\xFF": unichr(0x2026), # HORIZONTAL ELLIPSIS
	}),
	"x_mac_trad_chinese_ttx": ("big5", {
	b"\x80": unichr(0x005C), # REVERSE SOLIDUS
	b"\xA0": unichr(0x00A0), # NO-BREAK SPACE
	b"\xFD": unichr(0x00A9), # COPYRIGHT SIGN
	b"\xFE": unichr(0x2122), # TRADE MARK SIGN
	b"\xFF": unichr(0x2026), # HORIZONTAL ELLIPSIS
	}),
	"x_mac_korean_ttx": ("euc_kr", {
	b"\x80": unichr(0x00A0), # NO-BREAK SPACE
	b"\x81": unichr(0x20A9), # WON SIGN
	b"\x82": unichr(0x2014), # EM DASH
	b"\x83": unichr(0x00A9), # COPYRIGHT SIGN
	b"\xFE": unichr(0x2122), # TRADE MARK SIGN
	b"\xFF": unichr(0x2026), # HORIZONTAL ELLIPSIS
	}),
	"x_mac_simp_chinese_ttx": ("gb2312", {
	b"\x80": unichr(0x00FC), # LATIN SMALL LETTER U WITH DIAERESIS
	b"\xA0": unichr(0x00A0), # NO-BREAK SPACE
	b"\xFD": unichr(0x00A9), # COPYRIGHT SIGN
	b"\xFE": unichr(0x2122), # TRADE MARK SIGN
	b"\xFF": unichr(0x2026), # HORIZONTAL ELLIPSIS
	}),
	}

	# ExtendCodec instances already built by search_function(), keyed by the
	# normalized codec name, so each codec is constructed at most once.
	_cache = {}

	def search_function(name):
		"""Codec search function for the extended encodings in _extended_encodings.

		name: encoding name being looked up; it is normalized with
			encodings.normalize_encoding() before matching.

		Returns the codecs.CodecInfo of the matching ExtendCodec, building and
		caching the codec on first use.  Returns None if the name is not one of
		ours, or if none of the candidate base encodings can be looked up.
		"""
		name = encodings.normalize_encoding(name) # Rather undocumented...
		if name not in _extended_encodings:
			return None

		if name not in _cache:
			base_encoding, mapping = _extended_encodings[name]
			assert(name[-4:] == "_ttx")
			# Python 2 didn't have any of the encodings that we are implementing
			# in this file. Python 3 added aliases for the East Asian ones, mapping
			# them "temporarily" to the same base encoding as us, with a comment
			# suggesting that full implementation will appear some time later.
			# As such, try the Python version of the x_mac_... first, if that is found,
			# use that as our base encoding. This would make our encoding upgrade
			# to the full encoding when and if Python finally implements that.
			# http://bugs.python.org/issue24041
			for candidate in (name[:-4], base_encoding):
				try:
					codecs.lookup(candidate)
				except LookupError:
					continue
				_cache[name] = ExtendCodec(name, candidate, mapping)
				break
			else:
				# No usable base codec: report "not found" rather than letting
				# the cache lookup below fail with KeyError.
				return None

		return _cache[name].info

	# Install the search function at import time so that codecs.lookup() can
	# resolve the extended encoding names listed in _extended_encodings.
	codecs.register(search_function)