Add decode_utf8
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ae81d36..f14452a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,7 @@
* Update up Unicode 13.0.0
* Don't use non-standard ``__uint128``
* Add PyPy compatibility
+* Add ``decode_utf8(bytes-like)``
### 1.4.7
diff --git a/src/_decoder.pyx b/src/_decoder.pyx
index ce2a9a6..cd3c668 100644
--- a/src/_decoder.pyx
+++ b/src/_decoder.pyx
@@ -769,15 +769,16 @@
cdef object _decode_all(ReaderRef reader, boolean some):
- cdef object ex
+ cdef object ex, ex2
+ # Decode through _decode_all_sub(). An internal _DecoderException is
+ # translated into an instance of the exception class it carries (.cls),
+ # constructed from its .msg, .result and .extra fields.
try:
return _decode_all_sub(reader, some)
except _DecoderException as ex:
- raise (<_DecoderException> ex).cls(
+ ex2 = (<_DecoderException> ex).cls(
(<_DecoderException> ex).msg,
(<_DecoderException> ex).result,
(<_DecoderException> ex).extra,
)
+ # The replacement exception is built above and raised as its own statement.
+ raise ex2
cdef object _decode_ucs1(const void *string, Py_ssize_t length,
@@ -807,12 +808,27 @@
return _decode_all(reader, some)
+cdef object _decode_utf8(const void *string, Py_ssize_t length,
+ Py_ssize_t maxdepth, boolean some):
+ # Decode a raw UTF-8 buffer: wrap ``string`` in a ReaderUTF8 and hand it to
+ # _decode_all(). The callers in this patch pass ``length`` as a byte count.
+ # NOTE(review): the literal 0 is presumably the reader's start position;
+ # confirm against the ReaderUCS field order.
+ cdef ReaderUTF8 reader = ReaderUTF8(
+ ReaderUCS(length, 0, maxdepth),
+ <const Py_UCS1*> string,
+ )
+ return _decode_all(reader, some)
+
+
cdef object _decode_unicode(object data, Py_ssize_t maxdepth, boolean some):
cdef Py_ssize_t length
cdef int kind
+ cdef const char *s
PyUnicode_READY(data)
+ if CYTHON_COMPILING_IN_PYPY:
+ length = 0
+ s = PyUnicode_AsUTF8AndSize(data, &length)
+ return _decode_utf8(s, length, maxdepth, some)
+
length = PyUnicode_GET_LENGTH(data)
kind = PyUnicode_KIND(data)
@@ -831,7 +847,10 @@
cdef object (*decoder)(const void*, Py_ssize_t, Py_ssize_t, boolean)
cdef Py_ssize_t length = 0
- if wordlength == 1:
+ if wordlength == 0:
+ decoder = _decode_utf8
+ length = view.len // 1
+ elif wordlength == 1:
decoder = _decode_ucs1
length = view.len // 1
elif wordlength == 2:
diff --git a/src/_exports.pyx b/src/_exports.pyx
index e359fac..64500a0 100644
--- a/src/_exports.pyx
+++ b/src/_exports.pyx
@@ -62,7 +62,7 @@
.. code:: python
- decode_buffer(b'["Hello", "world!"]') == ['Hello', 'world!']
+ decode_latin1(b'["Hello", "world!"]') == ['Hello', 'world!']
Parameters
----------
@@ -88,6 +88,38 @@
return decode_buffer(data, maxdepth, bool(some), 1)
+def decode_utf8(object data, object maxdepth=None, object some=False):
+ '''
+ Decodes JSON5 serialized data from a ``bytes`` object.
+
+ .. code:: python
+
+ decode_utf8(b'["H\\xe2\\x82\\xacllo", "w\\xc3\\xb6rld!"]') == ['H€llo', 'wörld!']
+
+ Parameters
+ ----------
+ data : bytes
+ JSON5 serialized data, encoded as UTF-8 or ASCII.
+ maxdepth : Optional[int]
+ see `decode(...) <pyjson5.decode_>`_
+ some : bool
+ see `decode(...) <pyjson5.decode_>`_
+
+ Raises
+ ------
+ Json5DecoderException
+ An exception occurred while decoding.
+ TypeError
+ An argument had a wrong type.
+
+ Returns
+ -------
+ object
+ see `decode(...) <pyjson5.decode_>`_
+ '''
+ # A word length of 0 tells decode_buffer() to treat the data as UTF-8.
+ return decode_buffer(data, maxdepth, bool(some), 0)
+
+
def decode_buffer(object obj, object maxdepth=None, object some=False,
object wordlength=None):
'''
@@ -111,7 +143,7 @@
some : bool
see `decode(...) <pyjson5.decode_>`_
wordlength : Optional[int]
- Must be 1, 2, 4 to denote UCS1, USC2 or USC4 data.
+ Must be 0, 1, 2, 4 to denote UTF-8, UCS1, UCS2 or UCS4 data, resp.
Surrogates are not supported. Decode the data to an ``str`` if need be.
If ``None`` is supplied, then the buffer's ``itemsize`` is used.
@@ -574,7 +606,7 @@
__all__ = (
# DECODE
- 'decode', 'decode_latin1', 'decode_buffer', 'decode_callback', 'decode_io',
+ 'decode', 'decode_utf8', 'decode_latin1', 'decode_buffer', 'decode_callback', 'decode_io',
# ENCODE
'encode', 'encode_bytes', 'encode_callback', 'encode_io', 'encode_noop', 'Options',
# LEGACY
diff --git a/src/_imports.pyx b/src/_imports.pyx
index 139b356..34e8ca3 100644
--- a/src/_imports.pyx
+++ b/src/_imports.pyx
@@ -198,6 +198,11 @@
boolean expect 'JSON5EncoderCpp_expect'(boolean actual, boolean expected)
+cdef extern from * nogil:
+ # Declares the externally defined integer constant CYTHON_COMPILING_IN_PYPY
+ # so that .pyx code can test it; _decode_unicode uses it to take the UTF-8 path.
+ enum:
+ CYTHON_COMPILING_IN_PYPY
+
+
cdef type datetime, date, time, Decimal, Mapping, IOBase
cdef object saferepr
diff --git a/src/_reader_ucs.pyx b/src/_reader_ucs.pyx
index f94a5f2..bf187dc 100644
--- a/src/_reader_ucs.pyx
+++ b/src/_reader_ucs.pyx
@@ -19,9 +19,15 @@
const Py_UCS4 *string
+cdef struct ReaderUTF8:
+ # Reader state for UTF-8 encoded input.
+ # ``base`` holds the common ReaderUCS state.
+ ReaderUCS base
+ # ``string`` points into the raw bytes, which _reader_utf8_get() consumes
+ # one byte at a time.
+ const Py_UCS1 *string
+
+
ctypedef ReaderUCS1 &ReaderUCS1Ref
ctypedef ReaderUCS2 &ReaderUCS2Ref
ctypedef ReaderUCS4 &ReaderUCS4Ref
+ctypedef ReaderUTF8 &ReaderUTF8Ref
ctypedef Py_UCS1 *UCS1String
ctypedef Py_UCS2 *UCS2String
@@ -31,6 +37,7 @@
ReaderUCS1Ref
ReaderUCS2Ref
ReaderUCS4Ref
+ ReaderUTF8Ref
ctypedef fused UCSString:
UCS1String
@@ -50,3 +57,32 @@
self.base.position += 1
return cast_to_uint32(c)
+
+
+cdef inline uint32_t _reader_utf8_get(ReaderUCSRef self):
+ # Read one code point from UTF-8 input: fetch the lead byte with
+ # _reader_ucs_get(), then merge in the continuation bytes it announces.
+ # Decoding is lenient and this function itself reports no error: a stray
+ # continuation byte or a lead byte announcing 5+ bytes is returned unchanged,
+ # and if the input ends in the middle of a sequence the partially assembled
+ # value is returned.
+ cdef uint32_t c0 = _reader_ucs_get(self)
+ cdef unsigned int n
+
+ # Classify the lead byte by its high bits; for multi-byte sequences keep only
+ # its payload bits in ``c0`` and store the number of continuation bytes in ``n``.
+ if (c0 & 0b1_0000000) == 0b0_0000000: # ASCII
+ return c0
+ elif (c0 & 0b11_000000) == 0b10_000000: # broken continuation
+ return c0
+ elif (c0 & 0b111_00000) == 0b110_00000: # 2 bytes
+ c0 = (c0 & 0b000_11111)
+ n = 1
+ elif (c0 & 0b1111_0000) == 0b1110_0000: # 3 bytes
+ c0 = (c0 & 0b0000_1111)
+ n = 2
+ elif (c0 & 0b11111_000) == 0b11110_000: # 4 bytes
+ c0 = (c0 & 0b00000_111)
+ n = 3
+ else: # 5+ bytes, invalid
+ return c0
+
+ # Count down the continuation bytes that are still expected.
+ for n in range(n, 0, -1):
+ if not _reader_ucs_good(self):
+ return c0
+
+ # Append the low six bits of the next byte; its 0b10 continuation marker
+ # is not verified.
+ c0 = (c0 << 6) | (_reader_ucs_get(self) & 0b00_111111)
+
+ return c0
diff --git a/src/_readers.pyx b/src/_readers.pyx
index fff1ba0..e0a910e 100644
--- a/src/_readers.pyx
+++ b/src/_readers.pyx
@@ -24,10 +24,14 @@
cdef inline uint32_t _reader_get(ReaderRef self):
- if ReaderRef in ReaderUCSRef:
- return _reader_ucs_get(self)
+ cdef uint32_t c0
+ # Return the next code point from ``self``. The branches test the fused type
+ # ``ReaderRef``, so each specialization uses exactly one getter.
+ # UTF-8 is tested before the generic UCS branch: _reader_utf8_get() accepts a
+ # ReaderUCSRef, so a ReaderUTF8Ref would presumably match ``in ReaderUCSRef`` too.
+ if ReaderRef is ReaderUTF8Ref:
+ c0 = _reader_utf8_get(self)
+ elif ReaderRef in ReaderUCSRef:
+ c0 = _reader_ucs_get(self)
elif ReaderRef is ReaderCallbackRef:
- return _reader_Callback_get(self)
+ c0 = _reader_Callback_get(self)
+ return c0
cdef int32_t _reader_good(ReaderRef self) except -1: