Make encoder a tiny bit faster and doc performance
diff --git a/docs/index.rst b/docs/index.rst
index 6f09e82..fd6374e 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -38,6 +38,7 @@
encoder.rst
decoder.rst
exceptions.rst
+ performance.rst
Quick Summary
diff --git a/docs/performance.rst b/docs/performance.rst
new file mode 100644
index 0000000..4cf000a
--- /dev/null
+++ b/docs/performance.rst
@@ -0,0 +1,34 @@
+Performance
+===========
+
+This library is written in Cython for better performance than a pure-Python implementation could give you.
+
+
+Decoder Performance
+-------------------
+
+The library is a bit slower than the shipped ``json`` module for *pure* JSON data.
+If you know that your input does not use any JSON5 extensions, then this library is probably not what you need.
+
+* Dataset: https://github.com/zemirco/sf-city-lots-json
+* CPU: Core i7-3770 @ 3.40GHz
+* :func:`pyjson5.decode`: **4.58** s ± 68.6 ms per loop *(lower is better)*
+* :func:`json.loads`: **3.27** s ± 27.7 ms per loop
+* The decoder works correctly: ``json.loads(content) == pyjson5.loads(content)``
+
+
+Encoder Performance
+-------------------
+
+The encoder generates pure JSON data if there are no infinite or NaN values in the input, which are invalid in JSON.
+The serialized data is XML-safe, i.e. there are no chevrons ``<>``, ampersands ``&``, apostrophes ``'`` or control characters in the output.
+The output is always ASCII regardless of whether you call :func:`pyjson5.encode` or :func:`pyjson5.encode_bytes`.
+
+* Dataset: https://github.com/zemirco/sf-city-lots-json
+* CPU: Core i7-3770 @ 3.40GHz
+* :func:`pyjson5.encode`: **8.54** s ± 29.3 ms per loop *(lower is better)*
+* :func:`json.dumps`: **4.68** s ± 20.4 ms per loop
+* :func:`json.dumps` + :func:`xml.sax.saxutils.escape`: **5.02** s ± 141 ms per loop
+* The encoder works correctly: ``obj == json.loads(pyjson5.encode(obj, floatformat='%.16e'))``
+
+Unless you need the advanced settings in :class:`pyjson5.Options`, you most likely won't benefit from using this library as an encoder.
diff --git a/src/_decoder.pyx b/src/_decoder.pyx
index 9f6b8d1..3dc963d 100644
--- a/src/_decoder.pyx
+++ b/src/_decoder.pyx
@@ -511,13 +511,13 @@
try:
value = _decode_recursive(reader, &c0)
except _DecoderException as ex:
- result[key] = (<_DecoderException> ex).result
+ PyDict_SetItem(result, key, (<_DecoderException> ex).result)
raise
if expect(c0 < 0, False):
break
- result[key] = value
+ PyDict_SetItem(result, key, value)
done = _skip_comma(
reader, start, <unsigned char>b'}', b'object', &c0,
@@ -550,13 +550,13 @@
try:
value = _decode_recursive(reader, &c0)
except _DecoderException as ex:
- result.append((<_DecoderException> ex).result)
+ PyList_Append(result, (<_DecoderException> ex).result)
raise
if expect(c0 < 0, False):
break
- result.append(value)
+ PyList_Append(result, value)
done = _skip_comma(
reader, start, <unsigned char>b']', b'array', &c0,
diff --git a/src/_encoder.pyx b/src/_encoder.pyx
index f151597..1fafc51 100644
--- a/src/_encoder.pyx
+++ b/src/_encoder.pyx
@@ -14,62 +14,83 @@
cdef boolean _encode_unicode_impl(WriterRef writer, UCSString data, Py_ssize_t length) except False:
- cdef char buf[16]
+ cdef char buf[32]
cdef uint32_t c
cdef uint32_t s1, s2
- cdef Py_ssize_t index
cdef const char *escaped_string
cdef Py_ssize_t escaped_length
- cdef size_t unescaped_length
+ cdef size_t unescaped_length, index
+ cdef Py_ssize_t sublength
if length > 0:
writer.reserve(writer, 2 + length)
writer.append_c(writer, <char> b'"')
- if UCSString is UCS1String:
- while True:
- unescaped_length = ESCAPE_DCT.find_unescaped_range(<const char*> data, length)
- if unescaped_length > 0:
+ while True:
+ if UCSString is UCS1String:
+ sublength = length
+ else:
+ sublength = min(length, <Py_ssize_t> sizeof(buf))
+
+ unescaped_length = ESCAPE_DCT.find_unescaped_range(data, sublength)
+ if unescaped_length > 0:
+ if UCSString is UCS1String:
writer.append_s(writer, <const char*> data, unescaped_length)
+ else:
+ for index in range(unescaped_length):
+ buf[index] = <const char> data[index]
+ writer.append_s(writer, buf, unescaped_length)
- data += unescaped_length
- length -= unescaped_length
- if length <= 0:
- break
+ data += unescaped_length
+ length -= unescaped_length
+ if length <= 0:
+ break
- c = data[0]
+ if UCSString is not UCS1String:
+ continue
+
+ c = data[0]
+ if (UCSString is UCS1String) or (c < 0x100):
escaped_string = &ESCAPE_DCT.items[c][0]
escaped_length = ESCAPE_DCT.items[c][7]
writer.append_s(writer, escaped_string, escaped_length)
+ elif (UCSString is UCS2String) or (c <= 0xffff):
+ buf[0] = '\\';
+ buf[1] = 'u';
+ buf[2] = HEX[(c >> (4*3)) & 0xf];
+ buf[3] = HEX[(c >> (4*2)) & 0xf];
+ buf[4] = HEX[(c >> (4*1)) & 0xf];
+ buf[5] = HEX[(c >> (4*0)) & 0xf];
+ buf[6] = 0;
- data += 1
- length -= 1
- if length <= 0:
- break
- else:
- for index in range(length):
- c = data[index]
- if UCSString is UCS2String:
- if not ESCAPE_DCT.is_escaped(c):
- writer.append_c(writer, <char> <unsigned char> c)
- else:
- escaped_string = &ESCAPE_DCT.items[c][0]
- escaped_length = ESCAPE_DCT.items[c][7]
- writer.append_s(writer, escaped_string, escaped_length)
- elif UCSString is UCS4String:
- if not ESCAPE_DCT.is_escaped(c):
- writer.append_c(writer, <char> <unsigned char> c)
- elif c < 0x10000:
- escaped_string = &ESCAPE_DCT.items[c][0]
- escaped_length = ESCAPE_DCT.items[c][7]
- writer.append_s(writer, escaped_string, escaped_length)
- else:
- # surrogate pair
- c -= 0x10000
- s1 = 0xd800 | ((c >> 10) & 0x3ff)
- s2 = 0xdc00 | (c & 0x3ff)
+ writer.append_s(writer, buf, 6);
+ else:
+ # surrogate pair
+ c -= 0x10000
+ s1 = 0xd800 | ((c >> 10) & 0x3ff)
+ s2 = 0xdc00 | (c & 0x3ff)
- snprintf(buf, sizeof(buf), b'\\u%04x\\u%04x', s1, s2)
- writer.append_s(writer, buf, 2 * 6)
+ buf[0x0] = '\\';
+ buf[0x1] = 'u';
+ buf[0x2] = HEX[(s1 >> (4*3)) & 0xf];
+ buf[0x3] = HEX[(s1 >> (4*2)) & 0xf];
+ buf[0x4] = HEX[(s1 >> (4*1)) & 0xf];
+ buf[0x5] = HEX[(s1 >> (4*0)) & 0xf];
+
+ buf[0x6] = '\\';
+ buf[0x7] = 'u';
+ buf[0x8] = HEX[(s2 >> (4*3)) & 0xf];
+ buf[0x9] = HEX[(s2 >> (4*2)) & 0xf];
+ buf[0xa] = HEX[(s2 >> (4*1)) & 0xf];
+ buf[0xb] = HEX[(s2 >> (4*0)) & 0xf];
+
+ buf[0xc] = 0;
+
+ writer.append_s(writer, buf, 12);
+
+ data += 1
+ length -= 1
+ if length <= 0:
+ break
writer.append_c(writer, <char> b'"')
else:
writer.append_s(writer, b'""', 2)
diff --git a/src/_imports.pyx b/src/_imports.pyx
index c139f71..c4d47a8 100644
--- a/src/_imports.pyx
+++ b/src/_imports.pyx
@@ -8,8 +8,10 @@
PyBytes_AsStringAndSize, PyBytes_FromStringAndSize, PyBytes_Check,
)
from cpython.datetime cimport datetime, date, time
+from cpython.dict cimport PyDict_SetItem
from cpython.float cimport PyFloat_Check, PyFloat_AsDouble
from cpython.int cimport PyInt_Check
+from cpython.list cimport PyList_Append
from cpython.long cimport PyLong_FromString, PyLong_Check
from cpython.object cimport PyObject
from cpython.type cimport PyType_Check
@@ -57,6 +59,12 @@
void swap[T](T&, T&)
+cdef extern from 'Python.h':
+ ctypedef signed char Py_UCS1
+ ctypedef signed short Py_UCS2
+ ctypedef signed long Py_UCS4
+
+
cdef extern from 'src/native.hpp' namespace 'JSON5EncoderCpp' nogil:
int32_t cast_to_int32(...)
uint32_t cast_to_uint32(...)
@@ -65,10 +73,12 @@
boolean obj_has_iter(object obj)
ctypedef char EscapeDctItem[8]
- struct EscapeDct:
- EscapeDctItem items[0x10000]
+ cppclass EscapeDct:
+ EscapeDctItem items[0x100]
boolean is_escaped(uint32_t c)
- Py_ssize_t find_unescaped_range(const char *start, Py_ssize_t length)
+ Py_ssize_t find_unescaped_range(const Py_UCS1 *start, Py_ssize_t length)
+ Py_ssize_t find_unescaped_range(const Py_UCS2 *start, Py_ssize_t length)
+ Py_ssize_t find_unescaped_range(const Py_UCS4 *start, Py_ssize_t length)
EscapeDct ESCAPE_DCT
enum:
@@ -79,12 +89,10 @@
LONGDESCRIPTION_LENGTH
const char LONGDESCRIPTION[]
+ const char HEX[]
+
cdef extern from 'Python.h':
- ctypedef signed char Py_UCS1
- ctypedef signed short Py_UCS2
- ctypedef signed long Py_UCS4
-
enum:
PyUnicode_WCHAR_KIND
PyUnicode_1BYTE_KIND
diff --git a/src/native.hpp b/src/native.hpp
index b608265..0be19ef 100644
--- a/src/native.hpp
+++ b/src/native.hpp
@@ -58,15 +58,16 @@
return (i != nullptr) && (i != &_PyObject_NextNotImplemented);
}
+constexpr char HEX[] = "0123456789abcdef";
+
struct EscapeDct {
using Item = std::array<char, 8>; // 7 are needed, 1 length
- static constexpr std::size_t length = 0x10000;
+ static constexpr std::size_t length = 0x100;
Item items[length];
unsigned __int128 is_escaped_array;
static constexpr Item unicode_item(size_t index) {
- constexpr char HEX[] = "0123456789abcdef";
return {{
'\\',
'u',
@@ -87,14 +88,30 @@
return {{ (char) (unsigned char) chr, 0, 0, 0, 0, 0, 0, 1 }};
}
- inline bool is_escaped(uint32_t c) const {
+ inline bool is_escaped(std::uint32_t c) const {
return (c >= 0x0080) || (is_escaped_array & (
static_cast<unsigned __int128>(1) <<
static_cast<std::uint8_t>(c)
));
}
- inline std::size_t find_unescaped_range(const char *start, Py_ssize_t length) const {
+ inline std::size_t find_unescaped_range(const Py_UCS1 *start, Py_ssize_t length) const {
+ Py_ssize_t index = 0;
+ while ((index < length) && !is_escaped(start[index])) {
+ ++index;
+ }
+ return index;
+ }
+
+ inline std::size_t find_unescaped_range(const Py_UCS2 *start, Py_ssize_t length) const {
+ Py_ssize_t index = 0;
+ while ((index < length) && !is_escaped(start[index])) {
+ ++index;
+ }
+ return index;
+ }
+
+ inline std::size_t find_unescaped_range(const Py_UCS4 *start, Py_ssize_t length) const {
Py_ssize_t index = 0;
while ((index < length) && !is_escaped(start[index])) {
++index;
@@ -122,12 +139,12 @@
);
}
}
- items[(uint8_t) '\\'] = escaped_item('\\');
- items[(uint8_t) '\b'] = escaped_item('b');
- items[(uint8_t) '\f'] = escaped_item('f');
- items[(uint8_t) '\n'] = escaped_item('n');
- items[(uint8_t) '\r'] = escaped_item('r');
- items[(uint8_t) '\t'] = escaped_item('t');
+ items[(std::uint8_t) '\\'] = escaped_item('\\');
+ items[(std::uint8_t) '\b'] = escaped_item('b');
+ items[(std::uint8_t) '\f'] = escaped_item('f');
+ items[(std::uint8_t) '\n'] = escaped_item('n');
+ items[(std::uint8_t) '\r'] = escaped_item('r');
+ items[(std::uint8_t) '\t'] = escaped_item('t');
}
};