Optimize decoder a bit
diff --git a/Makefile b/Makefile
index 76e3516..66f35d6 100644
--- a/Makefile
+++ b/Makefile
@@ -4,8 +4,12 @@
.PHONY: all sdist bdist_wheel clean docs
-FILES := Makefile MANIFEST.in pyjson5.pyx README.rst setup.py \
- src/native.hpp src/VERSION src/_unicode_cat_of.hpp
+INCLUDES := \
+ src/VERSION src/DESCRIPTION \
+ src/_decoder_recursive_select.hpp src/_unicode_cat_of.hpp \
+ src/_escape_dct.hpp src/_stack_heap_string.hpp src/native.hpp
+
+FILES := Makefile MANIFEST.in pyjson5.pyx README.rst setup.py ${INCLUDES}
DerivedGeneralCategory.txt:
wget -O $@ https://www.unicode.org/Public/12.1.0/ucd/extracted/DerivedGeneralCategory.txt
@@ -14,7 +18,7 @@
src/_unicode_cat_of.hpp: DerivedGeneralCategory.txt make_unicode_categories.py
python make_unicode_categories.py $< $@
-pyjson5.cpp: pyjson5.pyx $(wildcard src/*.pyx)
+pyjson5.cpp: pyjson5.pyx $(wildcard src/*.pyx) $(wildcard src/*.hpp)
python -m cython -f -o $@ $<
sdist: pyjson5.cpp ${FILES}
diff --git a/docs/performance.rst b/docs/performance.rst
index ef35eba..eb9935c 100644
--- a/docs/performance.rst
+++ b/docs/performance.rst
@@ -13,7 +13,7 @@
* Dataset: https://github.com/zemirco/sf-city-lots-json
* Version: Python 3.7.5 (default, Oct 27 2019, 15:43:29)
* CPU: AMD Ryzen 7 2700 @ 3.7GHz
-* :func:`pyjson5.decode`: **3.29 s** ± 8.66 ms per loop per loop per loop *(lower is better)*
+* :func:`pyjson5.decode`: **3.18 s** ± 4.89 ms per loop *(lower is better)*
* :func:`json.loads`: **2.64** s ± 12.7 ms ms per loop
* The decoder works correcty: ``json.loads(content) == pyjson5.loads(content)``
diff --git a/src/_constants.pyx b/src/_constants.pyx
index eba4878..1321fd2 100644
--- a/src/_constants.pyx
+++ b/src/_constants.pyx
@@ -3,7 +3,7 @@
cdef object CONST_NEG_NAN = float('-NaN')
cdef object CONST_NEG_INF = float('-Infinity')
-cdef object DATETIME_CLASSES = (date, time, datetime,)
+cdef object DATETIME_CLASSES = (date, time,) # issubclass(datetime, date) == True
cdef object ORD_CLASSES = (unicode, bytes, bytearray,)
cdef object UCS1_COMPATIBLE_CODECS = frozenset((
diff --git a/src/_decoder.pyx b/src/_decoder.pyx
index 6309c10..ee2708a 100644
--- a/src/_decoder.pyx
+++ b/src/_decoder.pyx
@@ -614,33 +614,39 @@
return True
-cdef object _decode_literal(ReaderRef reader, int32_t *c_in_out):
- cdef const char *tail
- cdef object result
- cdef uint32_t c0
- cdef int32_t c1
-
- c0 = cast_to_uint32(c_in_out[0])
- if c0 == b'n':
- tail = b'ull'
- result = None
- elif c0 == b't':
- tail = b'rue'
- result = True
- elif c0 == b'f':
- tail = b'alse'
- result = False
- elif c0 == b'I':
- tail = b'nfinity'
- result = CONST_POS_INF
- else: # elif c0 == b'N':
- tail = b'aN'
- result = CONST_POS_NAN
-
- _accept_string(reader, tail)
-
+cdef object _decode_null(ReaderRef reader, int32_t *c_in_out):
+ # Decode the literal `null`: the dispatching 'n' is already held in c_in_out[0], so only the tail "ull" is handed to _accept_string.
+ _accept_string(reader, b'ull')
c_in_out[0] = NO_EXTRA_DATA
- return result
+ return None
+
+
+cdef object _decode_true(ReaderRef reader, int32_t *c_in_out):
+ # Decode the literal `true`: the dispatching 't' is already held in c_in_out[0], so only the tail "rue" is handed to _accept_string.
+ _accept_string(reader, b'rue')
+ c_in_out[0] = NO_EXTRA_DATA
+ return True
+
+
+cdef object _decode_false(ReaderRef reader, int32_t *c_in_out):
+ # Decode the literal `false`: the dispatching 'f' is already held in c_in_out[0], so only the tail "alse" is handed to _accept_string.
+ _accept_string(reader, b'alse')
+ c_in_out[0] = NO_EXTRA_DATA
+ return False
+
+
+cdef object _decode_inf(ReaderRef reader, int32_t *c_in_out):
+ # Decode the literal `Infinity`: the dispatching 'I' is already held in c_in_out[0], so only the tail "nfinity" is handed to _accept_string.
+ _accept_string(reader, b'nfinity')
+ c_in_out[0] = NO_EXTRA_DATA
+ return CONST_POS_INF
+
+
+cdef object _decode_nan(ReaderRef reader, int32_t *c_in_out):
+ # Decode the literal `NaN`: the dispatching 'N' is already held in c_in_out[0], so only the tail "aN" is handed to _accept_string.
+ _accept_string(reader, b'aN')
+ c_in_out[0] = NO_EXTRA_DATA
+ return CONST_POS_NAN
cdef object _decode_recursive_enter(ReaderRef reader, int32_t *c_in_out):
@@ -688,22 +694,40 @@
cdef object _decode_recursive(ReaderRef reader, int32_t *c_in_out):
- cdef object (*decoder)(ReaderRef, int32_t*)
cdef int32_t c0
cdef uint32_t c1
+ cdef Py_ssize_t start
+ cdef DrsKind kind  # value category looked up from the first character, see drs_lookup
+ cdef object (*decoder)(ReaderRef, int32_t*)  # the decoder chosen below; called with the same (reader, c_in_out)
c0 = c_in_out[0]
c1 = cast_to_uint32(c0)
+ if c1 >= 128:  # drs_lookup has only 128 entries, so larger values must never reach the lookup
+ start = _reader_tell(reader)
+ _raise_expected_s('JSON5Value', start, c1)  # NOTE(review): assumes this always raises and the exception propagates; otherwise drs_lookup[c1] below is indexed out of bounds -- confirm
- decoder = _decoder_unknown
- if c1 in b'ntfIN':
- decoder = _decode_literal
- elif c1 in b'\'"':
+ kind = drs_lookup[c1]  # one table read replaces the former chain of `c1 in b'...'` membership tests
+ if kind == DRS_fail:
+ decoder = _decoder_unknown
+ elif kind == DRS_null:
+ decoder = _decode_null
+ elif kind == DRS_true:
+ decoder = _decode_true
+ elif kind == DRS_false:
+ decoder = _decode_false
+ elif kind == DRS_inf:
+ decoder = _decode_inf
+ elif kind == DRS_nan:
+ decoder = _decode_nan
+ elif kind == DRS_string:
decoder = _decode_string
- elif c1 in b'+-.0123456789':
+ elif kind == DRS_number:
decoder = _decode_number
- elif c1 in b'{[':
+ elif kind == DRS_recursive:
decoder = _decode_recursive_enter
+ else:
+ __builtin_unreachable()  # all nine DrsKind members are handled above; NOTE(review): compiler builtin, availability on non-GCC/Clang toolchains not shown here -- confirm
+ decoder = _decoder_unknown  # never executed; NOTE(review): presumably only keeps `decoder` definitely assigned -- confirm
return decoder(reader, c_in_out)
diff --git a/src/_decoder_recursive_select.hpp b/src/_decoder_recursive_select.hpp
new file mode 100644
index 0000000..392ea08
--- /dev/null
+++ b/src/_decoder_recursive_select.hpp
@@ -0,0 +1,37 @@
+#ifndef JSON5EncoderCpp_decoder_recursive_select
+#define JSON5EncoderCpp_decoder_recursive_select
+
+#include <cstdint>
+
+namespace JSON5EncoderCpp {
+inline namespace {
+
+enum DrsKind : std::uint8_t {  // decoder category selected from the first character of a value
+ DRS_fail,  // mapped to _decoder_unknown by _decode_recursive
+ DRS_null, DRS_true, DRS_false, DRS_inf, DRS_nan,  // literals introduced by n, t, f, I, N
+ DRS_string, DRS_number, DRS_recursive,  // quote characters; + - . and digits; [ and {
+};
+
+static const DrsKind drs_lookup[128] = {  // indexed by the ASCII code (0x00-0x7F) of a value's first character; each row label is a half-open range of 8 codes
+ /* 00-08 */ DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail,
+ /* 08-10 */ DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail,
+ /* 10-18 */ DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail,
+ /* 18-20 */ DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail,
+ /* 20-28 */ DRS_fail, DRS_fail, /*" = 0x22*/DRS_string, DRS_fail, DRS_fail, DRS_fail, /*& = 0x26*/DRS_fail, /*' = 0x27*/DRS_string,
+ /* 28-30 */ DRS_fail, DRS_fail, DRS_fail, /*+*/DRS_number, DRS_fail, /*-*/DRS_number, /*.*/DRS_number, DRS_fail,
+ /* 30-38 */ /*0*/DRS_number, DRS_number, DRS_number, DRS_number, DRS_number, DRS_number, DRS_number, DRS_number,
+ /* 38-40 */ /*8*/DRS_number, DRS_number, DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail,
+ /* 40-48 */ DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail,
+ /* 48-50 */ DRS_fail, /*I*/DRS_inf, DRS_fail, DRS_fail, DRS_fail, DRS_fail, /*N*/DRS_nan, DRS_fail,
+ /* 50-58 */ DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail,
+ /* 58-60 */ DRS_fail, DRS_fail, DRS_fail, /*[*/DRS_recursive, DRS_fail, DRS_fail, DRS_fail, DRS_fail,
+ /* 60-68 */ DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail, /*f*/DRS_false, DRS_fail,
+ /* 68-70 */ DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail, DRS_fail, /*n*/DRS_null, DRS_fail,
+ /* 70-78 */ DRS_fail, DRS_fail, DRS_fail, DRS_fail, /*t*/DRS_true, DRS_fail, DRS_fail, DRS_fail,
+ /* 78-80 */ DRS_fail, DRS_fail, DRS_fail, /*{*/DRS_recursive, DRS_fail, DRS_fail, DRS_fail, DRS_fail,
+};
+
+}
+}
+
+#endif
diff --git a/src/_encoder.pyx b/src/_encoder.pyx
index c196f34..3e7a26f 100644
--- a/src/_encoder.pyx
+++ b/src/_encoder.pyx
@@ -205,9 +205,6 @@
cdef boolean first
cdef object value
- if _encode_tojson(writer, data):
- return True
-
Py_EnterRecursiveCall(' while encoding nested JSON5 object')
try:
writer.append_c(writer, <char> b'[')
@@ -229,9 +226,6 @@
cdef boolean first
cdef object key, value
- if _encode_tojson(writer, data):
- return True
-
Py_EnterRecursiveCall(' while encoding nested JSON5 object')
try:
writer.append_c(writer, <char> b'{')
@@ -339,11 +333,23 @@
cdef boolean _encode_long(WriterRef writer, object data) except False:
- return _encode_format_string(writer, data, (<Options> writer.options).intformat)
+ _encode_format_string(writer, data, (<Options> writer.options).intformat)  # formats `data` with the configured intformat
+ return True  # "except False" makes False the error sentinel, so success is reported as True explicitly
cdef boolean _encode_decimal(WriterRef writer, object data) except False:
- return _encode_format_string(writer, data, (<Options> writer.options).decimalformat)
+ _encode_format_string(writer, data, (<Options> writer.options).decimalformat)  # formats `data` with the configured decimalformat
+ return True  # "except False" makes False the error sentinel, so success is reported as True explicitly
+
+
+cdef boolean _encode_iterable(WriterRef writer, object data) except False:
+ if _encode_tojson(writer, data):  # truthy result: neither the mapping nor the sequence encoder is called
+ pass
+ elif isinstance(data, (<Options> writer.options).mappingtypes):  # instances of the configured mapping types go to the mapping encoder
+ _encode_mapping(writer, data)
+ else:  # everything else that reaches this function is encoded as a sequence
+ _encode_sequence(writer, data)
+ return True  # "except False" makes False the error sentinel, so success is reported as True explicitly
cdef boolean _encode(WriterRef writer, object data) except False:
@@ -362,10 +368,7 @@
elif PyFloat_Check(data):
encoder = _encode_float
elif obj_has_iter(data):
- if isinstance(data, (<Options> writer.options).mappingtypes):
- encoder = _encode_mapping
- else:
- encoder = _encode_sequence
+ encoder = _encode_iterable
elif isinstance(data, Decimal):
encoder = _encode_decimal
elif isinstance(data, DATETIME_CLASSES):
@@ -373,7 +376,9 @@
else:
encoder = _encode_unknown
- return encoder(writer, data)
+ encoder(writer, data)
+
+ return True
cdef boolean _encode_callback_bytes(object data, object cb, object options) except False:
diff --git a/src/_imports.pyx b/src/_imports.pyx
index f578308..c70ea17 100644
--- a/src/_imports.pyx
+++ b/src/_imports.pyx
@@ -102,6 +102,15 @@
boolean push_back(T codepoint) except False
+cdef extern from 'src/_decoder_recursive_select.hpp' namespace 'JSON5EncoderCpp' nogil:
+ cdef enum DrsKind:  # declared in the header as `enum DrsKind : std::uint8_t`
+ DRS_fail,
+ DRS_null, DRS_true, DRS_false, DRS_inf, DRS_nan,
+ DRS_string, DRS_number, DRS_recursive
+
+ DrsKind drs_lookup[128]  # 128 entries only; _decode_recursive checks c1 >= 128 before indexing
+
+
cdef extern from 'Python.h':
enum:
PyUnicode_WCHAR_KIND