minor changes
- fixed a bug that did not discard strings with control characters
between 0x10 and 0x1f
- added termination proofs for two important loops
- made get_ref() constexpr
diff --git a/doc/images/callback_events.png b/doc/images/callback_events.png
new file mode 100644
index 0000000..09aa2b3
--- /dev/null
+++ b/doc/images/callback_events.png
Binary files differ
diff --git a/src/json.hpp b/src/json.hpp
index 2915532..00bee97 100644
--- a/src/json.hpp
+++ b/src/json.hpp
@@ -717,7 +717,19 @@
This enumeration collects the different JSON types. It is internally used
to distinguish the stored values, and the functions @ref is_null(), @ref
is_object(), @ref is_array(), @ref is_string(), @ref is_boolean(), @ref
- is_number(), and @ref is_discarded() rely on it.
+ is_number() (with @ref is_number_integer(), @ref is_number_unsigned(), and
+ @ref is_number_float()), @ref is_discarded(), @ref is_primitive(), and
+ @ref is_structured() rely on it.
+
+ @note There are three enumeration entries (number_integer,
+ number_unsigned, and number_float), because the library distinguishes
+ these three types for numbers: @ref number_unsigned_t is used for unsigned
+ integers, @ref number_integer_t is used for signed integers, and @ref
+ number_float_t is used for floating-point numbers or to approximate
+ integers which do not fit in the limits of their respective type.
+
+ @sa @ref basic_json(const value_t value_type) -- create a JSON value with
+ the default value for a given type
@since version 1.0.0
*/
@@ -728,7 +740,7 @@
array, ///< array (ordered collection of values)
string, ///< string value
boolean, ///< boolean value
- number_integer, ///< number value (integer)
+ number_integer, ///< number value (signed integer)
number_unsigned, ///< number value (unsigned integer)
number_float, ///< number value (floating-point)
discarded ///< discarded by the the parser callback function
@@ -758,7 +770,24 @@
/*!
@brief a JSON value
- The actual storage for a JSON value of the @ref basic_json class.
+ The actual storage for a JSON value of the @ref basic_json class. This
+ union combines the different storage types for the JSON value types
+ defined in @ref value_t.
+
+ JSON type | value_t type | used type
+ --------- | --------------- | ------------------------
+ object | object | pointer to @ref object_t
+ array | array | pointer to @ref array_t
+ string | string | pointer to @ref string_t
+ boolean | boolean | @ref boolean_t
+ number | number_integer | @ref number_integer_t
+ number | number_unsigned | @ref number_unsigned_t
+ number | number_float | @ref number_float_t
+ null | null | *no value is stored*
+
+ @note Variable-length types (objects, arrays, and strings) are stored as
+ pointers. The size of the union should not exceed 64 bits if the default
+ value types are used.
@since version 1.0.0
*/
@@ -874,6 +903,8 @@
This enumeration lists the parser events that can trigger calling a
callback function of type @ref parser_callback_t during parsing.
+ @image html callback_events.png "Example when certain parse events are triggered"
+
@since version 1.0.0
*/
enum class parse_event_t : uint8_t
@@ -916,6 +947,8 @@
parse_event_t::array_end | the parser read `]` and finished processing a JSON array | depth of the parent of the JSON array | the parsed JSON array
parse_event_t::value | the parser finished reading a JSON value | depth of the value | the parsed JSON value
+ @image html callback_events.png "Example when certain parse events are triggered"
+
Discarding a value (i.e., returning `false`) has different effects
depending on the context in which function was called:
@@ -2773,21 +2806,16 @@
type of the current JSON
*/
template<typename ReferenceType, typename ThisType>
- static ReferenceType get_ref_impl(ThisType& obj)
+ static constexpr ReferenceType get_ref_impl(ThisType& obj)
{
- // delegate the call to get_ptr<>()
+ // helper type
using PointerType = typename std::add_pointer<ReferenceType>::type;
- auto ptr = obj.template get_ptr<PointerType>();
- if (ptr != nullptr)
- {
- return *ptr;
- }
- else
- {
- throw std::domain_error("incompatible ReferenceType for get_ref, actual type is " +
- obj.type_name());
- }
+ // delegate the call to get_ptr<>()
+ return obj.template get_ptr<PointerType>() != nullptr
+ ? *obj.template get_ptr<PointerType>()
+ : throw std::domain_error("incompatible ReferenceType for get_ref, actual type is " +
+ obj.type_name());
}
public:
@@ -3015,7 +3043,7 @@
std::is_reference<ReferenceType>::value
and std::is_const<typename std::remove_reference<ReferenceType>::type>::value
, int>::type = 0>
- ReferenceType get_ref() const
+ constexpr ReferenceType get_ref() const
{
// delegate call to get_ref_impl
return get_ref_impl<ReferenceType>(*this);
@@ -7286,6 +7314,8 @@
@throw std::invalid_argument if the low surrogate is invalid; example:
`""missing or wrong low surrogate""`
+ @complexity Constant.
+
@see <http://en.wikipedia.org/wiki/UTF-8#Sample_code>
*/
static string_t to_unicode(const std::size_t codepoint1,
@@ -7402,6 +7432,17 @@
function consists of a large block of code with `goto` jumps.
@return the class of the next token read from the buffer
+
+ @complexity Linear in the length of the input.\n
+
+ Proposition: The loop below will always terminate for finite input.\n
+
+ Proof (by contradiction): Assume a finite input. To loop forever, the
+ loop must never hit code with a `break` statement. The only code
+ snippets without a `break` statement are the continue statements for
+ whitespace and byte-order-marks. To loop forever, the input must be an
+ infinite sequence of whitespace or byte-order-marks. This contradicts
+ the assumption of finite input, q.e.d.
*/
token_type scan() noexcept
{
@@ -7422,8 +7463,8 @@
{
0, 0, 0, 0, 0, 0, 0, 0,
0, 32, 32, 0, 0, 32, 0, 0,
- 128, 128, 128, 128, 128, 128, 128, 128,
- 128, 128, 128, 128, 128, 128, 128, 128,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
160, 128, 0, 128, 128, 128, 128, 128,
128, 128, 128, 128, 128, 128, 128, 128,
192, 192, 192, 192, 192, 192, 192, 192,
@@ -7602,7 +7643,7 @@
basic_json_parser_9:
yyaccept = 0;
yych = *(m_marker = ++m_cursor);
- if (yych <= 0x0F)
+ if (yych <= 0x1F)
{
goto basic_json_parser_5;
}
@@ -7760,7 +7801,7 @@
{
goto basic_json_parser_31;
}
- if (yych <= 0x0F)
+ if (yych <= 0x1F)
{
goto basic_json_parser_33;
}
@@ -8233,16 +8274,53 @@
according to the nature of the escape. Some escapes create new
characters (e.g., `"\\n"` is replaced by `"\n"`), some are copied
as is (e.g., `"\\\\"`). Furthermore, Unicode escapes of the shape
- `"\\uxxxx"` need special care. In this case, to_unicode takes care
- of the construction of the values.
+ `"\\uxxxx"` need special care. In this case, @ref to_unicode takes
+ care of the construction of the values.
2. Unescaped characters are copied as is.
+ @pre `m_cursor - m_start >= 2`, meaning the length of the last token
+ is at least 2 bytes which is trivially true for any string (which
+ consists of at least two quotes).
+
+ " c1 c2 c3 ... "
+ ^ ^
+ m_start m_cursor
+
+ @complexity Linear in the length of the string.\n
+
+ Lemma: The loop body will always terminate.\n
+
+ Proof (by contradiction): Assume the loop body does not terminate. As
+ the loop body does not contain another loop, one of the called
+ functions must never return. The called functions are `std::strtoul`
+ and @ref to_unicode. Neither function can loop forever, so the loop
+ body will never loop forever which contradicts the assumption that the
+ loop body does not terminate, q.e.d.\n
+
+ Lemma: The loop condition for the for loop is eventually false.\n
+
+ Proof (by contradiction): Assume the loop does not terminate. Due to
+ the above lemma, this can only be due to a tautological loop
+ condition; that is, the loop condition i < m_cursor - 1 must always be
+ true. Let x be the change of i for any loop iteration. Then
+ m_start + 1 + x < m_cursor - 1 must hold to loop indefinitely.
+ This can be rephrased to m_cursor - m_start - 2 > x. With the
+ precondition, we have x <= 0, meaning that the loop condition holds
+ indefinitely if i is always decreased. However, observe that the
+ value of i is strictly increasing with each iteration, as it is
+ incremented by 1 in the iteration expression and never
+ decremented inside the loop body. Hence, the loop condition
+ will eventually be false which contradicts the assumption that
+ the loop condition is a tautology, q.e.d.
+
@return string value of current token without opening and closing
quotes
@throw std::out_of_range if to_unicode fails
*/
string_t get_string() const
{
+ assert(m_cursor - m_start >= 2);
+
string_t result;
result.reserve(static_cast<size_t>(m_cursor - m_start - 2));
@@ -8915,6 +8993,8 @@
/*!
@brief create and return a reference to the pointed to value
+
+ @complexity Linear in the number of reference tokens.
*/
reference get_and_create(reference j) const
{
@@ -9352,6 +9432,7 @@
basic_json result;
// iterate the JSON object values
+ assert(value.m_value.object != nullptr);
for (const auto& element : *value.m_value.object)
{
if (not element.second.is_primitive())
diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c
index 8eeedef..05c49ec 100644
--- a/src/json.hpp.re2c
+++ b/src/json.hpp.re2c
@@ -717,7 +717,19 @@
This enumeration collects the different JSON types. It is internally used
to distinguish the stored values, and the functions @ref is_null(), @ref
is_object(), @ref is_array(), @ref is_string(), @ref is_boolean(), @ref
- is_number(), and @ref is_discarded() rely on it.
+ is_number() (with @ref is_number_integer(), @ref is_number_unsigned(), and
+ @ref is_number_float()), @ref is_discarded(), @ref is_primitive(), and
+ @ref is_structured() rely on it.
+
+ @note There are three enumeration entries (number_integer,
+ number_unsigned, and number_float), because the library distinguishes
+ these three types for numbers: @ref number_unsigned_t is used for unsigned
+ integers, @ref number_integer_t is used for signed integers, and @ref
+ number_float_t is used for floating-point numbers or to approximate
+ integers which do not fit in the limits of their respective type.
+
+ @sa @ref basic_json(const value_t value_type) -- create a JSON value with
+ the default value for a given type
@since version 1.0.0
*/
@@ -728,7 +740,7 @@
array, ///< array (ordered collection of values)
string, ///< string value
boolean, ///< boolean value
- number_integer, ///< number value (integer)
+ number_integer, ///< number value (signed integer)
number_unsigned, ///< number value (unsigned integer)
number_float, ///< number value (floating-point)
discarded ///< discarded by the the parser callback function
@@ -758,7 +770,24 @@
/*!
@brief a JSON value
- The actual storage for a JSON value of the @ref basic_json class.
+ The actual storage for a JSON value of the @ref basic_json class. This
+ union combines the different storage types for the JSON value types
+ defined in @ref value_t.
+
+ JSON type | value_t type | used type
+ --------- | --------------- | ------------------------
+ object | object | pointer to @ref object_t
+ array | array | pointer to @ref array_t
+ string | string | pointer to @ref string_t
+ boolean | boolean | @ref boolean_t
+ number | number_integer | @ref number_integer_t
+ number | number_unsigned | @ref number_unsigned_t
+ number | number_float | @ref number_float_t
+ null | null | *no value is stored*
+
+ @note Variable-length types (objects, arrays, and strings) are stored as
+ pointers. The size of the union should not exceed 64 bits if the default
+ value types are used.
@since version 1.0.0
*/
@@ -874,6 +903,8 @@
This enumeration lists the parser events that can trigger calling a
callback function of type @ref parser_callback_t during parsing.
+ @image html callback_events.png "Example when certain parse events are triggered"
+
@since version 1.0.0
*/
enum class parse_event_t : uint8_t
@@ -916,6 +947,8 @@
parse_event_t::array_end | the parser read `]` and finished processing a JSON array | depth of the parent of the JSON array | the parsed JSON array
parse_event_t::value | the parser finished reading a JSON value | depth of the value | the parsed JSON value
+ @image html callback_events.png "Example when certain parse events are triggered"
+
Discarding a value (i.e., returning `false`) has different effects
depending on the context in which function was called:
@@ -2773,21 +2806,16 @@
type of the current JSON
*/
template<typename ReferenceType, typename ThisType>
- static ReferenceType get_ref_impl(ThisType& obj)
+ static constexpr ReferenceType get_ref_impl(ThisType& obj)
{
- // delegate the call to get_ptr<>()
+ // helper type
using PointerType = typename std::add_pointer<ReferenceType>::type;
- auto ptr = obj.template get_ptr<PointerType>();
- if (ptr != nullptr)
- {
- return *ptr;
- }
- else
- {
- throw std::domain_error("incompatible ReferenceType for get_ref, actual type is " +
- obj.type_name());
- }
+ // delegate the call to get_ptr<>()
+ return obj.template get_ptr<PointerType>() != nullptr
+ ? *obj.template get_ptr<PointerType>()
+ : throw std::domain_error("incompatible ReferenceType for get_ref, actual type is " +
+ obj.type_name());
}
public:
@@ -3015,7 +3043,7 @@
std::is_reference<ReferenceType>::value
and std::is_const<typename std::remove_reference<ReferenceType>::type>::value
, int>::type = 0>
- ReferenceType get_ref() const
+ constexpr ReferenceType get_ref() const
{
// delegate call to get_ref_impl
return get_ref_impl<ReferenceType>(*this);
@@ -7286,6 +7314,8 @@
@throw std::invalid_argument if the low surrogate is invalid; example:
`""missing or wrong low surrogate""`
+ @complexity Constant.
+
@see <http://en.wikipedia.org/wiki/UTF-8#Sample_code>
*/
static string_t to_unicode(const std::size_t codepoint1,
@@ -7402,6 +7432,17 @@
function consists of a large block of code with `goto` jumps.
@return the class of the next token read from the buffer
+
+ @complexity Linear in the length of the input.\n
+
+ Proposition: The loop below will always terminate for finite input.\n
+
+ Proof (by contradiction): Assume a finite input. To loop forever, the
+ loop must never hit code with a `break` statement. The only code
+ snippets without a `break` statement are the continue statements for
+ whitespace and byte-order-marks. To loop forever, the input must be an
+ infinite sequence of whitespace or byte-order-marks. This contradicts
+ the assumption of finite input, q.e.d.
*/
token_type scan() noexcept
{
@@ -7447,32 +7488,32 @@
"false" { last_token_type = token_type::literal_false; break; }
// number
- decimal_point = [.];
+ decimal_point = ".";
digit = [0-9];
digit_1_9 = [1-9];
- e = [eE];
- minus = [-];
- plus = [+];
- zero = [0];
- exp = e (minus|plus)? digit+;
+ e = "e" | "E";
+ minus = "-";
+ plus = "+";
+ zero = "0";
+ exp = e (minus | plus)? digit+;
frac = decimal_point digit+;
- int = (zero|digit_1_9 digit*);
+ int = (zero | digit_1_9 digit*);
number = minus? int frac? exp?;
number { last_token_type = token_type::value_number; break; }
// string
- quotation_mark = ["];
- escape = [\\];
- unescaped = [^"\\\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F];
- single_escaped = ["\\/bfnrt];
- unicode_escaped = [u][0-9a-fA-F]{4};
+ quotation_mark = "\"";
+ escape = "\\";
+ unescaped = [^"\\\x00-\x1f];
+ single_escaped = "\"" | "\\" | "/" | "b" | "f" | "n" | "r" | "t";
+ unicode_escaped = "u" [0-9a-fA-F]{4};
escaped = escape (single_escaped | unicode_escaped);
char = unescaped | escaped;
string = quotation_mark char* quotation_mark;
string { last_token_type = token_type::value_string; break; }
// end of file
- '\000' { last_token_type = token_type::end_of_input; break; }
+ "\000" { last_token_type = token_type::end_of_input; break; }
// anything else is an error
. { last_token_type = token_type::parse_error; break; }
@@ -7530,16 +7571,53 @@
according to the nature of the escape. Some escapes create new
characters (e.g., `"\\n"` is replaced by `"\n"`), some are copied
as is (e.g., `"\\\\"`). Furthermore, Unicode escapes of the shape
- `"\\uxxxx"` need special care. In this case, to_unicode takes care
- of the construction of the values.
+ `"\\uxxxx"` need special care. In this case, @ref to_unicode takes
+ care of the construction of the values.
2. Unescaped characters are copied as is.
+ @pre `m_cursor - m_start >= 2`, meaning the length of the last token
+ is at least 2 bytes which is trivially true for any string (which
+ consists of at least two quotes).
+
+ " c1 c2 c3 ... "
+ ^ ^
+ m_start m_cursor
+
+ @complexity Linear in the length of the string.\n
+
+ Lemma: The loop body will always terminate.\n
+
+ Proof (by contradiction): Assume the loop body does not terminate. As
+ the loop body does not contain another loop, one of the called
+ functions must never return. The called functions are `std::strtoul`
+ and @ref to_unicode. Neither function can loop forever, so the loop
+ body will never loop forever which contradicts the assumption that the
+ loop body does not terminate, q.e.d.\n
+
+ Lemma: The loop condition for the for loop is eventually false.\n
+
+ Proof (by contradiction): Assume the loop does not terminate. Due to
+ the above lemma, this can only be due to a tautological loop
+ condition; that is, the loop condition i < m_cursor - 1 must always be
+ true. Let x be the change of i for any loop iteration. Then
+ m_start + 1 + x < m_cursor - 1 must hold to loop indefinitely.
+ This can be rephrased to m_cursor - m_start - 2 > x. With the
+ precondition, we have x <= 0, meaning that the loop condition holds
+ indefinitely if i is always decreased. However, observe that the
+ value of i is strictly increasing with each iteration, as it is
+ incremented by 1 in the iteration expression and never
+ decremented inside the loop body. Hence, the loop condition
+ will eventually be false which contradicts the assumption that
+ the loop condition is a tautology, q.e.d.
+
@return string value of current token without opening and closing
quotes
@throw std::out_of_range if to_unicode fails
*/
string_t get_string() const
{
+ assert(m_cursor - m_start >= 2);
+
string_t result;
result.reserve(static_cast<size_t>(m_cursor - m_start - 2));
@@ -8212,6 +8290,8 @@
/*!
@brief create and return a reference to the pointed to value
+
+ @complexity Linear in the number of reference tokens.
*/
reference get_and_create(reference j) const
{
@@ -8649,6 +8729,7 @@
basic_json result;
// iterate the JSON object values
+ assert(value.m_value.object != nullptr);
for (const auto& element : *value.m_value.object)
{
if (not element.second.is_primitive())
diff --git a/test/src/unit.cpp b/test/src/unit.cpp
index a7ca739..8ca9b01 100644
--- a/test/src/unit.cpp
+++ b/test/src/unit.cpp
@@ -9716,6 +9716,39 @@
CHECK_THROWS_WITH(json::parser("\"\b\"").parse(), "parse error - unexpected '\"'");
// improve code coverage
CHECK_THROWS_AS(json::parser("\uFF01").parse(), std::invalid_argument);
+ // unescaped control characters
+ CHECK_THROWS_AS(json::parser("\"\x00\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x01\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x02\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x03\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x04\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x05\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x06\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x07\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x08\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x09\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x0a\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x0b\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x0c\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x0d\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x0e\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x0f\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x10\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x11\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x12\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x13\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x14\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x15\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x16\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x17\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x18\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x19\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x1a\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x1b\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x1c\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x1d\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x1e\"").parse(), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("\"\x1f\"").parse(), std::invalid_argument);
}
SECTION("escaped")