minor changes - fixed a bug that did not discard strings with control characters between 0x10 and 0x1f - added termination proofs for two important loops - made get_ref() constexpr

commit: 4e7501e59aff4c9dc27b4175bdefb49319d82ed3 [log] [tgz]
author: Niels <niels.lohmann@gmail.com> Fri Jul 22 15:34:45 2016 +0200
committer: Niels <niels.lohmann@gmail.com> Fri Jul 22 15:34:45 2016 +0200
tree: a97efe5774f10130960780b015a14f9b1e6167bf
parent: 4c98c971b838645416d7bac9de4e8a4d44e9d584 [diff]
diff --git a/doc/images/callback_events.png b/doc/images/callback_events.png
new file mode 100644
index 0000000..09aa2b3
--- /dev/null
+++ b/doc/images/callback_events.png
Binary files differ

diff --git a/src/json.hpp b/src/json.hpp
index 2915532..00bee97 100644
--- a/src/json.hpp
+++ b/src/json.hpp

@@ -717,7 +717,19 @@
     This enumeration collects the different JSON types. It is internally used
     to distinguish the stored values, and the functions @ref is_null(), @ref
     is_object(), @ref is_array(), @ref is_string(), @ref is_boolean(), @ref
-    is_number(), and @ref is_discarded() rely on it.
+    is_number() (with @ref is_number_integer(), @ref is_number_unsigned(), and
+    @ref is_number_float()), @ref is_discarded(), @ref is_primitive(), and
+    @ref is_structured() rely on it.
+
+    @note There are three enumeration entries (number_integer,
+    number_unsigned, and number_float), because the library distinguishes
+    these three types for numbers: @ref number_unsigned_t is used for unsigned
+    integers, @ref number_integer_t is used for signed integers, and @ref
+    number_float_t is used for floating-point numbers or to approximate
+    integers which do not fit in the limits of their respective type.
+
+    @sa @ref basic_json(const value_t value_type) -- create a JSON value with
+    the default value for a given type
 
     @since version 1.0.0
     */
@@ -728,7 +740,7 @@
         array,           ///< array (ordered collection of values)
         string,          ///< string value
         boolean,         ///< boolean value
-        number_integer,  ///< number value (integer)
+        number_integer,  ///< number value (signed integer)
         number_unsigned, ///< number value (unsigned integer)
         number_float,    ///< number value (floating-point)
         discarded        ///< discarded by the the parser callback function
@@ -758,7 +770,24 @@
     /*!
     @brief a JSON value
 
-    The actual storage for a JSON value of the @ref basic_json class.
+    The actual storage for a JSON value of the @ref basic_json class. This
+    union combines the different storage types for the JSON value types
+    defined in @ref value_t.
+
+    JSON type | value_t type    | used type
+    --------- | --------------- | ------------------------
+    object    | object          | pointer to @ref object_t
+    array     | array           | pointer to @ref array_t
+    string    | string          | pointer to @ref string_t
+    boolean   | boolean         | @ref boolean_t
+    number    | number_integer  | @ref number_integer_t
+    number    | number_unsigned | @ref number_unsigned_t
+    number    | number_float    | @ref number_float_t
+    null      | null            | *no value is stored*
+
+    @note Variable-length types (objects, arrays, and strings) are stored as
+    pointers. The size of the union should not exceed 64 bits if the default
+    value types are used.
 
     @since version 1.0.0
     */
@@ -874,6 +903,8 @@
     This enumeration lists the parser events that can trigger calling a
     callback function of type @ref parser_callback_t during parsing.
 
+    @image html callback_events.png "Example when certain parse events are triggered"
+
     @since version 1.0.0
     */
     enum class parse_event_t : uint8_t
@@ -916,6 +947,8 @@
     parse_event_t::array_end | the parser read `]` and finished processing a JSON array | depth of the parent of the JSON array | the parsed JSON array
     parse_event_t::value | the parser finished reading a JSON value | depth of the value | the parsed JSON value
 
+    @image html callback_events.png "Example when certain parse events are triggered"
+
     Discarding a value (i.e., returning `false`) has different effects
     depending on the context in which function was called:
 
@@ -2773,21 +2806,16 @@
     type of the current JSON
     */
     template<typename ReferenceType, typename ThisType>
-    static ReferenceType get_ref_impl(ThisType& obj)
+    static constexpr ReferenceType get_ref_impl(ThisType& obj)
     {
-        // delegate the call to get_ptr<>()
+        // helper type
         using PointerType = typename std::add_pointer<ReferenceType>::type;
-        auto ptr = obj.template get_ptr<PointerType>();
 
-        if (ptr != nullptr)
-        {
-            return *ptr;
-        }
-        else
-        {
-            throw std::domain_error("incompatible ReferenceType for get_ref, actual type is " +
-                                    obj.type_name());
-        }
+        // delegate the call to get_ptr<>()
+        return obj.template get_ptr<PointerType>() != nullptr
+               ? *obj.template get_ptr<PointerType>()
+               : throw std::domain_error("incompatible ReferenceType for get_ref, actual type is " +
+                                         obj.type_name());
     }
 
   public:
@@ -3015,7 +3043,7 @@
                  std::is_reference<ReferenceType>::value
                  and std::is_const<typename std::remove_reference<ReferenceType>::type>::value
                  , int>::type = 0>
-    ReferenceType get_ref() const
+    constexpr ReferenceType get_ref() const
     {
         // delegate call to get_ref_impl
         return get_ref_impl<ReferenceType>(*this);
@@ -7286,6 +7314,8 @@
         @throw std::invalid_argument if the low surrogate is invalid; example:
         `""missing or wrong low surrogate""`
 
+        @complexity Constant.
+
         @see <http://en.wikipedia.org/wiki/UTF-8#Sample_code>
         */
         static string_t to_unicode(const std::size_t codepoint1,
@@ -7402,6 +7432,17 @@
         function consists of a large block of code with `goto` jumps.
 
         @return the class of the next token read from the buffer
+
+        @complexity Linear in the length of the input.\n
+
+        Proposition: The loop below will always terminate for finite input.\n
+
+        Proof (by contradiction): Assume a finite input. To loop forever, the
+        loop must never hit code with a `break` statement. The only code
+        snippets without a `break` statement are the continue statements for
+        whitespace and byte-order-marks. To loop forever, the input must be an
+        infinite sequence of whitespace or byte-order-marks. This contradicts
+        the assumption of finite input, q.e.d.
         */
         token_type scan() noexcept
         {
@@ -7422,8 +7463,8 @@
                     {
                         0,   0,   0,   0,   0,   0,   0,   0,
                         0,  32,  32,   0,   0,  32,   0,   0,
-                        128, 128, 128, 128, 128, 128, 128, 128,
-                        128, 128, 128, 128, 128, 128, 128, 128,
+                        0,   0,   0,   0,   0,   0,   0,   0,
+                        0,   0,   0,   0,   0,   0,   0,   0,
                         160, 128,   0, 128, 128, 128, 128, 128,
                         128, 128, 128, 128, 128, 128, 128, 128,
                         192, 192, 192, 192, 192, 192, 192, 192,
@@ -7602,7 +7643,7 @@
 basic_json_parser_9:
                     yyaccept = 0;
                     yych = *(m_marker = ++m_cursor);
-                    if (yych <= 0x0F)
+                    if (yych <= 0x1F)
                     {
                         goto basic_json_parser_5;
                     }
@@ -7760,7 +7801,7 @@
                     {
                         goto basic_json_parser_31;
                     }
-                    if (yych <= 0x0F)
+                    if (yych <= 0x1F)
                     {
                         goto basic_json_parser_33;
                     }
@@ -8233,16 +8274,53 @@
            according to the nature of the escape. Some escapes create new
            characters (e.g., `"\\n"` is replaced by `"\n"`), some are copied
            as is (e.g., `"\\\\"`). Furthermore, Unicode escapes of the shape
-           `"\\uxxxx"` need special care. In this case, to_unicode takes care
-           of the construction of the values.
+           `"\\uxxxx"` need special care. In this case, @ref to_unicode takes
+           care of the construction of the values.
         2. Unescaped characters are copied as is.
 
+        @pre `m_cursor - m_start >= 2`, meaning the length of the last token
+        is at least 2 bytes which is trivially true for any string (which
+        consists of at least two quotes).
+
+            " c1 c2 c3 ... "
+            ^                ^
+            m_start          m_cursor
+
+        @complexity Linear in the length of the string.\n
+
+        Lemma: The loop body will always terminate.\n
+
+        Proof (by contradiction): Assume the loop body does not terminate. As
+        the loop body does not contain another loop, one of the called
+        functions must never return. The called functions are `std::strtoul`
+        and @ref to_unicode. Neither function can loop forever, so the loop
+        body will never loop forever which contradicts the assumption that the
+        loop body does not terminate, q.e.d.\n
+
+        Lemma: The loop condition for the for loop is eventually false.\n
+
+        Proof (by contradiction): Assume the loop does not terminate. Due to
+        the above lemma, this can only be due to a tautological loop
+        condition; that is, the loop condition i < m_cursor - 1 must always be
+        true. Let x be the change of i for any loop iteration. Then
+        m_start + 1 + x < m_cursor - 1 must hold to loop indefinitely.
+        This can be rephrased to m_cursor - m_start - 2 > x. With the
+        precondition, we have x <= 0, meaning that the loop condition holds
+        indefinitely if i is always decreased. However, observe that the
+        value of i is strictly increasing with each iteration, as it is
+        incremented by 1 in the iteration expression and never
+        decremented inside the loop body. Hence, the loop condition
+        will eventually be false which contradicts the assumption that
+        the loop condition is a tautology, q.e.d.
+
         @return string value of current token without opening and closing
         quotes
         @throw std::out_of_range if to_unicode fails
         */
         string_t get_string() const
         {
+            assert(m_cursor - m_start >= 2);
+
             string_t result;
             result.reserve(static_cast<size_t>(m_cursor - m_start - 2));
 
@@ -8915,6 +8993,8 @@
 
         /*!
         @brief create and return a reference to the pointed to value
+
+        @complexity Linear in the number of reference tokens.
         */
         reference get_and_create(reference j) const
         {
@@ -9352,6 +9432,7 @@
             basic_json result;
 
             // iterate the JSON object values
+            assert(value.m_value.object != nullptr);
             for (const auto& element : *value.m_value.object)
             {
                 if (not element.second.is_primitive())

diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c
index 8eeedef..05c49ec 100644
--- a/src/json.hpp.re2c
+++ b/src/json.hpp.re2c

@@ -717,7 +717,19 @@
     This enumeration collects the different JSON types. It is internally used
     to distinguish the stored values, and the functions @ref is_null(), @ref
     is_object(), @ref is_array(), @ref is_string(), @ref is_boolean(), @ref
-    is_number(), and @ref is_discarded() rely on it.
+    is_number() (with @ref is_number_integer(), @ref is_number_unsigned(), and
+    @ref is_number_float()), @ref is_discarded(), @ref is_primitive(), and
+    @ref is_structured() rely on it.
+
+    @note There are three enumeration entries (number_integer,
+    number_unsigned, and number_float), because the library distinguishes
+    these three types for numbers: @ref number_unsigned_t is used for unsigned
+    integers, @ref number_integer_t is used for signed integers, and @ref
+    number_float_t is used for floating-point numbers or to approximate
+    integers which do not fit in the limits of their respective type.
+
+    @sa @ref basic_json(const value_t value_type) -- create a JSON value with
+    the default value for a given type
 
     @since version 1.0.0
     */
@@ -728,7 +740,7 @@
         array,           ///< array (ordered collection of values)
         string,          ///< string value
         boolean,         ///< boolean value
-        number_integer,  ///< number value (integer)
+        number_integer,  ///< number value (signed integer)
         number_unsigned, ///< number value (unsigned integer)
         number_float,    ///< number value (floating-point)
         discarded        ///< discarded by the the parser callback function
@@ -758,7 +770,24 @@
     /*!
     @brief a JSON value
 
-    The actual storage for a JSON value of the @ref basic_json class.
+    The actual storage for a JSON value of the @ref basic_json class. This
+    union combines the different storage types for the JSON value types
+    defined in @ref value_t.
+
+    JSON type | value_t type    | used type
+    --------- | --------------- | ------------------------
+    object    | object          | pointer to @ref object_t
+    array     | array           | pointer to @ref array_t
+    string    | string          | pointer to @ref string_t
+    boolean   | boolean         | @ref boolean_t
+    number    | number_integer  | @ref number_integer_t
+    number    | number_unsigned | @ref number_unsigned_t
+    number    | number_float    | @ref number_float_t
+    null      | null            | *no value is stored*
+
+    @note Variable-length types (objects, arrays, and strings) are stored as
+    pointers. The size of the union should not exceed 64 bits if the default
+    value types are used.
 
     @since version 1.0.0
     */
@@ -874,6 +903,8 @@
     This enumeration lists the parser events that can trigger calling a
     callback function of type @ref parser_callback_t during parsing.
 
+    @image html callback_events.png "Example when certain parse events are triggered"
+
     @since version 1.0.0
     */
     enum class parse_event_t : uint8_t
@@ -916,6 +947,8 @@
     parse_event_t::array_end | the parser read `]` and finished processing a JSON array | depth of the parent of the JSON array | the parsed JSON array
     parse_event_t::value | the parser finished reading a JSON value | depth of the value | the parsed JSON value
 
+    @image html callback_events.png "Example when certain parse events are triggered"
+
     Discarding a value (i.e., returning `false`) has different effects
     depending on the context in which function was called:
 
@@ -2773,21 +2806,16 @@
     type of the current JSON
     */
     template<typename ReferenceType, typename ThisType>
-    static ReferenceType get_ref_impl(ThisType& obj)
+    static constexpr ReferenceType get_ref_impl(ThisType& obj)
     {
-        // delegate the call to get_ptr<>()
+        // helper type
         using PointerType = typename std::add_pointer<ReferenceType>::type;
-        auto ptr = obj.template get_ptr<PointerType>();
 
-        if (ptr != nullptr)
-        {
-            return *ptr;
-        }
-        else
-        {
-            throw std::domain_error("incompatible ReferenceType for get_ref, actual type is " +
-                                    obj.type_name());
-        }
+        // delegate the call to get_ptr<>()
+        return obj.template get_ptr<PointerType>() != nullptr
+               ? *obj.template get_ptr<PointerType>()
+               : throw std::domain_error("incompatible ReferenceType for get_ref, actual type is " +
+                                         obj.type_name());
     }
 
   public:
@@ -3015,7 +3043,7 @@
                  std::is_reference<ReferenceType>::value
                  and std::is_const<typename std::remove_reference<ReferenceType>::type>::value
                  , int>::type = 0>
-    ReferenceType get_ref() const
+    constexpr ReferenceType get_ref() const
     {
         // delegate call to get_ref_impl
         return get_ref_impl<ReferenceType>(*this);
@@ -7286,6 +7314,8 @@
         @throw std::invalid_argument if the low surrogate is invalid; example:
         `""missing or wrong low surrogate""`
 
+        @complexity Constant.
+
         @see <http://en.wikipedia.org/wiki/UTF-8#Sample_code>
         */
         static string_t to_unicode(const std::size_t codepoint1,
@@ -7402,6 +7432,17 @@
         function consists of a large block of code with `goto` jumps.
 
         @return the class of the next token read from the buffer
+
+        @complexity Linear in the length of the input.\n
+
+        Proposition: The loop below will always terminate for finite input.\n
+
+        Proof (by contradiction): Assume a finite input. To loop forever, the
+        loop must never hit code with a `break` statement. The only code
+        snippets without a `break` statement are the continue statements for
+        whitespace and byte-order-marks. To loop forever, the input must be an
+        infinite sequence of whitespace or byte-order-marks. This contradicts
+        the assumption of finite input, q.e.d.
         */
         token_type scan() noexcept
         {
@@ -7447,32 +7488,32 @@
                     "false" { last_token_type = token_type::literal_false; break; }
 
                     // number
-                    decimal_point = [.];
+                    decimal_point = ".";
                     digit         = [0-9];
                     digit_1_9     = [1-9];
-                    e             = [eE];
-                    minus         = [-];
-                    plus          = [+];
-                    zero          = [0];
-                    exp           = e (minus|plus)? digit+;
+                    e             = "e" | "E";
+                    minus         = "-";
+                    plus          = "+";
+                    zero          = "0";
+                    exp           = e (minus | plus)? digit+;
                     frac          = decimal_point digit+;
-                    int           = (zero|digit_1_9 digit*);
+                    int           = (zero | digit_1_9 digit*);
                     number        = minus? int frac? exp?;
                     number        { last_token_type = token_type::value_number; break; }
 
                     // string
-                    quotation_mark  = ["];
-                    escape          = [\\];
-                    unescaped       = [^"\\\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F];
-                    single_escaped  = ["\\/bfnrt];
-                    unicode_escaped = [u][0-9a-fA-F]{4};
+                    quotation_mark  = "\"";
+                    escape          = "\\";
+                    unescaped       = [^"\\\x00-\x1f];
+                    single_escaped  = "\"" | "\\" | "/" | "b" | "f" | "n" | "r" | "t";
+                    unicode_escaped = "u" [0-9a-fA-F]{4};
                     escaped         = escape (single_escaped | unicode_escaped);
                     char            = unescaped | escaped;
                     string          = quotation_mark char* quotation_mark;
                     string          { last_token_type = token_type::value_string; break; }
 
                     // end of file
-                    '\000'         { last_token_type = token_type::end_of_input; break; }
+                    "\000"         { last_token_type = token_type::end_of_input; break; }
 
                     // anything else is an error
                     .              { last_token_type = token_type::parse_error; break; }
@@ -7530,16 +7571,53 @@
            according to the nature of the escape. Some escapes create new
            characters (e.g., `"\\n"` is replaced by `"\n"`), some are copied
            as is (e.g., `"\\\\"`). Furthermore, Unicode escapes of the shape
-           `"\\uxxxx"` need special care. In this case, to_unicode takes care
-           of the construction of the values.
+           `"\\uxxxx"` need special care. In this case, @ref to_unicode takes
+           care of the construction of the values.
         2. Unescaped characters are copied as is.
 
+        @pre `m_cursor - m_start >= 2`, meaning the length of the last token
+        is at least 2 bytes which is trivially true for any string (which
+        consists of at least two quotes).
+
+            " c1 c2 c3 ... "
+            ^                ^
+            m_start          m_cursor
+
+        @complexity Linear in the length of the string.\n
+
+        Lemma: The loop body will always terminate.\n
+
+        Proof (by contradiction): Assume the loop body does not terminate. As
+        the loop body does not contain another loop, one of the called
+        functions must never return. The called functions are `std::strtoul`
+        and @ref to_unicode. Neither function can loop forever, so the loop
+        body will never loop forever which contradicts the assumption that the
+        loop body does not terminate, q.e.d.\n
+
+        Lemma: The loop condition for the for loop is eventually false.\n
+
+        Proof (by contradiction): Assume the loop does not terminate. Due to
+        the above lemma, this can only be due to a tautological loop
+        condition; that is, the loop condition i < m_cursor - 1 must always be
+        true. Let x be the change of i for any loop iteration. Then
+        m_start + 1 + x < m_cursor - 1 must hold to loop indefinitely.
+        This can be rephrased to m_cursor - m_start - 2 > x. With the
+        precondition, we have x <= 0, meaning that the loop condition holds
+        indefinitely if i is always decreased. However, observe that the
+        value of i is strictly increasing with each iteration, as it is
+        incremented by 1 in the iteration expression and never
+        decremented inside the loop body. Hence, the loop condition
+        will eventually be false which contradicts the assumption that
+        the loop condition is a tautology, q.e.d.
+
         @return string value of current token without opening and closing
         quotes
         @throw std::out_of_range if to_unicode fails
         */
         string_t get_string() const
         {
+            assert(m_cursor - m_start >= 2);
+
             string_t result;
             result.reserve(static_cast<size_t>(m_cursor - m_start - 2));
 
@@ -8212,6 +8290,8 @@
 
         /*!
         @brief create and return a reference to the pointed to value
+
+        @complexity Linear in the number of reference tokens.
         */
         reference get_and_create(reference j) const
         {
@@ -8649,6 +8729,7 @@
             basic_json result;
 
             // iterate the JSON object values
+            assert(value.m_value.object != nullptr);
             for (const auto& element : *value.m_value.object)
             {
                 if (not element.second.is_primitive())

diff --git a/test/src/unit.cpp b/test/src/unit.cpp
index a7ca739..8ca9b01 100644
--- a/test/src/unit.cpp
+++ b/test/src/unit.cpp

@@ -9716,6 +9716,39 @@
                 CHECK_THROWS_WITH(json::parser("\"\b\"").parse(), "parse error - unexpected '\"'");
                 // improve code coverage
                 CHECK_THROWS_AS(json::parser("\uFF01").parse(), std::invalid_argument);
+                // unescaped control characters
+                CHECK_THROWS_AS(json::parser("\"\x00\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x01\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x02\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x03\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x04\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x05\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x06\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x07\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x08\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x09\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x0a\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x0b\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x0c\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x0d\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x0e\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x0f\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x10\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x11\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x12\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x13\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x14\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x15\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x16\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x17\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x18\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x19\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x1a\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x1b\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x1c\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x1d\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x1e\"").parse(), std::invalid_argument);
+                CHECK_THROWS_AS(json::parser("\"\x1f\"").parse(), std::invalid_argument);
             }
 
             SECTION("escaped")
commit	4e7501e59aff4c9dc27b4175bdefb49319d82ed3	[log] [tgz]
author	Niels <niels.lohmann@gmail.com>	Fri Jul 22 15:34:45 2016 +0200
committer	Niels <niels.lohmann@gmail.com>	Fri Jul 22 15:34:45 2016 +0200
tree	a97efe5774f10130960780b015a14f9b1e6167bf
parent	4c98c971b838645416d7bac9de4e8a4d44e9d584 [diff]