Have JSON line-comment tokens exclude the "\n"
Before this commit, line-comment token chains included the final "\n"
the way block-comments included the final "*/". The "\n" was mandatory.
However, when parsing with both QUIRK_ALLOW_COMMENT_LINE and
QUIRK_ALLOW_TRAILING_FILLER, input like "123//xyz" (without a "\n"
before the EOF) was rejected before and accepted after this commit.
So that line-comments are consistent — either always or never including
the final "\n" — they now never include the final "\n".
An input fragment (parsed with QUIRK_ALLOW_COMMENT_LINE) like
"//abc\n\t\t" was, before this commit, parsed as a 6-byte comment token
and a 2-byte other-filler token. That split is now 5+3.
diff --git a/example/jsonptr/jsonptr.cc b/example/jsonptr/jsonptr.cc
index 64e55ba..3d72c58 100644
--- a/example/jsonptr/jsonptr.cc
+++ b/example/jsonptr/jsonptr.cc
@@ -1074,7 +1074,7 @@
return write_dst_slow(s, n);
}
-#define TRY_INDENT_WITH_LEADING_NEW_LINE \
+#define TRY_INDENT \
do { \
uint32_t adj = (g_num_input_blank_lines > 1) ? 1 : 0; \
g_num_input_blank_lines = 0; \
@@ -1086,20 +1086,6 @@
} \
} while (false)
-// TRY_INDENT_SANS_LEADING_NEW_LINE is used after comments, which print their
-// own "\n".
-#define TRY_INDENT_SANS_LEADING_NEW_LINE \
- do { \
- uint32_t adj = (g_num_input_blank_lines > 1) ? 1 : 0; \
- g_num_input_blank_lines = 0; \
- uint32_t indent = g_depth * g_bytes_per_indent_depth; \
- TRY(write_dst(g_two_new_lines_then_256_indent_bytes + 2 - adj, \
- adj + (indent & 0xFF))); \
- for (indent >>= 8; indent > 0; indent--) { \
- TRY(write_dst(g_two_new_lines_then_256_indent_bytes + 2, 0x100)); \
- } \
- } while (false)
-
// ----
const char* //
@@ -1155,12 +1141,12 @@
(g_ctx != context::in_dict_after_brace) &&
!g_flags.compact_output) {
if (g_is_after_comment) {
- TRY_INDENT_SANS_LEADING_NEW_LINE;
+ TRY_INDENT;
} else {
if (g_flags.output_extra_comma) {
TRY(write_dst(",", 1));
}
- TRY_INDENT_WITH_LEADING_NEW_LINE;
+ TRY_INDENT;
}
} else {
g_num_input_blank_lines = 0;
@@ -1181,7 +1167,7 @@
// continuation of a multi-token chain.
if (start_of_token_chain) {
if (g_is_after_comment) {
- TRY_INDENT_SANS_LEADING_NEW_LINE;
+ TRY_INDENT;
} else if (g_ctx == context::in_dict_after_key) {
TRY(write_dst(": ", g_flags.compact_output ? 1 : 2));
} else if (g_ctx != context::none) {
@@ -1190,7 +1176,7 @@
TRY(write_dst(",", 1));
}
if (!g_flags.compact_output) {
- TRY_INDENT_WITH_LEADING_NEW_LINE;
+ TRY_INDENT;
}
}
@@ -1336,11 +1322,16 @@
if (g_flags.compact_output) {
TRY(write_dst(g_src.data.ptr + g_cursor_index - token_length,
token_length));
+ if (!t.continued() &&
+ (t.value_base_detail() &
+ WUFFS_BASE__TOKEN__VBD__FILLER__COMMENT_LINE)) {
+ TRY(write_dst("\n", 1));
+ }
} else {
if (start_of_token_chain) {
if (g_is_after_comment) {
- TRY_INDENT_SANS_LEADING_NEW_LINE;
+ TRY_INDENT;
} else if (g_ctx != context::none) {
if (g_ctx == context::in_dict_after_key) {
TRY(write_dst(":", 1));
@@ -1349,16 +1340,11 @@
(g_ctx != context::end_of_data)) {
TRY(write_dst(",", 1));
}
- TRY_INDENT_WITH_LEADING_NEW_LINE;
+ TRY_INDENT;
}
}
TRY(write_dst(g_src.data.ptr + g_cursor_index - token_length,
token_length));
- if (!t.continued() &&
- (t.value_base_detail() &
- WUFFS_BASE__TOKEN__VBD__FILLER__COMMENT_BLOCK)) {
- TRY(write_dst("\n", 1));
- }
g_is_after_comment = true;
}
if (g_ctx == context::in_list_after_bracket) {
@@ -1366,11 +1352,7 @@
} else if (g_ctx == context::in_dict_after_brace) {
g_ctx = context::in_dict_after_value;
}
- g_num_input_blank_lines =
- (t.value_base_detail() &
- WUFFS_BASE__TOKEN__VBD__FILLER__COMMENT_LINE)
- ? 1
- : 0;
+ g_num_input_blank_lines = 0;
} else {
uint8_t* p = g_src.data.ptr + g_cursor_index - token_length;
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index bd10aba..11d5e6d 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -33096,16 +33096,19 @@
}
while (true) {
if (((uint64_t)(io2_a_src - iop_a_src)) <= 0) {
- if (v_length > 0) {
+ if (a_src && a_src->meta.closed) {
+ *iop_a_dst++ = wuffs_base__make_token(
+ (((uint64_t)(4)) << WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT) |
+ (((uint64_t)(v_length)) << WUFFS_BASE__TOKEN__LENGTH__SHIFT));
+ self->private_impl.f_comment_type = 2;
+ status = wuffs_base__make_status(NULL);
+ goto ok;
+ } else if (v_length > 0) {
*iop_a_dst++ = wuffs_base__make_token(
(((uint64_t)(4)) << WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT) |
(((uint64_t)(1)) << WUFFS_BASE__TOKEN__CONTINUED__SHIFT) |
(((uint64_t)(v_length)) << WUFFS_BASE__TOKEN__LENGTH__SHIFT));
}
- if (a_src && a_src->meta.closed) {
- status = wuffs_base__make_status(wuffs_json__error__bad_input);
- goto exit;
- }
status = wuffs_base__make_status(wuffs_base__suspension__short_read);
WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(6);
v_length = 0;
@@ -33113,10 +33116,9 @@
}
v_c = wuffs_base__peek_u8be__no_bounds_check(iop_a_src);
if (v_c == 10) {
- iop_a_src += 1;
*iop_a_dst++ = wuffs_base__make_token(
(((uint64_t)(4)) << WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT) |
- (((uint64_t)((v_length + 1))) << WUFFS_BASE__TOKEN__LENGTH__SHIFT));
+ (((uint64_t)(v_length)) << WUFFS_BASE__TOKEN__LENGTH__SHIFT));
self->private_impl.f_comment_type = 2;
status = wuffs_base__make_status(NULL);
goto ok;
diff --git a/std/json/decode_json.wuffs b/std/json/decode_json.wuffs
index bd83ae8..7575633 100644
--- a/std/json/decode_json.wuffs
+++ b/std/json/decode_json.wuffs
@@ -1530,7 +1530,16 @@
pre args.dst.length() > 0,
{
if args.src.length() <= 0 {
- if length > 0 {
+ if args.src.is_closed() {
+ args.dst.write_simple_token_fast!(
+ value_major: 0,
+ value_minor: (base.TOKEN__VBC__FILLER << 21) |
+ base.TOKEN__VBD__FILLER__COMMENT_LINE,
+ continued: 0,
+ length: length)
+ this.comment_type = 2
+ return ok
+ } else if length > 0 {
args.dst.write_simple_token_fast!(
value_major: 0,
value_minor: (base.TOKEN__VBC__FILLER << 21) |
@@ -1538,9 +1547,6 @@
continued: 1,
length: length)
}
- if args.src.is_closed() {
- return "#bad input"
- }
yield? base."$short read"
length = 0
continue.comment_line
@@ -1548,13 +1554,12 @@
c = args.src.peek_u8()
if c == '\n' {
- args.src.skip_u32_fast!(actual: 1, worst_case: 1)
args.dst.write_simple_token_fast!(
value_major: 0,
value_minor: (base.TOKEN__VBC__FILLER << 21) |
base.TOKEN__VBD__FILLER__COMMENT_LINE,
continued: 0,
- length: length + 1)
+ length: length)
this.comment_type = 2
return ok
}
diff --git a/std/json/decode_quirks.wuffs b/std/json/decode_quirks.wuffs
index f403033..95c29fc 100644
--- a/std/json/decode_quirks.wuffs
+++ b/std/json/decode_quirks.wuffs
@@ -109,15 +109,17 @@
// When this quirk is enabled, "// C/C++ style line comments\n" are accepted
// anywhere whitespace would be. See also QUIRK_ALLOW_TRAILING_FILLER.
//
-// A line comment may not omit the ending "\n", even if there is no input
-// afterwards (i.e. the prospective line comment ends with the end-of-file).
+// Line comments are terminated by a '\n' byte. '\r' bytes are irrelevant. When
+// combined with QUIRK_ALLOW_TRAILING_FILLER, a line comment at the end of the
+// file may omit the final '\n' byte.
//
// They produce WUFFS_BASE__TOKEN__VBD__FILLER__COMMENT_LINE tokens. The token
-// chain's source bytes includes the starting "//" and the ending "\n".
+// chain's source bytes includes the starting "//" but always excludes the
+// ending "\n", regardless of whether it met the end of the file instead.
//
// Even if the line comments are on consecutive lines, each line comment is a
-// separate token chain. There may be whitespace tokens between one line
-// comment's ending "\n" and the next one's starting "//".
+// separate token chain. Between them is at least one "\n" and maybe other
+// whitespace, which are not part of a line comment token chain.
//
// To avoid ambiguity (as comments can contain new lines), this quirk cannot be
// combined with QUIRK_EXPECT_TRAILING_NEW_LINE_OR_EOF.
diff --git a/test/c/std/json.c b/test/c/std/json.c
index 7e9812c..1d76f9a 100644
--- a/test/c/std/json.c
+++ b/test/c/std/json.c
@@ -3451,7 +3451,7 @@
"306 /*foo*/ \n", //
"307 /*foo*/ \n\n", //
"308/*bar\nbaz*/\n\n", //
- "309 // qux\n", // TODO: drop the "\n".
+ "309 // qux", //
"310 // qux\n", //
"311 // qux\n\n", //
"312 /*c0*/ /*c1*/\n\n", //
diff --git a/test/data/json-quirks.tokens b/test/data/json-quirks.tokens
index d7b08ab..9ffd0f8 100644
--- a/test/data/json-quirks.tokens
+++ b/test/data/json-quirks.tokens
Binary files differ