Merge branch 'improve-partial-utf8-handling'
diff --git a/expat/lib/internal.h b/expat/lib/internal.h
index 8eb7190..94cb98e 100644
--- a/expat/lib/internal.h
+++ b/expat/lib/internal.h
@@ -79,3 +79,17 @@
# define UNUSED_P(p) UNUSED_ ## p
# endif
#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+void
+align_limit_to_full_utf8_characters(const char * from, const char ** fromLimRef);
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/expat/lib/xmltok.c b/expat/lib/xmltok.c
index 0e68844..4195279 100644
--- a/expat/lib/xmltok.c
+++ b/expat/lib/xmltok.c
@@ -329,6 +329,41 @@
UTF8_cval4 = 0xf0
};
+void
+align_limit_to_full_utf8_characters(const char * from, const char ** fromLimRef)
+{
+ const char * fromLim = *fromLimRef;
+ size_t walked = 0;
+ for (; fromLim > from; fromLim--, walked++) {
+ const unsigned char prev = (unsigned char)fromLim[-1];
+ if ((prev & 0xf8u) == 0xf0u) { /* 4-byte character, lead by 0b11110xxx byte */
+ if (walked + 1 >= 4) {
+ fromLim += 4 - 1;
+ break;
+ } else {
+ walked = 0;
+ }
+ } else if ((prev & 0xf0u) == 0xe0u) { /* 3-byte character, lead by 0b1110xxxx byte */
+ if (walked + 1 >= 3) {
+ fromLim += 3 - 1;
+ break;
+ } else {
+ walked = 0;
+ }
+ } else if ((prev & 0xe0u) == 0xc0u) { /* 2-byte character, lead by 0b110xxxxx byte */
+ if (walked + 1 >= 2) {
+ fromLim += 2 - 1;
+ break;
+ } else {
+ walked = 0;
+ }
+ } else if ((prev & 0x80u) == 0x00u) { /* 1-byte character, matching 0b0xxxxxxx */
+ break;
+ }
+ }
+ *fromLimRef = fromLim;
+}
+
static enum XML_Convert_Result PTRCALL
utf8_toUtf8(const ENCODING *UNUSED_P(enc),
const char **fromP, const char *fromLim,
@@ -340,9 +375,8 @@
if (fromLim - *fromP > toLim - *toP) {
/* Avoid copying partial characters. */
res = XML_CONVERT_OUTPUT_EXHAUSTED;
- for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
- if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
- break;
+ fromLim = *fromP + (toLim - *toP);
+ align_limit_to_full_utf8_characters(*fromP, &fromLim);
}
for (to = *toP, from = *fromP; (from < fromLim) && (to < toLim); from++, to++)
*to = *from;
diff --git a/expat/tests/runtests.c b/expat/tests/runtests.c
index 45adfa5..c0cdea9 100644
--- a/expat/tests/runtests.c
+++ b/expat/tests/runtests.c
@@ -13,6 +13,10 @@
#include <stdio.h>
#include <string.h>
#include <stdint.h>
+#include <stddef.h> /* ptrdiff_t */
+#ifndef __cplusplus
+# include <stdbool.h>
+#endif
#include "expat.h"
#include "chardata.h"
@@ -367,6 +371,68 @@
}
END_TEST
+
+/* Examples, not masks: */
+#define UTF8_LEAD_1 "\x7f" /* 0b01111111 */
+#define UTF8_LEAD_2 "\xdf" /* 0b11011111 */
+#define UTF8_LEAD_3 "\xef" /* 0b11101111 */
+#define UTF8_LEAD_4 "\xf7" /* 0b11110111 */
+#define UTF8_FOLLOW "\xbf" /* 0b10111111 */
+
+START_TEST(test_utf8_auto_align)
+{
+ struct TestCase {
+ ptrdiff_t expectedMovementInChars;
+ const char * input;
+ };
+
+ struct TestCase cases[] = {
+ {00, ""},
+
+ {00, UTF8_LEAD_1},
+
+ {-1, UTF8_LEAD_2},
+ {00, UTF8_LEAD_2 UTF8_FOLLOW},
+
+ {-1, UTF8_LEAD_3},
+ {-2, UTF8_LEAD_3 UTF8_FOLLOW},
+ {00, UTF8_LEAD_3 UTF8_FOLLOW UTF8_FOLLOW},
+
+ {-1, UTF8_LEAD_4},
+ {-2, UTF8_LEAD_4 UTF8_FOLLOW},
+ {-3, UTF8_LEAD_4 UTF8_FOLLOW UTF8_FOLLOW},
+ {00, UTF8_LEAD_4 UTF8_FOLLOW UTF8_FOLLOW UTF8_FOLLOW},
+ };
+
+ size_t i = 0;
+ bool success = true;
+ for (; i < sizeof(cases) / sizeof(*cases); i++) {
+ const char * fromLim = cases[i].input + strlen(cases[i].input);
+ const char * const fromLimInitially = fromLim;
+ ptrdiff_t actualMovementInChars;
+
+ align_limit_to_full_utf8_characters(cases[i].input, &fromLim);
+
+ actualMovementInChars = (fromLim - fromLimInitially);
+ if (actualMovementInChars != cases[i].expectedMovementInChars) {
+ size_t j = 0;
+ success = false;
+ printf("[-] UTF-8 case %2lu: Expected movement by %2ld chars"
+ ", actually moved by %2ld chars: \"",
+ i + 1, cases[i].expectedMovementInChars, actualMovementInChars);
+ for (; j < strlen(cases[i].input); j++) {
+ printf("\\x%02x", (unsigned char)cases[i].input[j]);
+ }
+ printf("\"\n");
+ }
+ }
+
+ if (! success) {
+ fail("UTF-8 auto-alignment is not bullet-proof\n");
+ }
+}
+END_TEST
+
START_TEST(test_utf16)
{
/* <?xml version="1.0" encoding="UTF-16"?>
@@ -1543,6 +1609,7 @@
tcase_add_test(tc_basic, test_bom_utf16_be);
tcase_add_test(tc_basic, test_bom_utf16_le);
tcase_add_test(tc_basic, test_illegal_utf8);
+ tcase_add_test(tc_basic, test_utf8_auto_align);
tcase_add_test(tc_basic, test_utf16);
tcase_add_test(tc_basic, test_utf16_le_epilog_newline);
tcase_add_test(tc_basic, test_latin1_umlauts);