Cherry-pick 3 patches from upstream
* Fix the undefined behavior in decimal number parsing
http://bugs.icu-project.org/trac/changeset/40950
* Fix the handling of non-BMP characters in CJK breakiterator
http://www.icu-project.org/trac/changeset/40949
* Limit the recursion depth of UnicodeSet pattern
http://bugs.icu-project.org/trac/changeset/40979
TBR=inferno@chromium.org
Bug: chromium:799850, chromium:796807, chromium:796752
Test: See the bugs.
Change-Id: I1a8909371b601f36faca911039b10d36c7a92c85
Reviewed-on: https://chromium-review.googlesource.com/1009001
Reviewed-by: Jungshik Shin <jshin@chromium.org>
diff --git a/README.chromium b/README.chromium
index 2fc511e..339c957 100644
--- a/README.chromium
+++ b/README.chromium
@@ -296,3 +296,19 @@
upstream bug: https://ssl.icu-project.org/trac/ticket/13692
(62-to-be does not have this issue)
- bug: crbug.com/829144
+
+11. Limit the recursion depth of UnicodeSet pattern
+
+ - patches/uset_depth.patch
+ upstream bug: https://ssl.icu-project.org/trac/ticket/13547
+
+12. Fix a bug in non-BMP character handling in dictionary-based CJK
+ break iterator.
+
+ - patches/cjkdict_nonbmp.patch
+ upstream bug: https://ssl.icu-project.org/trac/ticket/13549
+
+13. Fix the undefined behavior in decimal number parsing
+
+ - patches/number_ub.patch
+ upstream bug: https://ssl.icu-project.org/trac/ticket/13550
diff --git a/patches/cjkdict_nonbmp.patch b/patches/cjkdict_nonbmp.patch
new file mode 100644
index 0000000..5f728ac
--- /dev/null
+++ b/patches/cjkdict_nonbmp.patch
@@ -0,0 +1,15 @@
+diff --git a/source/common/dictbe.cpp b/source/common/dictbe.cpp
+index 0dfaf305..d3854bb8 100644
+--- a/source/common/dictbe.cpp
++++ b/source/common/dictbe.cpp
+@@ -1324,8 +1324,8 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
+ }
+ if (katakanaRunLength < kMaxKatakanaGroupLength) {
+ uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
+- if (newSnlp < (uint32_t)bestSnlp.elementAti(j)) {
+- bestSnlp.setElementAt(newSnlp, j);
++ if (newSnlp < (uint32_t)bestSnlp.elementAti(i+katakanaRunLength)) {
++ bestSnlp.setElementAt(newSnlp, i+katakanaRunLength);
+ prev.setElementAt(i, i+katakanaRunLength); // prev[j] = i;
+ }
+ }
diff --git a/patches/number_ub.patch b/patches/number_ub.patch
new file mode 100644
index 0000000..e821888
--- /dev/null
+++ b/patches/number_ub.patch
@@ -0,0 +1,18 @@
+Index: source/i18n/decNumber.cpp
+===================================================================
+--- source/i18n/decNumber.cpp (revision 40949)
++++ source/i18n/decNumber.cpp (revision 40950)
+@@ -627,10 +627,12 @@
+
+ for (; *c=='0' && *(c+1)!='\0';) c++; /* strip insignificant zeros */
+ firstexp=c; /* save exponent digit place */
++ uInt uexponent = 0; /* Avoid undefined behavior on signed int overflow */
+ for (; ;c++) {
+ if (*c<'0' || *c>'9') break; /* not a digit */
+- exponent=X10(exponent)+(Int)*c-(Int)'0';
++ uexponent=X10(uexponent)+(uInt)*c-(uInt)'0';
+ } /* c */
++ exponent = (Int)uexponent;
+ /* if not now on a '\0', *c must not be a digit */
+ if (*c!='\0') break;
+
diff --git a/patches/uset_depth.patch b/patches/uset_depth.patch
new file mode 100644
index 0000000..fbbc1f3
--- /dev/null
+++ b/patches/uset_depth.patch
@@ -0,0 +1,134 @@
+diff --git a/source/common/unicode/uniset.h b/source/common/unicode/uniset.h
+index 4a4ce193..ed9a3eb7 100644
+--- a/source/common/unicode/uniset.h
++++ b/source/common/unicode/uniset.h
+@@ -1521,6 +1521,7 @@ private:
+ UnicodeString& rebuiltPat,
+ uint32_t options,
+ UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
++ int32_t depth,
+ UErrorCode& ec);
+
+ //----------------------------------------------------------------
+diff --git a/source/common/uniset_closure.cpp b/source/common/uniset_closure.cpp
+index 44bb4bcd..0b7da796 100644
+--- a/source/common/uniset_closure.cpp
++++ b/source/common/uniset_closure.cpp
+@@ -129,7 +129,7 @@ UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
+ // _applyPattern calls add() etc., which set pat to empty.
+ UnicodeString rebuiltPat;
+ RuleCharacterIterator chars(pattern, symbols, pos);
+- applyPattern(chars, symbols, rebuiltPat, options, &UnicodeSet::closeOver, status);
++ applyPattern(chars, symbols, rebuiltPat, options, &UnicodeSet::closeOver, 0, status);
+ if (U_FAILURE(status)) return *this;
+ if (chars.inVariable()) {
+ // syntaxError(chars, "Extra chars in variable value");
+diff --git a/source/common/uniset_props.cpp b/source/common/uniset_props.cpp
+index 1c28a2d8..ceefde05 100644
+--- a/source/common/uniset_props.cpp
++++ b/source/common/uniset_props.cpp
+@@ -257,6 +257,7 @@ const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) {
+ return i.fSet;
+ }
+
++namespace {
+
+ // Cache some sets for other services -------------------------------------- ***
+ void U_CALLCONV createUni32Set(UErrorCode &errorCode) {
+@@ -315,6 +316,8 @@ isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
+ // memory leak checker tools
+ #define _dbgct(me)
+
++} // namespace
++
+ //----------------------------------------------------------------
+ // Constructors &c
+ //----------------------------------------------------------------
+@@ -382,7 +385,7 @@ UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,
+ // _applyPattern calls add() etc., which set pat to empty.
+ UnicodeString rebuiltPat;
+ RuleCharacterIterator chars(pattern, symbols, pos);
+- applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, status);
++ applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, 0, status);
+ if (U_FAILURE(status)) return;
+ if (chars.inVariable()) {
+ // syntaxError(chars, "Extra chars in variable value");
+@@ -406,6 +409,8 @@ UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
+ // Implementation: Pattern parsing
+ //----------------------------------------------------------------
+
++namespace {
++
+ /**
+ * A small all-inline class to manage a UnicodeSet pointer. Add
+ * operator->() etc. as needed.
+@@ -424,6 +429,10 @@ public:
+ }
+ };
+
++constexpr int32_t MAX_DEPTH = 100;
++
++} // namespace
++
+ /**
+ * Parse the pattern from the given RuleCharacterIterator. The
+ * iterator is advanced over the parsed pattern.
+@@ -443,8 +452,13 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
+ UnicodeString& rebuiltPat,
+ uint32_t options,
+ UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
++ int32_t depth,
+ UErrorCode& ec) {
+ if (U_FAILURE(ec)) return;
++ if (depth > MAX_DEPTH) {
++ ec = U_ILLEGAL_ARGUMENT_ERROR;
++ return;
++ }
+
+ // Syntax characters: [ ] ^ - & { }
+
+@@ -579,7 +593,7 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
+ }
+ switch (setMode) {
+ case 1:
+- nested->applyPattern(chars, symbols, patLocal, options, caseClosure, ec);
++ nested->applyPattern(chars, symbols, patLocal, options, caseClosure, depth + 1, ec);
+ break;
+ case 2:
+ chars.skipIgnored(opts);
+@@ -837,6 +851,8 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
+ // Property set implementation
+ //----------------------------------------------------------------
+
++namespace {
++
+ static UBool numericValueFilter(UChar32 ch, void* context) {
+ return u_getNumericValue(ch) == *(double*)context;
+ }
+@@ -868,6 +884,8 @@ static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
+ return uscript_hasScript(ch, *(UScriptCode*)context);
+ }
+
++} // namespace
++
+ /**
+ * Generic filter-based scanning code for UCD property UnicodeSets.
+ */
+@@ -924,6 +942,8 @@ void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
+ }
+ }
+
++namespace {
++
+ static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
+ /* Note: we use ' ' in compiler code page */
+ int32_t j = 0;
+@@ -941,6 +961,8 @@ static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
+ return TRUE;
+ }
+
++} // namespace
++
+ //----------------------------------------------------------------
+ // Property set API
+ //----------------------------------------------------------------
diff --git a/source/common/dictbe.cpp b/source/common/dictbe.cpp
index 0dfaf30..d3854bb 100644
--- a/source/common/dictbe.cpp
+++ b/source/common/dictbe.cpp
@@ -1324,8 +1324,8 @@
}
if (katakanaRunLength < kMaxKatakanaGroupLength) {
uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
- if (newSnlp < (uint32_t)bestSnlp.elementAti(j)) {
- bestSnlp.setElementAt(newSnlp, j);
+ if (newSnlp < (uint32_t)bestSnlp.elementAti(i+katakanaRunLength)) {
+ bestSnlp.setElementAt(newSnlp, i+katakanaRunLength);
prev.setElementAt(i, i+katakanaRunLength); // prev[j] = i;
}
}
diff --git a/source/common/unicode/uniset.h b/source/common/unicode/uniset.h
index 4a4ce19..ed9a3eb 100644
--- a/source/common/unicode/uniset.h
+++ b/source/common/unicode/uniset.h
@@ -1521,6 +1521,7 @@
UnicodeString& rebuiltPat,
uint32_t options,
UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
+ int32_t depth,
UErrorCode& ec);
//----------------------------------------------------------------
diff --git a/source/common/uniset_closure.cpp b/source/common/uniset_closure.cpp
index 44bb4bc..0b7da79 100644
--- a/source/common/uniset_closure.cpp
+++ b/source/common/uniset_closure.cpp
@@ -129,7 +129,7 @@
// _applyPattern calls add() etc., which set pat to empty.
UnicodeString rebuiltPat;
RuleCharacterIterator chars(pattern, symbols, pos);
- applyPattern(chars, symbols, rebuiltPat, options, &UnicodeSet::closeOver, status);
+ applyPattern(chars, symbols, rebuiltPat, options, &UnicodeSet::closeOver, 0, status);
if (U_FAILURE(status)) return *this;
if (chars.inVariable()) {
// syntaxError(chars, "Extra chars in variable value");
diff --git a/source/common/uniset_props.cpp b/source/common/uniset_props.cpp
index 1c28a2d..ceefde0 100644
--- a/source/common/uniset_props.cpp
+++ b/source/common/uniset_props.cpp
@@ -257,6 +257,7 @@
return i.fSet;
}
+namespace {
// Cache some sets for other services -------------------------------------- ***
void U_CALLCONV createUni32Set(UErrorCode &errorCode) {
@@ -315,6 +316,8 @@
// memory leak checker tools
#define _dbgct(me)
+} // namespace
+
//----------------------------------------------------------------
// Constructors &c
//----------------------------------------------------------------
@@ -382,7 +385,7 @@
// _applyPattern calls add() etc., which set pat to empty.
UnicodeString rebuiltPat;
RuleCharacterIterator chars(pattern, symbols, pos);
- applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, status);
+ applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, 0, status);
if (U_FAILURE(status)) return;
if (chars.inVariable()) {
// syntaxError(chars, "Extra chars in variable value");
@@ -406,6 +409,8 @@
// Implementation: Pattern parsing
//----------------------------------------------------------------
+namespace {
+
/**
* A small all-inline class to manage a UnicodeSet pointer. Add
* operator->() etc. as needed.
@@ -424,6 +429,10 @@
}
};
+constexpr int32_t MAX_DEPTH = 100;
+
+} // namespace
+
/**
* Parse the pattern from the given RuleCharacterIterator. The
* iterator is advanced over the parsed pattern.
@@ -443,8 +452,13 @@
UnicodeString& rebuiltPat,
uint32_t options,
UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
+ int32_t depth,
UErrorCode& ec) {
if (U_FAILURE(ec)) return;
+ if (depth > MAX_DEPTH) {
+ ec = U_ILLEGAL_ARGUMENT_ERROR;
+ return;
+ }
// Syntax characters: [ ] ^ - & { }
@@ -579,7 +593,7 @@
}
switch (setMode) {
case 1:
- nested->applyPattern(chars, symbols, patLocal, options, caseClosure, ec);
+ nested->applyPattern(chars, symbols, patLocal, options, caseClosure, depth + 1, ec);
break;
case 2:
chars.skipIgnored(opts);
@@ -837,6 +851,8 @@
// Property set implementation
//----------------------------------------------------------------
+namespace {
+
static UBool numericValueFilter(UChar32 ch, void* context) {
return u_getNumericValue(ch) == *(double*)context;
}
@@ -868,6 +884,8 @@
return uscript_hasScript(ch, *(UScriptCode*)context);
}
+} // namespace
+
/**
* Generic filter-based scanning code for UCD property UnicodeSets.
*/
@@ -924,6 +942,8 @@
}
}
+namespace {
+
static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
/* Note: we use ' ' in compiler code page */
int32_t j = 0;
@@ -941,6 +961,8 @@
return TRUE;
}
+} // namespace
+
//----------------------------------------------------------------
// Property set API
//----------------------------------------------------------------
diff --git a/source/i18n/decNumber.cpp b/source/i18n/decNumber.cpp
index 149062e..cee2f8e 100644
--- a/source/i18n/decNumber.cpp
+++ b/source/i18n/decNumber.cpp
@@ -627,10 +627,12 @@
for (; *c=='0' && *(c+1)!='\0';) c++; /* strip insignificant zeros */
firstexp=c; /* save exponent digit place */
+ uInt uexponent = 0; /* Avoid undefined behavior on signed int overflow */
for (; ;c++) {
if (*c<'0' || *c>'9') break; /* not a digit */
- exponent=X10(exponent)+(Int)*c-(Int)'0';
+ uexponent=X10(uexponent)+(uInt)*c-(uInt)'0';
} /* c */
+ exponent = (Int)uexponent;
/* if not now on a '\0', *c must not be a digit */
if (*c!='\0') break;