Make ISO-2022-JP converter compliant to the WHATWG encoding spec
The encoding of 373 code point into ISO-2022-JP failed because they're
mapped to SJIS code points for which there's no JIS 208 mapping.
Using JIS X 208 table in EUC-JP solves this issue.
Upstream bug: https://unicode-org.atlassian.net/browse/ICU-20251
Bug: 901255
Test: https://www.w3.org/International/tests/repo/run?base=encoding&batch=encoding-dbl-byte&test=legacy-mb-japanese/iso-2022-jp/iso2022jp-encode-form.html
Change-Id: I3237fa1269a3ae2e89b22e246aaf2ae43fd56c7d
Reviewed-on: https://chromium-review.googlesource.com/c/1314052
Reviewed-by: Joshua Bell <jsbell@chromium.org>
diff --git a/README.chromium b/README.chromium
index 143d973..71b7f0e 100644
--- a/README.chromium
+++ b/README.chromium
@@ -251,5 +251,9 @@
https://unicode-org.atlassian.net/browse/ICU-20246
- Fix:
https://github.com/unicode-org/icu/pull/253
-
+9. ISO-2022-JP encoding (fromUnicode) change per WHATWG encoding spec.
+
+ - patches/iso2022jp.patch
+ - upstream bug:
+ https://unicode-org.atlassian.net/browse/ICU-20251
diff --git a/patches/iso2022jp.patch b/patches/iso2022jp.patch
new file mode 100644
index 0000000..b36bcf4
--- /dev/null
+++ b/patches/iso2022jp.patch
@@ -0,0 +1,134 @@
+diff --git a/source/common/ucnv2022.cpp b/source/common/ucnv2022.cpp
+index 2ef5db97..6ac8f62f 100644
+--- a/source/common/ucnv2022.cpp
++++ b/source/common/ucnv2022.cpp
+@@ -513,7 +513,7 @@ _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
+ ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
+ }
+ myConverterData->myConverterArray[JISX208] =
+- ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
++ ucnv_loadSharedData("EUC-JP", &stackPieces, &stackArgs, errorCode);
+ if(jpCharsetMasks[version]&CSM(JISX212)) {
+ myConverterData->myConverterArray[JISX212] =
+ ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
+@@ -1514,79 +1514,6 @@ jisx201FromU(uint32_t value) {
+ return 0xfffe;
+ }
+
+-/*
+- * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
+- * to JIS X 0208, and convert it to a pair of 21..7E bytes.
+- * Return 0 if the byte pair is out of range.
+- */
+-static inline uint32_t
+-_2022FromSJIS(uint32_t value) {
+- uint8_t trail;
+-
+- if(value > 0xEFFC) {
+- return 0; /* beyond JIS X 0208 */
+- }
+-
+- trail = (uint8_t)value;
+-
+- value &= 0xff00; /* lead byte */
+- if(value <= 0x9f00) {
+- value -= 0x7000;
+- } else /* 0xe000 <= value <= 0xef00 */ {
+- value -= 0xb000;
+- }
+- value <<= 1;
+-
+- if(trail <= 0x9e) {
+- value -= 0x100;
+- if(trail <= 0x7e) {
+- value |= trail - 0x1f;
+- } else {
+- value |= trail - 0x20;
+- }
+- } else /* trail <= 0xfc */ {
+- value |= trail - 0x7e;
+- }
+- return value;
+-}
+-
+-/*
+- * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
+- * If either byte is outside 21..7E make sure that the result is not valid
+- * for Shift-JIS so that the converter catches it.
+- * Some invalid byte values already turn into equally invalid Shift-JIS
+- * byte values and need not be tested explicitly.
+- */
+-static inline void
+-_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
+- if(c1&1) {
+- ++c1;
+- if(c2 <= 0x5f) {
+- c2 += 0x1f;
+- } else if(c2 <= 0x7e) {
+- c2 += 0x20;
+- } else {
+- c2 = 0; /* invalid */
+- }
+- } else {
+- if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
+- c2 += 0x7e;
+- } else {
+- c2 = 0; /* invalid */
+- }
+- }
+- c1 >>= 1;
+- if(c1 <= 0x2f) {
+- c1 += 0x70;
+- } else if(c1 <= 0x3f) {
+- c1 += 0xb0;
+- } else {
+- c1 = 0; /* invalid */
+- }
+- bytes[0] = (char)c1;
+- bytes[1] = (char)c2;
+-}
+-
+ /*
+ * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
+ * Katakana.
+@@ -1857,8 +1784,13 @@ getTrail:
+ converterData->myConverterArray[cs0],
+ sourceChar, &value,
+ useFallback, MBCS_OUTPUT_2);
+- if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
+- value = _2022FromSJIS(value);
++ // Only accept DBCS char (abs(len2) == 2).
++ // With EUC-JP table for JIS X 208, half-width Kana
++ // represented with DBCS starting with 0x8E has to be
++ // filtered out so that they can be converted with
++ // hwkana_fb table.
++ if((len2 == 2 && ((value & 0xFF00) != 0x8E00)) || (len2 == -2 && len == 0)) {
++ value &= 0x7F7F;
+ if(value != 0) {
+ targetValue = value;
+ len = len2;
+@@ -2250,18 +2182,13 @@ getTrailByte:
+ if (leadIsOk && trailIsOk) {
+ ++mySource;
+ tmpSourceChar = (mySourceChar << 8) | trailByte;
+- if(cs == JISX208) {
+- _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
+- mySourceChar = tmpSourceChar;
+- } else {
+- /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
+- mySourceChar = tmpSourceChar;
+- if (cs == KSC5601) {
+- tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
+- }
+- tempBuf[0] = (char)(tmpSourceChar >> 8);
+- tempBuf[1] = (char)(tmpSourceChar);
++ /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
++ mySourceChar = tmpSourceChar;
++ if (cs == JISX208 || cs == KSC5601) {
++ tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
+ }
++ tempBuf[0] = (char)(tmpSourceChar >> 8);
++ tempBuf[1] = (char)(tmpSourceChar);
+ targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
+ } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
+ /* report a pair of illegal bytes if the second byte is not a DBCS starter */
diff --git a/source/common/ucnv2022.cpp b/source/common/ucnv2022.cpp
index d1a50d2..6a5dbdb 100644
--- a/source/common/ucnv2022.cpp
+++ b/source/common/ucnv2022.cpp
@@ -513,7 +513,7 @@
ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
}
myConverterData->myConverterArray[JISX208] =
- ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
+ ucnv_loadSharedData("EUC-JP", &stackPieces, &stackArgs, errorCode);
if(jpCharsetMasks[version]&CSM(JISX212)) {
myConverterData->myConverterArray[JISX212] =
ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
@@ -1515,79 +1515,6 @@
}
/*
- * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
- * to JIS X 0208, and convert it to a pair of 21..7E bytes.
- * Return 0 if the byte pair is out of range.
- */
-static inline uint32_t
-_2022FromSJIS(uint32_t value) {
- uint8_t trail;
-
- if(value > 0xEFFC) {
- return 0; /* beyond JIS X 0208 */
- }
-
- trail = (uint8_t)value;
-
- value &= 0xff00; /* lead byte */
- if(value <= 0x9f00) {
- value -= 0x7000;
- } else /* 0xe000 <= value <= 0xef00 */ {
- value -= 0xb000;
- }
- value <<= 1;
-
- if(trail <= 0x9e) {
- value -= 0x100;
- if(trail <= 0x7e) {
- value |= trail - 0x1f;
- } else {
- value |= trail - 0x20;
- }
- } else /* trail <= 0xfc */ {
- value |= trail - 0x7e;
- }
- return value;
-}
-
-/*
- * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
- * If either byte is outside 21..7E make sure that the result is not valid
- * for Shift-JIS so that the converter catches it.
- * Some invalid byte values already turn into equally invalid Shift-JIS
- * byte values and need not be tested explicitly.
- */
-static inline void
-_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
- if(c1&1) {
- ++c1;
- if(c2 <= 0x5f) {
- c2 += 0x1f;
- } else if(c2 <= 0x7e) {
- c2 += 0x20;
- } else {
- c2 = 0; /* invalid */
- }
- } else {
- if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
- c2 += 0x7e;
- } else {
- c2 = 0; /* invalid */
- }
- }
- c1 >>= 1;
- if(c1 <= 0x2f) {
- c1 += 0x70;
- } else if(c1 <= 0x3f) {
- c1 += 0xb0;
- } else {
- c1 = 0; /* invalid */
- }
- bytes[0] = (char)c1;
- bytes[1] = (char)c2;
-}
-
-/*
* JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
* Katakana.
* Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
@@ -1857,8 +1784,13 @@
converterData->myConverterArray[cs0],
sourceChar, &value,
useFallback, MBCS_OUTPUT_2);
- if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
- value = _2022FromSJIS(value);
+ // Only accept DBCS char (abs(len2) == 2).
+ // With EUC-JP table for JIS X 208, half-width Kana
+ // represented with DBCS starting with 0x8E has to be
+ // filtered out so that they can be converted with
+ // hwkana_fb table.
+ if((len2 == 2 && ((value & 0xFF00) != 0x8E00)) || (len2 == -2 && len == 0)) {
+ value &= 0x7F7F;
if(value != 0) {
targetValue = value;
len = len2;
@@ -2250,18 +2182,13 @@
if (leadIsOk && trailIsOk) {
++mySource;
tmpSourceChar = (mySourceChar << 8) | trailByte;
- if(cs == JISX208) {
- _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
- mySourceChar = tmpSourceChar;
- } else {
- /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
- mySourceChar = tmpSourceChar;
- if (cs == KSC5601) {
- tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
- }
- tempBuf[0] = (char)(tmpSourceChar >> 8);
- tempBuf[1] = (char)(tmpSourceChar);
+ /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
+ mySourceChar = tmpSourceChar;
+ if (cs == JISX208 || cs == KSC5601) {
+ tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
}
+ tempBuf[0] = (char)(tmpSourceChar >> 8);
+ tempBuf[1] = (char)(tmpSourceChar);
targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
} else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
/* report a pair of illegal bytes if the second byte is not a DBCS starter */