patches/uconv.patch - third_party/icu - Git at Google

 diff --git a/source/common/ucnv2022.cpp b/source/common/ucnv2022.cpp
 index 9556dd2..7b8df9b 100644
 --- a/source/common/ucnv2022.cpp
 +++ b/source/common/ucnv2022.cpp
 @@ -152,7 +152,11 @@ typedef enum  {
  } StateEnum;

  /* is the StateEnum charset value for a DBCS charset? */
 +#if UCONFIG_NO_NON_HTML5_CONVERSION
 +#define IS_JP_DBCS(cs) (JISX208==(cs))
 +#else
  #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
 +#endif

  #define CSM(cs) ((uint16_t)1<<(cs))

 @@ -165,13 +169,23 @@ typedef enum  {
   *   all versions, not just JIS7 and JIS8.
   * - ICU does not distinguish between different versions of JIS X 0208.
   */
 +#if UCONFIG_NO_NON_HTML5_CONVERSION
 +enum { MAX_JA_VERSION=0 };
 +#else
  enum { MAX_JA_VERSION=4 };
 +#endif
  static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
 +/*
 + * TODO(jshin): The encoding spec has JISX212, but we don't support it.
 + * See https://www.w3.org/Bugs/Public/show_bug.cgi?id=26885
 + */
      CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
      CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
      CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
      CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
      CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
 +#endif
  };

  typedef enum {
 @@ -358,15 +372,18 @@ static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
      ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
  };

 -
  /* Type def for refactoring changeState_2022 code*/
  typedef enum{
  #ifdef U_ENABLE_GENERIC_ISO_2022
      ISO_2022=0,
  #endif
 +#if UCONFIG_NO_NON_HTML5_CONVERSION
 +    ISO_2022_JP=1
 +#else
      ISO_2022_JP=1,
      ISO_2022_KR=2,
      ISO_2022_CN=3
 +#endif
  } Variant2022;

  /*********** ISO 2022 Converter Protos ***********/
 @@ -483,12 +500,15 @@ _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
                  /* prevent indexing beyond jpCharsetMasks[] */
                  myConverterData->version = version = 0;
              }
 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
              if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
                  myConverterData->myConverterArray[ISO8859_7] =
                      ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
              }
 +#endif
              myConverterData->myConverterArray[JISX208] =
                  ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
              if(jpCharsetMasks[version]&CSM(JISX212)) {
                  myConverterData->myConverterArray[JISX212] =
                      ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
 @@ -501,6 +521,7 @@ _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
                  myConverterData->myConverterArray[KSC5601] =
                      ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
              }
 +#endif

              /* set the function pointers to appropriate funtions */
              cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
 @@ -511,6 +532,7 @@ _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
              myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
              myConverterData->name[len+1]='\0';
          }
 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
          else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
              (myLocale[2]=='_' || myLocale[2]=='\0'))
          {
 @@ -580,6 +602,7 @@ _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
                  (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
              }
          }
 +#endif // !UCONFIG_NO_NON_HTML5_CONVERSION
          else{
  #ifdef U_ENABLE_GENERIC_ISO_2022
              myConverterData->isFirstBuffer = TRUE;
 @@ -714,6 +737,7 @@ static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
  };

 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
  /*************** to unicode *******************/
  static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
  /*      0                1               2               3               4               5               6               7               8               9    */
 @@ -726,6 +750,7 @@ static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
  };
 +#endif


  static UCNV_TableStates_2022
 @@ -878,6 +903,7 @@ DONE:
                      }
                      break;
                  /* case SS3_STATE: not used in ISO-2022-JP-x */
 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
                  case ISO8859_1:
                  case ISO8859_7:
                      if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
 @@ -887,6 +913,7 @@ DONE:
                          myData2022->toU2022State.cs[2]=(int8_t)tempState;
                      }
                      break;
 +#endif
                  default:
                      if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
                          *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 @@ -898,6 +925,7 @@ DONE:
                  }
              }
              break;
 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
          case ISO_2022_CN:
              {
                  StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
 @@ -959,6 +987,7 @@ DONE:
                  *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
              }
              break;
 +#endif /* #if !UCONFIG_NO_NON_HTML5_CONVERSION */

          default:
              *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 @@ -1379,12 +1408,16 @@ toUnicodeCallback(UConverter *cnv,
  static const StateEnum jpCharsetPref[]={
      ASCII,
      JISX201,
 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
      ISO8859_1,
      ISO8859_7,
 +#endif
      JISX208,
 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
      JISX212,
      GB2312,
      KSC5601,
 +#endif
      HWKANA_7BIT
  };

 @@ -1754,6 +1787,7 @@ getTrail:
                          g = 0;
                      }
                      break;
 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
                  case ISO8859_1:
                      if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
                          targetValue = (uint32_t)sourceChar - 0x80;
 @@ -1762,6 +1796,7 @@ getTrail:
                          g = 2;
                      }
                      break;
 +#endif
                  case HWKANA_7BIT:
                      if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
                          if(converterData->version==3) {
 @@ -1823,6 +1858,7 @@ getTrail:
                          useFallback = FALSE;
                      }
                      break;
 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
                  case ISO8859_7:
                      /* G0 SBCS forced to 7-bit output */
                      len2 = MBCS_SINGLE_FROM_UCHAR32(
 @@ -1837,6 +1873,7 @@ getTrail:
                          useFallback = FALSE;
                      }
                      break;
 +#endif
                  default:
                      /* G0 DBCS */
                      len2 = MBCS_FROM_UCHAR32_ISO2022(
 @@ -1844,6 +1881,7 @@ getTrail:
                                  sourceChar, &value,
                                  useFallback, MBCS_OUTPUT_2);
                      if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
                          if(cs0 == KSC5601) {
                              /*
                               * Check for valid bytes for the encoding scheme.
 @@ -1855,6 +1893,7 @@ getTrail:
                                  break;
                              }
                          }
 +#endif
                          targetValue = value;
                          len = len2;
                          cs = cs0;
 @@ -2148,6 +2187,7 @@ escape:
                          targetUniChar = mySourceChar;
                      }
                      break;
 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
                  case ISO8859_1:
                      if(mySourceChar <= 0x7f) {
                          targetUniChar = mySourceChar + 0x80;
 @@ -2166,6 +2206,7 @@ escape:
                      /* return from a single-shift state to the previous one */
                      pToU2022State->g=pToU2022State->prevG;
                      break;
 +#endif
                  case JISX201:
                      if(mySourceChar <= 0x7f) {
                          targetUniChar = jisx201ToU(mySourceChar);
 @@ -2205,9 +2246,11 @@ getTrailByte:
                              } else {
                                  /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
                                  mySourceChar = tmpSourceChar;
 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
                                  if (cs == KSC5601) {
                                      tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
                                  }
 +#endif
                                  tempBuf[0] = (char)(tmpSourceChar >> 8);
                                  tempBuf[1] = (char)(tmpSourceChar);
                              }
 @@ -2269,6 +2312,7 @@ endloop:
  }


 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
  /***************************************************************
  *   Rules for ISO-2022-KR encoding
  *   i) The KSC5601 designator sequence should appear only once in a file,
 @@ -3412,6 +3456,7 @@ endloop:
      args->target = myTarget;
      args->source = mySource;
  }
 +#endif /* #if !UCONFIG_NO_NON_HTML5_CONVERSION */

  static void
  _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
 @@ -3613,6 +3658,7 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
          /* include JIS X 0201 which is hardcoded */
          sa->add(sa->set, 0xa5);
          sa->add(sa->set, 0x203e);
 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
          if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
              /* include Latin-1 for some variants of JP */
              sa->addRange(sa->set, 0, 0xff);
 @@ -3620,6 +3666,10 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
              /* include ASCII for JP */
              sa->addRange(sa->set, 0, 0x7f);
          }
 +#else
 +        /* include ASCII for JP */
 +        sa->addRange(sa->set, 0, 0x7f);
 +#endif
          if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
              /*
               * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
 @@ -3638,6 +3688,7 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
              sa->addRange(sa->set, HWKANA_START, HWKANA_END);
          }
          break;
 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
      case 'c':
      case 'z':
          /* include ASCII for CN */
 @@ -3649,6 +3700,7 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
                  cnvData->currentConverter, sa, which, pErrorCode);
          /* the loop over myConverterArray[] will simply not find another converter */
          break;
 +#endif
      default:
          break;
      }
 @@ -3669,9 +3721,15 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
      for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
          UConverterSetFilter filter;
          if(cnvData->myConverterArray[i]!=NULL) {
 -            if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
 -                cnvData->version==0 && i==CNS_11643
 -            ) {
 +            if(cnvData->locale[0]=='j' && i==JISX208) {
 +                /*
 +                 * Only add code points that map to Shift-JIS codes
 +                 * corresponding to JIS X 0208.
 +                 */
 +                filter=UCNV_SET_FILTER_SJIS;
 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
 +            } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
 +                       cnvData->version==0 && i==CNS_11643) {
                  /*
                   * Version-specific for CN:
                   * CN version 0 does not map CNS planes 3..7 although
 @@ -3680,18 +3738,13 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
                   * The two versions create different Unicode sets.
                   */
                  filter=UCNV_SET_FILTER_2022_CN;
 -            } else if(cnvData->locale[0]=='j' && i==JISX208) {
 -                /*
 -                 * Only add code points that map to Shift-JIS codes
 -                 * corresponding to JIS X 0208.
 -                 */
 -                filter=UCNV_SET_FILTER_SJIS;
              } else if(i==KSC5601) {
                  /*
                   * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
                   * are broader than GR94.
                   */
                  filter=UCNV_SET_FILTER_GR94DBCS;
 +#endif
              } else {
                  filter=UCNV_SET_FILTER_NONE;
              }
 @@ -3829,6 +3882,7 @@ const UConverterSharedData _ISO2022JPData={

  }  // namespace

 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
  /************* KR ***************/
  static const UConverterImpl _ISO2022KRImpl={
      UCNV_ISO_2022,
 @@ -3945,5 +3999,6 @@ const UConverterSharedData _ISO2022CNData={
  };

  }  // namespace
 +#endif /* #if !UCONFIG_NO_NON_HTML5_CONVERSION */

  #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
 diff --git a/source/common/ucnv_bld.cpp b/source/common/ucnv_bld.cpp
 index 4940310..047f18a 100644
 --- a/source/common/ucnv_bld.cpp
 +++ b/source/common/ucnv_bld.cpp
 @@ -69,28 +69,41 @@ converterData[UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES]={

  #if UCONFIG_NO_LEGACY_CONVERSION
      NULL,
 +#else
 +    &_ISO2022Data,
 +#endif
 +
 +#if UCONFIG_NO_LEGACY_CONVERSION || UCONFIG_NO_NON_HTML5_CONVERSION
      NULL, NULL, NULL, NULL, NULL, NULL,
      NULL, NULL, NULL, NULL, NULL, NULL,
      NULL,
  #else
 -    &_ISO2022Data,
      &_LMBCSData1,&_LMBCSData2, &_LMBCSData3, &_LMBCSData4, &_LMBCSData5, &_LMBCSData6,
      &_LMBCSData8,&_LMBCSData11,&_LMBCSData16,&_LMBCSData17,&_LMBCSData18,&_LMBCSData19,
      &_HZData,
  #endif

 +#if UCONFIG_NO_NON_HTML5_CONVERSION
 +    NULL,
 +#else
      &_SCSUData,
 +#endif

 -#if UCONFIG_NO_LEGACY_CONVERSION
 +
 +#if UCONFIG_NO_LEGACY_CONVERSION || UCONFIG_NO_NON_HTML5_CONVERSION
      NULL,
  #else
      &_ISCIIData,
  #endif

      &_ASCIIData,
 +#if UCONFIG_NO_NON_HTML5_CONVERSION
 +    NULL, NULL, &_UTF16Data, &_UTF32Data, NULL, NULL,
 +#else
      &_UTF7Data, &_Bocu1Data, &_UTF16Data, &_UTF32Data, &_CESU8Data, &_IMAPData,
 +#endif

 -#if UCONFIG_NO_LEGACY_CONVERSION
 +#if UCONFIG_NO_LEGACY_CONVERSION || UCONFIG_NO_NON_HTML5_CONVERSION
      NULL,
  #else
      &_CompoundTextData
 @@ -105,18 +118,24 @@ static struct {
    const char *name;
    const UConverterType type;
  } const cnvNameType[] = {
 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
    { "bocu1", UCNV_BOCU1 },
    { "cesu8", UCNV_CESU8 },
 -#if !UCONFIG_NO_LEGACY_CONVERSION
 +#endif
 +#if !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_NON_HTML5_CONVERSION
    { "hz",UCNV_HZ },
  #endif
 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
    { "imapmailboxname", UCNV_IMAP_MAILBOX },
 -#if !UCONFIG_NO_LEGACY_CONVERSION
 +#endif
 +#if !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_NON_HTML5_CONVERSION
    { "iscii", UCNV_ISCII },
 +#endif
 +#if !UCONFIG_NO_LEGACY_CONVERSION
    { "iso2022", UCNV_ISO_2022 },
  #endif
    { "iso88591", UCNV_LATIN_1 },
 -#if !UCONFIG_NO_LEGACY_CONVERSION
 +#if !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_NON_HTML5_CONVERSION
    { "lmbcs1", UCNV_LMBCS_1 },
    { "lmbcs11",UCNV_LMBCS_11 },
    { "lmbcs16",UCNV_LMBCS_16 },
 @@ -130,7 +149,9 @@ static struct {
    { "lmbcs6", UCNV_LMBCS_6 },
    { "lmbcs8", UCNV_LMBCS_8 },
  #endif
 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
    { "scsu", UCNV_SCSU },
 +#endif
    { "usascii", UCNV_US_ASCII },
    { "utf16", UCNV_UTF16 },
    { "utf16be", UCNV_UTF16_BigEndian },
 @@ -152,9 +173,13 @@ static struct {
    { "utf32oppositeendian", UCNV_UTF32_BigEndian },
    { "utf32platformendian", UCNV_UTF32_LittleEndian },
  #endif
 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
    { "utf7", UCNV_UTF7 },
 +#endif
    { "utf8", UCNV_UTF8 },
 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
    { "x11compoundtext", UCNV_COMPOUND_TEXT}
 +#endif
  };


 diff --git a/source/common/ucnv_cnv.h b/source/common/ucnv_cnv.h
 index 402e2c9..5fad446 100644
 --- a/source/common/ucnv_cnv.h
 +++ b/source/common/ucnv_cnv.h
 @@ -256,11 +256,15 @@ struct UConverterImpl {
  extern const UConverterSharedData
      _MBCSData, _Latin1Data,
      _UTF8Data, _UTF16BEData, _UTF16LEData, _UTF32BEData, _UTF32LEData,
 -    _ISO2022Data,
 +    _ISO2022Data,
 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
      _LMBCSData1,_LMBCSData2, _LMBCSData3, _LMBCSData4, _LMBCSData5, _LMBCSData6,
      _LMBCSData8,_LMBCSData11,_LMBCSData16,_LMBCSData17,_LMBCSData18,_LMBCSData19,
      _HZData,_ISCIIData, _SCSUData, _ASCIIData,
      _UTF7Data, _Bocu1Data, _UTF16Data, _UTF32Data, _CESU8Data, _IMAPData, _CompoundTextData;
 +#else
 +    _ASCIIData, _UTF16Data, _UTF32Data;
 +#endif

  U_CDECL_END

 diff --git a/source/common/ucnv_ct.c b/source/common/ucnv_ct.c
 index ec0e9c2..e723fa6 100644
 --- a/source/common/ucnv_ct.c
 +++ b/source/common/ucnv_ct.c
 @@ -14,7 +14,7 @@

  #include "unicode/utypes.h"

 -#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
 +#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_NON_HTML5_CONVERSION

  #include "unicode/ucnv.h"
  #include "unicode/uset.h"
 diff --git a/source/common/ucnv_lmb.c b/source/common/ucnv_lmb.c
 index 1d921dd..a4fccee 100644
 --- a/source/common/ucnv_lmb.c
 +++ b/source/common/ucnv_lmb.c
 @@ -25,7 +25,7 @@

  #include "unicode/utypes.h"

 -#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
 +#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_NON_HTML5_CONVERSION

  #include "unicode/ucnv_err.h"
  #include "unicode/ucnv.h"
 diff --git a/source/common/ucnv_u7.c b/source/common/ucnv_u7.c
 index 42943f4..6466b87 100644
 --- a/source/common/ucnv_u7.c
 +++ b/source/common/ucnv_u7.c
 @@ -16,7 +16,7 @@

  #include "unicode/utypes.h"

 -#if !UCONFIG_NO_CONVERSION
 +#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_NON_HTML5_CONVERSION

  #include "unicode/ucnv.h"
  #include "ucnv_bld.h"
 diff --git a/source/common/ucnv_u8.c b/source/common/ucnv_u8.c
 index 8ee9fe5..24205f5 100644
 --- a/source/common/ucnv_u8.c
 +++ b/source/common/ucnv_u8.c
 @@ -87,6 +87,15 @@ static const int8_t bytesFromUTF8[256] = {
  static const uint32_t
  utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };

 +static UBool hasCESU8Data(const UConverter *cnv)
 +{
 +#if UCONFIG_NO_NON_HTML5_CONVERSION
 +    return FALSE;
 +#else
 +    return (UBool)(cnv->sharedData == &_CESU8Data);
 +#endif
 +}
 +
  static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
                                    UErrorCode * err)
  {
 @@ -96,10 +105,10 @@ static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
      const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
      const UChar *targetLimit = args->targetLimit;
      unsigned char *toUBytes = cnv->toUBytes;
 -    UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data);
 +    UBool isCESU8 = hasCESU8Data(cnv);
      uint32_t ch, ch2 = 0;
      int32_t i, inBytes;
 -
 +
      /* Restore size of current sequence */
      if (cnv->toUnicodeStatus && myTarget < targetLimit)
      {
 @@ -226,7 +235,7 @@ static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
      const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
      const UChar *targetLimit = args->targetLimit;
      unsigned char *toUBytes = cnv->toUBytes;
 -    UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data);
 +    UBool isCESU8 = hasCESU8Data(cnv);
      uint32_t ch, ch2 = 0;
      int32_t i, inBytes;

 @@ -357,7 +366,7 @@ U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
      UChar32 ch;
      uint8_t tempBuf[4];
      int32_t indexToWrite;
 -    UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data);
 +    UBool isNotCESU8 = !hasCESU8Data(cnv);

      if (cnv->fromUChar32 && myTarget < targetLimit)
      {
 @@ -473,7 +482,7 @@ U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * ar
      int32_t offsetNum, nextSourceIndex;
      int32_t indexToWrite;
      uint8_t tempBuf[4];
 -    UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data);
 +    UBool isNotCESU8 = !hasCESU8Data(cnv);

      if (cnv->fromUChar32 && myTarget < targetLimit)
      {
 diff --git a/source/common/ucnvbocu.cpp b/source/common/ucnvbocu.cpp
 index b97d666..281d6d9 100644
 --- a/source/common/ucnvbocu.cpp
 +++ b/source/common/ucnvbocu.cpp
 @@ -19,7 +19,7 @@

  #include "unicode/utypes.h"

 -#if !UCONFIG_NO_CONVERSION
 +#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_NON_HTML5_CONVERSION

  #include "unicode/ucnv.h"
  #include "unicode/ucnv_cb.h"
 diff --git a/source/common/ucnvhz.c b/source/common/ucnvhz.c
 index 3760c39..51825e2 100644
 --- a/source/common/ucnvhz.c
 +++ b/source/common/ucnvhz.c
 @@ -16,7 +16,7 @@

  #include "unicode/utypes.h"

 -#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
 +#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_NON_HTML5_CONVERSION

  #include "cmemory.h"
  #include "unicode/ucnv.h"
 @@ -635,4 +635,4 @@ const UConverterSharedData _HZData={
          0
  };

 -#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
 +#endif /* #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_NON_HTML5_CONVERSION */
 diff --git a/source/common/ucnvisci.c b/source/common/ucnvisci.c
 index fe61d40..16fd0a3 100644
 --- a/source/common/ucnvisci.c
 +++ b/source/common/ucnvisci.c
 @@ -17,7 +17,7 @@

  #include "unicode/utypes.h"

 -#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
 +#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_NON_HTML5_CONVERSION

  #include "unicode/ucnv.h"
  #include "unicode/ucnv_cb.h"
 diff --git a/source/common/ucnvscsu.c b/source/common/ucnvscsu.c
 index c6e96e1..a6f8c9e 100644
 --- a/source/common/ucnvscsu.c
 +++ b/source/common/ucnvscsu.c
 @@ -21,7 +21,7 @@

  #include "unicode/utypes.h"

 -#if !UCONFIG_NO_CONVERSION
 +#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_NON_HTML5_CONVERSION

  #include "unicode/ucnv.h"
  #include "unicode/ucnv_cb.h"
 diff --git a/source/common/unicode/uconfig.h b/source/common/unicode/uconfig.h
 index ed073b6..8df56e6 100644
 --- a/source/common/unicode/uconfig.h
 +++ b/source/common/unicode/uconfig.h
 @@ -270,6 +270,14 @@
  #endif

  /**
 + * This switch turns off all the converters NOT listed in
 + * the encoding standard : http://encoding.spec.whatwg.org
 + */
 +#ifndef UCONFIG_NO_NON_HTML5_CONVERSION
 +#define UCONFIG_NO_NON_HTML5_CONVERSION 0
 +#endif
 +
 +/**
   * \def UCONFIG_NO_LEGACY_CONVERSION
   * This switch turns off all converters except for
   * - Unicode charsets (UTF-7/8/16/32, CESU-8, SCSU, BOCU-1)
 diff --git a/source/common/unicode/urename.h b/source/common/unicode/urename.h
 index a817262..89becca 100644
 --- a/source/common/unicode/urename.h
 +++ b/source/common/unicode/urename.h
 @@ -73,12 +73,14 @@
  #define UDataMemory_setData U_ICU_ENTRY_POINT_RENAME(UDataMemory_setData)
  #define UDatamemory_assign U_ICU_ENTRY_POINT_RENAME(UDatamemory_assign)
  #define _ASCIIData U_ICU_ENTRY_POINT_RENAME(_ASCIIData)
 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
  #define _Bocu1Data U_ICU_ENTRY_POINT_RENAME(_Bocu1Data)
  #define _CESU8Data U_ICU_ENTRY_POINT_RENAME(_CESU8Data)
  #define _CompoundTextData U_ICU_ENTRY_POINT_RENAME(_CompoundTextData)
  #define _HZData U_ICU_ENTRY_POINT_RENAME(_HZData)
  #define _IMAPData U_ICU_ENTRY_POINT_RENAME(_IMAPData)
  #define _ISCIIData U_ICU_ENTRY_POINT_RENAME(_ISCIIData)
 +#endif
  #define _ISO2022Data U_ICU_ENTRY_POINT_RENAME(_ISO2022Data)
  #define _LMBCSData1 U_ICU_ENTRY_POINT_RENAME(_LMBCSData1)
  #define _LMBCSData11 U_ICU_ENTRY_POINT_RENAME(_LMBCSData11)
 @@ -94,14 +96,18 @@
  #define _LMBCSData8 U_ICU_ENTRY_POINT_RENAME(_LMBCSData8)
  #define _Latin1Data U_ICU_ENTRY_POINT_RENAME(_Latin1Data)
  #define _MBCSData U_ICU_ENTRY_POINT_RENAME(_MBCSData)
 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
  #define _SCSUData U_ICU_ENTRY_POINT_RENAME(_SCSUData)
 +#endif
  #define _UTF16BEData U_ICU_ENTRY_POINT_RENAME(_UTF16BEData)
  #define _UTF16Data U_ICU_ENTRY_POINT_RENAME(_UTF16Data)
  #define _UTF16LEData U_ICU_ENTRY_POINT_RENAME(_UTF16LEData)
  #define _UTF32BEData U_ICU_ENTRY_POINT_RENAME(_UTF32BEData)
  #define _UTF32Data U_ICU_ENTRY_POINT_RENAME(_UTF32Data)
  #define _UTF32LEData U_ICU_ENTRY_POINT_RENAME(_UTF32LEData)
 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
  #define _UTF7Data U_ICU_ENTRY_POINT_RENAME(_UTF7Data)
 +#endif
  #define _UTF8Data U_ICU_ENTRY_POINT_RENAME(_UTF8Data)
  #define cmemory_cleanup U_ICU_ENTRY_POINT_RENAME(cmemory_cleanup)
  #define cmemory_inUse U_ICU_ENTRY_POINT_RENAME(cmemory_inUse)
 diff --git a/source/i18n/csdetect.cpp b/source/i18n/csdetect.cpp
 index 3efbd49..ba5b18c 100644
 --- a/source/i18n/csdetect.cpp
 +++ b/source/i18n/csdetect.cpp
 @@ -110,6 +110,7 @@ static void U_CALLCONV initRecognizers(UErrorCode &status) {
          new CSRecognizerInfo(new CharsetRecog_big5(), TRUE),

          new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE),
 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
          new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE),
          new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE),

 @@ -117,6 +118,7 @@ static void U_CALLCONV initRecognizers(UErrorCode &status) {
          new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE),
          new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE),
          new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE)
 +#endif
      };
      int32_t rCount = ARRAY_SIZE(tempArray);

 diff --git a/source/i18n/csr2022.cpp b/source/i18n/csr2022.cpp
 index 3db0bc9..be3eafa 100644
 --- a/source/i18n/csr2022.cpp
 +++ b/source/i18n/csr2022.cpp
 @@ -119,6 +119,7 @@ static const uint8_t escapeSequences_2022JP[][5] = {
      {0x1b, 0x2e, 0x46, 0x00, 0x00}    // ISO 8859-7
  };

 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
  static const uint8_t escapeSequences_2022KR[][5] = {
      {0x1b, 0x24, 0x29, 0x43, 0x00}
  };
 @@ -136,6 +137,7 @@ static const uint8_t escapeSequences_2022CN[][5] = {
      {0x1b, 0x4e, 0x00, 0x00, 0x00},   // SS2
      {0x1b, 0x4f, 0x00, 0x00, 0x00},   // SS3
  };
 +#endif

  CharsetRecog_2022JP::~CharsetRecog_2022JP() {}

 @@ -152,6 +154,7 @@ UBool CharsetRecog_2022JP::match(InputText *textIn, CharsetMatch *results) const
      return (confidence > 0);
  }

 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
  CharsetRecog_2022KR::~CharsetRecog_2022KR() {}

  const char *CharsetRecog_2022KR::getName() const {
 @@ -181,6 +184,7 @@ UBool CharsetRecog_2022CN::match(InputText *textIn, CharsetMatch *results) const
      results->set(textIn, this, confidence);
      return (confidence > 0);
  }
 +#endif

  CharsetRecog_2022::~CharsetRecog_2022() {
      // nothing to do
 diff --git a/source/i18n/csr2022.h b/source/i18n/csr2022.h
 index 2ac2b87..dad22c7 100644
 --- a/source/i18n/csr2022.h
 +++ b/source/i18n/csr2022.h
 @@ -65,6 +65,7 @@ public:
      UBool match(InputText *textIn, CharsetMatch *results) const;
  };

 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
  class CharsetRecog_2022KR :public CharsetRecog_2022 {
  public:
      virtual ~CharsetRecog_2022KR();
 @@ -84,6 +85,7 @@ public:

      UBool match(InputText *textIn, CharsetMatch *results) const;
  };
 +#endif

  U_NAMESPACE_END

 diff --git a/source/i18n/csrsbcs.cpp b/source/i18n/csrsbcs.cpp
 index d03367c..7b70dc1 100644
 --- a/source/i18n/csrsbcs.cpp
 +++ b/source/i18n/csrsbcs.cpp
 @@ -137,6 +137,7 @@ int32_t NGramParser::parse(InputText *det)
      return (int32_t) (rawPercent * 300.0);
  }

 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
  static const uint8_t unshapeMap_IBM420[] = {
  /*           -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
  /* 0- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
 @@ -232,6 +233,7 @@ void NGramParser_IBM420::parseCharacters(InputText *det)
          }
      }
  }
 +#endif

  CharsetRecog_sbcs::CharsetRecog_sbcs()
  {
 @@ -624,6 +626,7 @@ static const uint8_t charMap_KOI8_R[] = {
      0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
  };

 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
  static const int32_t ngrams_IBM424_he_rtl[] = {
      0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641,
      0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045,
 @@ -691,6 +694,7 @@ static const uint8_t charMap_IBM420_ar[]= {
  /* E- */    0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0xEB, 0x40, 0xED, 0xEE, 0xEF,
  /* F- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40,
  };
 +#endif

  //ISO-8859-1,2,5,6,7,8,9 Ngrams

 @@ -1155,6 +1159,7 @@ UBool CharsetRecog_KOI8_R::match(InputText *textIn, CharsetMatch *results) const
      return (confidence > 0);
  }

 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
  CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he()
  {
      // nothing to do
 @@ -1253,6 +1258,7 @@ UBool CharsetRecog_IBM420_ar_ltr::match(InputText *textIn, CharsetMatch *results
      results->set(textIn, this, confidence);
      return (confidence > 0);
  }
 +#endif

  U_NAMESPACE_END
  #endif
 diff --git a/source/i18n/csrsbcs.h b/source/i18n/csrsbcs.h
 index 2579c02..7789f9b 100644
 --- a/source/i18n/csrsbcs.h
 +++ b/source/i18n/csrsbcs.h
 @@ -50,6 +50,7 @@ public:

  };

 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
  class NGramParser_IBM420 : public NGramParser
  {
  private:
 @@ -61,6 +62,7 @@ private:
  public:
      NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap);
  };
 +#endif


  class CharsetRecog_sbcs : public CharsetRecognizer
 @@ -229,6 +231,7 @@ public:
      virtual UBool match(InputText *det, CharsetMatch *results) const;
  };

 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
  class CharsetRecog_IBM424_he : public CharsetRecog_sbcs
  {
  public:
 @@ -280,6 +283,7 @@ class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar {

      virtual UBool match(InputText *det, CharsetMatch *results) const;
  };
 +#endif

  U_NAMESPACE_END