patches/uniset_perf2.patch - third_party/icu - Git at Google

 diff --git a/source/common/characterproperties.cpp b/source/common/characterproperties.cpp
 index 3aff85b3..b416ef52 100644
 --- a/source/common/characterproperties.cpp
 +++ b/source/common/characterproperties.cpp
 @@ -23,6 +23,9 @@
  #include "umutex.h"
  #include "uprops.h"

 +using icu::LocalPointer;
 +using icu::Normalizer2Factory;
 +using icu::Normalizer2Impl;
  using icu::UInitOnce;
  using icu::UnicodeSet;

 @@ -30,11 +33,13 @@ namespace {

  UBool U_CALLCONV characterproperties_cleanup();

 +constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + UCHAR_INT_LIMIT - UCHAR_INT_START;
 +
  struct Inclusion {
      UnicodeSet  *fSet;
      UInitOnce    fInitOnce;
  };
 -Inclusion gInclusions[UPROPS_SRC_COUNT]; // cached getInclusions()
 +Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions()

  UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {};

 @@ -80,35 +85,22 @@ UBool U_CALLCONV characterproperties_cleanup() {
      return TRUE;
  }

 -}  // namespace
 -
 -U_NAMESPACE_BEGIN
 -
 -/*
 -Reduce excessive reallocation, and make it easier to detect initialization problems.
 -Usually you don't see smaller sets than this for Unicode 5.0.
 -*/
 -constexpr int32_t DEFAULT_INCLUSION_CAPACITY = 3072;
 -
 -void U_CALLCONV CharacterProperties::initInclusion(UPropertySource src, UErrorCode &errorCode) {
 +void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
      // This function is invoked only via umtx_initOnce().
 -    // This function is a friend of class UnicodeSet.
 -
      U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT);
      if (src == UPROPS_SRC_NONE) {
          errorCode = U_INTERNAL_PROGRAM_ERROR;
          return;
      }
 -    UnicodeSet * &incl = gInclusions[src].fSet;
 -    U_ASSERT(incl == nullptr);
 +    U_ASSERT(gInclusions[src].fSet == nullptr);

 -    incl = new UnicodeSet();
 -    if (incl == nullptr) {
 +    LocalPointer<UnicodeSet> incl(new UnicodeSet());
 +    if (incl.isNull()) {
          errorCode = U_MEMORY_ALLOCATION_ERROR;
          return;
      }
      USetAdder sa = {
 -        (USet *)incl,
 +        (USet *)incl.getAlias(),
          _set_add,
          _set_addRange,
          _set_addString,
 @@ -116,7 +108,6 @@ void U_CALLCONV CharacterProperties::initInclusion(UPropertySource src, UErrorCo
          nullptr // don't need removeRange()
      };

 -    incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, errorCode);
      switch(src) {
      case UPROPS_SRC_CHAR:
          uchar_addPropertyStarts(&sa, &errorCode);
 @@ -183,12 +174,15 @@ void U_CALLCONV CharacterProperties::initInclusion(UPropertySource src, UErrorCo
      }

      if (U_FAILURE(errorCode)) {
 -        delete incl;
 -        incl = nullptr;
          return;
      }
 -    // Compact for caching
 +    if (incl->isBogus()) {
 +        errorCode = U_MEMORY_ALLOCATION_ERROR;
 +        return;
 +    }
 +    // Compact for caching.
      incl->compact();
 +    gInclusions[src].fSet = incl.orphan();
      ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
  }

 @@ -199,15 +193,66 @@ const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorC
          return nullptr;
      }
      Inclusion &i = gInclusions[src];
 -    umtx_initOnce(i.fInitOnce, &CharacterProperties::initInclusion, src, errorCode);
 +    umtx_initOnce(i.fInitOnce, &initInclusion, src, errorCode);
      return i.fSet;
  }

 +void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) {
 +    // Builds and caches the set of code points where the value of int property prop changes. Invoked only via umtx_initOnce().
 +    U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT);
 +    int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START;  // int-property slots follow the per-source slots
 +    U_ASSERT(gInclusions[inclIndex].fSet == nullptr);
 +    UPropertySource src = uprops_getSource(prop);
 +    const UnicodeSet *incl = getInclusionsForSource(src, errorCode);  // per-source set; only its code points are examined below
 +    if (U_FAILURE(errorCode)) {
 +        return;
 +    }
 +
 +    LocalPointer<UnicodeSet> intPropIncl(new UnicodeSet(0, 0));  // starts out containing U+0000
 +    if (intPropIncl.isNull()) {
 +        errorCode = U_MEMORY_ALLOCATION_ERROR;
 +        return;
 +    }
 +    int32_t numRanges = incl->getRangeCount();
 +    int32_t prevValue = 0;  // property value at the most recently recorded change; initially 0
 +    for (int32_t i = 0; i < numRanges; ++i) {
 +        UChar32 rangeEnd = incl->getRangeEnd(i);
 +        for (UChar32 c = incl->getRangeStart(i); c <= rangeEnd; ++c) {
 +            // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
 +            int32_t value = u_getIntPropertyValue(c, prop);
 +            if (value != prevValue) {
 +                intPropIncl->add(c);  // record c as the start of a run with a different value
 +                prevValue = value;
 +            }
 +        }
 +    }
 +
 +    if (intPropIncl->isBogus()) {  // add() has no error parameter; a bogus set is reported as an allocation failure
 +        errorCode = U_MEMORY_ALLOCATION_ERROR;
 +        return;
 +    }
 +    // Compact for caching.
 +    intPropIncl->compact();
 +    gInclusions[inclIndex].fSet = intPropIncl.orphan();  // store the finished set in its cache slot
 +    ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
 +}
 +
 +}  // namespace
 +
 +U_NAMESPACE_BEGIN
 +
  const UnicodeSet *CharacterProperties::getInclusionsForProperty(
          UProperty prop, UErrorCode &errorCode) {
      if (U_FAILURE(errorCode)) { return nullptr; }
 -    UPropertySource src = uprops_getSource(prop);
 -    return getInclusionsForSource(src, errorCode);
 +    if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {  // int-valued properties each get their own cached set
 +        int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START;  // same slot formula as initIntPropInclusion()
 +        Inclusion &i = gInclusions[inclIndex];
 +        umtx_initOnce(i.fInitOnce, &initIntPropInclusion, prop, errorCode);  // builds the set on first use
 +        return i.fSet;
 +    } else {  // every other property uses the set cached for its property source
 +        UPropertySource src = uprops_getSource(prop);
 +        return getInclusionsForSource(src, errorCode);
 +    }
  }

  U_NAMESPACE_END
 @@ -216,7 +261,7 @@ namespace {

  UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) {
      if (U_FAILURE(errorCode)) { return nullptr; }
 -    icu::LocalPointer<UnicodeSet> set(new UnicodeSet());
 +    LocalPointer<UnicodeSet> set(new UnicodeSet());
      if (set.isNull()) {
          errorCode = U_MEMORY_ALLOCATION_ERROR;
          return nullptr;
 diff --git a/source/common/ucptrie.cpp b/source/common/ucptrie.cpp
 index 13496ad5..b72e3183 100644
 --- a/source/common/ucptrie.cpp
 +++ b/source/common/ucptrie.cpp
 @@ -280,7 +280,7 @@ UChar32 getRange(const void *t, UChar32 start,
      int32_t prevI3Block = -1;
      int32_t prevBlock = -1;
      UChar32 c = start;
 -    uint32_t value;
 +    uint32_t trieValue, value;
      bool haveValue = false;
      do {
          int32_t i3Block;
 @@ -319,6 +319,7 @@ UChar32 getRange(const void *t, UChar32 start,
                          return c - 1;
                      }
                  } else {
 +                    trieValue = trie->nullValue;
                      value = nullValue;
                      if (pValue != nullptr) { *pValue = nullValue; }
                      haveValue = true;
 @@ -357,6 +358,7 @@ UChar32 getRange(const void *t, UChar32 start,
                              return c - 1;
                          }
                      } else {
 +                        trieValue = trie->nullValue;
                          value = nullValue;
                          if (pValue != nullptr) { *pValue = nullValue; }
                          haveValue = true;
 @@ -364,23 +366,32 @@ UChar32 getRange(const void *t, UChar32 start,
                      c = (c + dataBlockLength) & ~dataMask;
                  } else {
                      int32_t di = block + (c & dataMask);
 -                    uint32_t value2 = getValue(trie->data, valueWidth, di);
 -                    value2 = maybeFilterValue(value2, trie->nullValue, nullValue,
 -                                              filter, context);
 +                    uint32_t trieValue2 = getValue(trie->data, valueWidth, di);
                      if (haveValue) {
 -                        if (value2 != value) {
 -                            return c - 1;
 +                        if (trieValue2 != trieValue) {
 +                            if (filter == nullptr ||
 +                                    maybeFilterValue(trieValue2, trie->nullValue, nullValue,
 +                                                     filter, context) != value) {
 +                                return c - 1;
 +                            }
 +                            trieValue = trieValue2;  // may or may not help
                          }
                      } else {
 -                        value = value2;
 +                        trieValue = trieValue2;
 +                        value = maybeFilterValue(trieValue2, trie->nullValue, nullValue,
 +                                                 filter, context);
                          if (pValue != nullptr) { *pValue = value; }
                          haveValue = true;
                      }
                      while ((++c & dataMask) != 0) {
 -                        if (maybeFilterValue(getValue(trie->data, valueWidth, ++di),
 -                                             trie->nullValue, nullValue,
 -                                             filter, context) != value) {
 -                            return c - 1;
 +                        trieValue2 = getValue(trie->data, valueWidth, ++di);
 +                        if (trieValue2 != trieValue) {
 +                            if (filter == nullptr ||
 +                                    maybeFilterValue(trieValue2, trie->nullValue, nullValue,
 +                                                     filter, context) != value) {
 +                                return c - 1;
 +                            }
 +                            trieValue = trieValue2;  // may or may not help
                          }
                      }
                  }
 diff --git a/source/common/umutablecptrie.cpp b/source/common/umutablecptrie.cpp
 index 44af8309..926be468 100644
 --- a/source/common/umutablecptrie.cpp
 +++ b/source/common/umutablecptrie.cpp
 @@ -304,41 +304,56 @@ UChar32 MutableCodePointTrie::getRange(
      uint32_t nullValue = initialValue;
      if (filter != nullptr) { nullValue = filter(context, nullValue); }
      UChar32 c = start;
 -    uint32_t value;
 +    uint32_t trieValue, value;
      bool haveValue = false;
      int32_t i = c >> UCPTRIE_SHIFT_3;
      do {
          if (flags[i] == ALL_SAME) {
 -            uint32_t value2 = maybeFilterValue(index[i], initialValue, nullValue,
 -                                               filter, context);
 +            uint32_t trieValue2 = index[i];
              if (haveValue) {
 -                if (value2 != value) {
 -                    return c - 1;
 +                if (trieValue2 != trieValue) {
 +                    if (filter == nullptr ||
 +                            maybeFilterValue(trieValue2, initialValue, nullValue,
 +                                             filter, context) != value) {
 +                        return c - 1;
 +                    }
 +                    trieValue = trieValue2;  // may or may not help
                  }
              } else {
 -                value = value2;
 +                trieValue = trieValue2;
 +                value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context);
                  if (pValue != nullptr) { *pValue = value; }
                  haveValue = true;
              }
              c = (c + UCPTRIE_SMALL_DATA_BLOCK_LENGTH) & ~UCPTRIE_SMALL_DATA_MASK;
          } else /* MIXED */ {
              int32_t di = index[i] + (c & UCPTRIE_SMALL_DATA_MASK);
 -            uint32_t value2 = maybeFilterValue(data[di], initialValue, nullValue,
 -                                               filter, context);
 +            uint32_t trieValue2 = data[di];
              if (haveValue) {
 -                if (value2 != value) {
 -                    return c - 1;
 +                if (trieValue2 != trieValue) {
 +                    if (filter == nullptr ||
 +                            maybeFilterValue(trieValue2, initialValue, nullValue,
 +                                             filter, context) != value) {
 +                        return c - 1;
 +                    }
 +                    trieValue = trieValue2;  // may or may not help
                  }
              } else {
 -                value = value2;
 +                trieValue = trieValue2;
 +                value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context);
                  if (pValue != nullptr) { *pValue = value; }
                  haveValue = true;
              }
              while ((++c & UCPTRIE_SMALL_DATA_MASK) != 0) {
 -                if (maybeFilterValue(data[++di], initialValue, nullValue,
 -                                     filter, context) != value) {
 -                    return c - 1;
 +                trieValue2 = data[++di];
 +                if (trieValue2 != trieValue) {
 +                    if (filter == nullptr ||
 +                            maybeFilterValue(trieValue2, initialValue, nullValue,
 +                                             filter, context) != value) {
 +                        return c - 1;
 +                    }
                  }
 +                trieValue = trieValue2;  // may or may not help
              }
          }
          ++i;
 diff --git a/source/common/unicode/uniset.h b/source/common/unicode/uniset.h
 index 0abc7542..af56b872 100644
 --- a/source/common/unicode/uniset.h
 +++ b/source/common/unicode/uniset.h
 @@ -27,7 +27,6 @@ U_NAMESPACE_BEGIN

  // Forward Declarations.
  class BMPSet;
 -class CharacterProperties;
  class ParsePosition;
  class RBBIRuleScanner;
  class SymbolTable;
 @@ -276,14 +275,23 @@ class RuleCharacterIterator;
   * @stable ICU 2.0
   */
  class U_COMMON_API UnicodeSet U_FINAL : public UnicodeFilter {
 +private:
 +    /**
 +     * Enough for sets with few ranges.
 +     * For example, White_Space has 10 ranges, list length 21.
 +     */
 +    static constexpr int32_t INITIAL_CAPACITY = 25;
 +    // fFlags constant
 +    static constexpr uint8_t kIsBogus = 1;  // This set is bogus (i.e. not valid)
 +
 +    UChar32* list = stackList; // MUST be terminated with HIGH
 +    int32_t capacity = INITIAL_CAPACITY; // capacity of list
 +    int32_t len = 1; // length of list used; 1 <= len <= capacity
 +    uint8_t fFlags = 0;         // Bit flag (see constants above)

 -    int32_t len; // length of list used; 0 <= len <= capacity
 -    int32_t capacity; // capacity of list
 -    UChar32* list; // MUST be terminated with HIGH
 -    BMPSet *bmpSet; // The set is frozen iff either bmpSet or stringSpan is not NULL.
 -    UChar32* buffer; // internal buffer, may be NULL
 -    int32_t bufferCapacity; // capacity of buffer
 -    int32_t patLen;
 +    BMPSet *bmpSet = nullptr; // The set is frozen iff either bmpSet or stringSpan is not NULL.
 +    UChar32* buffer = nullptr; // internal buffer, may be NULL
 +    int32_t bufferCapacity = 0; // capacity of buffer

      /**
       * The pattern representation of this set.  This may not be the
 @@ -294,15 +302,19 @@ class U_COMMON_API UnicodeSet U_FINAL : public UnicodeFilter {
       * indicating that toPattern() must generate a pattern
       * representation from the inversion list.
       */
 -    char16_t *pat;
 -    UVector* strings; // maintained in sorted order
 -    UnicodeSetStringSpan *stringSpan;
 +    char16_t *pat = nullptr;
 +    int32_t patLen = 0;
 +
 +    UVector* strings = nullptr; // maintained in sorted order
 +    UnicodeSetStringSpan *stringSpan = nullptr;
 +
 +    /**
 +     * Initial list array.
 +     * Avoids some heap allocations, and list is never nullptr.
 +     * Increases the object size a bit.
 +     */
 +    UChar32 stackList[INITIAL_CAPACITY];

 -private:
 -    enum { // constants
 -        kIsBogus = 1       // This set is bogus (i.e. not valid)
 -    };
 -    uint8_t fFlags;         // Bit flag (see constants above)
  public:
      /**
       * Determine if this object contains a valid set.
 @@ -1480,8 +1492,6 @@ private:

      friend class USetAccess;

 -    int32_t getStringCount() const;
 -
      const UnicodeString* getString(int32_t index) const;

      //----------------------------------------------------------------
 @@ -1528,13 +1538,18 @@ private:
      // Implementation: Utility methods
      //----------------------------------------------------------------

 -    void ensureCapacity(int32_t newLen, UErrorCode& ec);
 +    static int32_t nextCapacity(int32_t minCapacity);
 +
 +    bool ensureCapacity(int32_t newLen);

 -    void ensureBufferCapacity(int32_t newLen, UErrorCode& ec);
 +    bool ensureBufferCapacity(int32_t newLen);

      void swapBuffers(void);

      UBool allocateStrings(UErrorCode &status);
 +    UBool hasStrings() const;
 +    int32_t stringsSize() const;
 +    UBool stringsContains(const UnicodeString &s) const;

      UnicodeString& _toPattern(UnicodeString& result,
                                UBool escapeUnprintable) const;
 @@ -1614,7 +1629,6 @@ private:
                                UnicodeString& rebuiltPat,
                                UErrorCode& ec);

 -    friend class CharacterProperties;
      static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status);

      /**
 @@ -1646,7 +1660,10 @@ private:
      /**
       * Set the new pattern to cache.
       */
 -    void setPattern(const UnicodeString& newPat);
 +    void setPattern(const UnicodeString& newPat) {  // convenience overload, defined inline
 +        setPattern(newPat.getBuffer(), newPat.length());  // forwards to the pointer+length overload below
 +    }
 +    void setPattern(const char16_t *newPat, int32_t newPatLen);  // takes the pattern as pointer + length
      /**
       * Release existing cached pattern.
       */
 diff --git a/source/common/uniset.cpp b/source/common/uniset.cpp
 index e8378e0a..20242776 100644
 --- a/source/common/uniset.cpp
 +++ b/source/common/uniset.cpp
 @@ -14,6 +14,7 @@
  #include "unicode/parsepos.h"
  #include "unicode/symtable.h"
  #include "unicode/uniset.h"
 +#include "unicode/ustring.h"
  #include "unicode/utf8.h"
  #include "unicode/utf16.h"
  #include "ruleiter.h"
 @@ -53,11 +54,8 @@
  // LOW <= all valid values. ZERO for codepoints
  #define UNICODESET_LOW 0x000000

 -// initial storage. Must be >= 0
 -#define START_EXTRA 16
 -
 -// extra amount for growth. Must be >= 0
 -#define GROW_EXTRA START_EXTRA
 +/** Max list [0, 1, 2, ..., max code point, HIGH] */
 +constexpr int32_t MAX_LENGTH = UNICODESET_HIGH + 1;

  U_NAMESPACE_BEGIN

 @@ -137,6 +135,18 @@ static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
      return a.compare(b);
  }

 +UBool UnicodeSet::hasStrings() const {  // true only if the strings vector exists and is non-empty
 +    return strings != nullptr && !strings->isEmpty();  // strings may be nullptr, so check before dereferencing
 +}
 +
 +int32_t UnicodeSet::stringsSize() const {  // number of strings in the set
 +    return strings == nullptr ? 0 : strings->size();  // a missing vector counts as zero strings
 +}
 +
 +UBool UnicodeSet::stringsContains(const UnicodeString &s) const {  // null-safe lookup of s in the strings vector
 +    return strings != nullptr && strings->contains((void*) &s);  // a missing vector contains nothing
 +}
 +
  //----------------------------------------------------------------
  // Constructors &c
  //----------------------------------------------------------------
 @@ -144,24 +154,8 @@ static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
  /**
   * Constructs an empty set.
   */
 -UnicodeSet::UnicodeSet() :
 -    len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
 -    bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
 -    fFlags(0)
 -{
 -    UErrorCode status = U_ZERO_ERROR;
 -    allocateStrings(status);
 -    if (U_FAILURE(status)) {
 -        setToBogus(); // If memory allocation failed, set to bogus state.
 -        return;
 -    }
 -    list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 -    if(list!=NULL){
 -        list[0] = UNICODESET_HIGH;
 -    } else { // If memory allocation failed, set to bogus state.
 -        setToBogus();
 -        return;
 -    }
 +UnicodeSet::UnicodeSet() {  // members take the in-class defaults declared in uniset.h
 +    list[0] = UNICODESET_HIGH;  // list defaults to the in-object stackList, so nothing is allocated here
      _dbgct(this);
  }

 @@ -172,89 +166,39 @@ UnicodeSet::UnicodeSet() :
   * @param start first character, inclusive, of range
   * @param end last character, inclusive, of range
   */
 -UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
 -    len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
 -    bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
 -    fFlags(0)
 -{
 -    UErrorCode status = U_ZERO_ERROR;
 -    allocateStrings(status);
 -    if (U_FAILURE(status)) {
 -        setToBogus(); // If memory allocation failed, set to bogus state.
 -        return;
 -    }
 -    list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 -    if(list!=NULL){
 -        list[0] = UNICODESET_HIGH;
 -        complement(start, end);
 -    } else { // If memory allocation failed, set to bogus state.
 -        setToBogus();
 -        return;
 -    }
 +UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) {  // members take the in-class defaults declared in uniset.h
 +    list[0] = UNICODESET_HIGH;  // terminate the (still empty) list held in stackList
 +    add(start, end);  // populate the empty set with the requested range
      _dbgct(this);
  }

  /**
   * Constructs a set that is identical to the given UnicodeSet.
   */
 -UnicodeSet::UnicodeSet(const UnicodeSet& o) :
 -    UnicodeFilter(o),
 -    len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0),
 -    bmpSet(0),
 -    buffer(0), bufferCapacity(0),
 -    patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
 -    fFlags(0)
 -{
 -    UErrorCode status = U_ZERO_ERROR;
 -    allocateStrings(status);
 -    if (U_FAILURE(status)) {
 -        setToBogus(); // If memory allocation failed, set to bogus state.
 -        return;
 -    }
 -    list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 -    if(list!=NULL){
 -        *this = o;
 -    } else { // If memory allocation failed, set to bogus state.
 -        setToBogus();
 -        return;
 -    }
 +UnicodeSet::UnicodeSet(const UnicodeSet& o) : UnicodeFilter(o) {  // other members start from their in-class defaults
 +    *this = o;  // no explicit allocation here; the copy is delegated to the assignment operator
      _dbgct(this);
  }

  // Copy-construct as thawed.
 -UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) :
 -    UnicodeFilter(o),
 -    len(0), capacity(o.len + GROW_EXTRA), list(0),
 -    bmpSet(0),
 -    buffer(0), bufferCapacity(0),
 -    patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
 -    fFlags(0)
 -{
 -    UErrorCode status = U_ZERO_ERROR;
 -    allocateStrings(status);
 -    if (U_FAILURE(status)) {
 -        setToBogus(); // If memory allocation failed, set to bogus state.
 -        return;
 -    }
 -    list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 -    if(list!=NULL){
 +UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : UnicodeFilter(o) {
 +    if (ensureCapacity(o.len)) {  // if this returns false, nothing is copied
          // *this = o except for bmpSet and stringSpan
          len = o.len;
          uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
 -        if (strings != NULL && o.strings != NULL) {
 -            strings->assign(*o.strings, cloneUnicodeString, status);
 -        } else { // Invalid strings.
 -            setToBogus();
 -            return;
 +        if (o.hasStrings()) {  // allocate the strings vector only when there is something to copy
 +            UErrorCode status = U_ZERO_ERROR;
 +            if (!allocateStrings(status) ||
 +                    (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {  // comma operator: assign, then test status
 +                setToBogus();
 +                return;
 +            }
          }
          if (o.pat) {
 -            setPattern(UnicodeString(o.pat, o.patLen));
 +            setPattern(o.pat, o.patLen);  // pointer+length overload; no temporary UnicodeString
          }
 -    } else { // If memory allocation failed, set to bogus state.
 -        setToBogus();
 -        return;
 +        _dbgct(this);  // reached only when the copy succeeded
      }
 -    _dbgct(this);
  }

  /**
 @@ -262,9 +206,11 @@ UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) :
   */
  UnicodeSet::~UnicodeSet() {
      _dbgdt(this); // first!
 -    uprv_free(list);
 +    if (list != stackList) {
 +        uprv_free(list);
 +    }
      delete bmpSet;
 -    if (buffer) {
 +    if (buffer != stackList) {
          uprv_free(buffer);
      }
      delete strings;
 @@ -290,32 +236,30 @@ UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) {
          setToBogus();
          return *this;
      }
 -    UErrorCode ec = U_ZERO_ERROR;
 -    ensureCapacity(o.len, ec);
 -    if (U_FAILURE(ec)) {
 +    if (!ensureCapacity(o.len)) {
          // ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens.
          return *this;
      }
      len = o.len;
      uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
 -    if (o.bmpSet == NULL || asThawed) {
 -        bmpSet = NULL;
 -    } else {
 +    if (o.bmpSet != nullptr && !asThawed) {
          bmpSet = new BMPSet(*o.bmpSet, list, len);
          if (bmpSet == NULL) { // Check for memory allocation error.
              setToBogus();
              return *this;
          }
      }
 -    if (strings != NULL && o.strings != NULL) {
 -        strings->assign(*o.strings, cloneUnicodeString, ec);
 -    } else { // Invalid strings.
 -        setToBogus();
 -        return *this;
 +    if (o.hasStrings()) {
 +        UErrorCode status = U_ZERO_ERROR;
 +        if ((strings == nullptr && !allocateStrings(status)) ||
 +                (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
 +            setToBogus();
 +            return *this;
 +        }
 +    } else if (hasStrings()) {
 +        strings->removeAllElements();
      }
 -    if (o.stringSpan == NULL || asThawed) {
 -        stringSpan = NULL;
 -    } else {
 +    if (o.stringSpan != nullptr && !asThawed) {
          stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
          if (stringSpan == NULL) { // Check for memory allocation error.
              setToBogus();
 @@ -324,7 +268,7 @@ UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) {
      }
      releasePattern();
      if (o.pat) {
 -        setPattern(UnicodeString(o.pat, o.patLen));
 +        setPattern(o.pat, o.patLen);
      }
      return *this;
  }
 @@ -357,7 +301,8 @@ UBool UnicodeSet::operator==(const UnicodeSet& o) const {
      for (int32_t i = 0; i < len; ++i) {
          if (list[i] != o.list[i]) return FALSE;
      }
 -    if (*strings != *o.strings) return FALSE;
 +    if (hasStrings() != o.hasStrings()) { return FALSE; }
 +    if (hasStrings() && *strings != *o.strings) return FALSE;
      return TRUE;
  }

 @@ -393,7 +338,7 @@ int32_t UnicodeSet::size(void) const {
      for (int32_t i = 0; i < count; ++i) {
          n += getRangeEnd(i) - getRangeStart(i) + 1;
      }
 -    return n + strings->size();
 +    return n + stringsSize();
  }

  /**
 @@ -402,7 +347,7 @@ int32_t UnicodeSet::size(void) const {
   * @return <tt>true</tt> if this set contains no elements.
   */
  UBool UnicodeSet::isEmpty(void) const {
 -    return len == 1 && strings->size() == 0;
 +    return len == 1 && !hasStrings();
  }

  /**
 @@ -502,7 +447,7 @@ UBool UnicodeSet::contains(const UnicodeString& s) const {
      if (s.length() == 0) return FALSE;
      int32_t cp = getSingleCP(s);
      if (cp < 0) {
 -        return strings->contains((void*) &s);
 +        return stringsContains(s);
      } else {
          return contains((UChar32) cp);
      }
 @@ -524,8 +469,7 @@ UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
              return FALSE;
          }
      }
 -    if (!strings->containsAll(*c.strings)) return FALSE;
 -    return TRUE;
 +    return !c.hasStrings() || (strings != nullptr && strings->containsAll(*c.strings));
  }

  /**
 @@ -571,8 +515,7 @@ UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
              return FALSE;
          }
      }
 -    if (!strings->containsNone(*c.strings)) return FALSE;
 -    return TRUE;
 +    return strings == nullptr || !c.hasStrings() || strings->containsNone(*c.strings);
  }

  /**
 @@ -613,7 +556,7 @@ UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
              return TRUE;
          }
      }
 -    if (strings->size() != 0) {
 +    if (hasStrings()) {
          for (i=0; i<strings->size(); ++i) {
              const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
              //if (s.length() == 0) {
 @@ -648,7 +591,7 @@ UMatchDegree UnicodeSet::matches(const Replaceable& text,
              return U_MISMATCH;
          }
      } else {
 -        if (strings->size() != 0) { // try strings first
 +        if (hasStrings()) { // try strings first

              // might separate forward and backward loops later
              // for now they are combined
 @@ -849,7 +792,39 @@ UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
   */
  UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
      if (pinCodePoint(start) < pinCodePoint(end)) {
 -        UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
 +        UChar32 limit = end + 1;
 +        // Fast path for adding a new range after the last one.
 +        // Odd list length: [..., lastStart, lastLimit, HIGH]
 +        if ((len & 1) != 0) {
 +            // If the list is empty, set lastLimit low enough to not be adjacent to 0.
 +            UChar32 lastLimit = len == 1 ? -2 : list[len - 2];
 +            if (lastLimit <= start && !isFrozen() && !isBogus()) {
 +                if (lastLimit == start) {
 +                    // Extend the last range.
 +                    list[len - 2] = limit;
 +                    if (limit == UNICODESET_HIGH) {
 +                        --len;
 +                    }
 +                } else {
 +                    list[len - 1] = start;
 +                    if (limit < UNICODESET_HIGH) {
 +                        if (ensureCapacity(len + 2)) {
 +                            list[len++] = limit;
 +                            list[len++] = UNICODESET_HIGH;
 +                        }
 +                    } else {  // limit == UNICODESET_HIGH
 +                        if (ensureCapacity(len + 1)) {
 +                            list[len++] = UNICODESET_HIGH;
 +                        }
 +                    }
 +                }
 +                releasePattern();
 +                return *this;
 +            }
 +        }
 +        // This is slow. Could be much faster using findCodePoint(start)
 +        // and modifying the list, dealing with adjacent & overlapping ranges.
 +        UChar32 range[3] = { start, limit, UNICODESET_HIGH };
          add(range, 2, 0);
      } else if (start == end) {
          add(start);
 @@ -918,9 +893,7 @@ UnicodeSet& UnicodeSet::add(UChar32 c) {
          list[i] = c;
          // if we touched the HIGH mark, then add a new one
          if (c == (UNICODESET_HIGH - 1)) {
 -            UErrorCode status = U_ZERO_ERROR;
 -            ensureCapacity(len+1, status);
 -            if (U_FAILURE(status)) {
 +            if (!ensureCapacity(len+1)) {
                  // ensureCapacity will mark the object as Bogus if OOM failure happens.
                  return *this;
              }
 @@ -964,21 +937,13 @@ UnicodeSet& UnicodeSet::add(UChar32 c) {
          //                             ^
          //                             list[i]

 -        UErrorCode status = U_ZERO_ERROR;
 -        ensureCapacity(len+2, status);
 -        if (U_FAILURE(status)) {
 +        if (!ensureCapacity(len+2)) {
              // ensureCapacity will mark the object as Bogus if OOM failure happens.
              return *this;
          }

 -        //for (int32_t k=len-1; k>=i; --k) {
 -        //    list[k+2] = list[k];
 -        //}
 -        UChar32* src = list + len;
 -        UChar32* dst = src + 2;
 -        UChar32* srclimit = list + i;
 -        while (src > srclimit) *(--dst) = *(--src);
 -
 +        UChar32 *p = list + i;
 +        uprv_memmove(p + 2, p, (len - i) * sizeof(*p));
          list[i] = c;
          list[i+1] = c+1;
          len += 2;
 @@ -1014,7 +979,7 @@ UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
      if (s.length() == 0 || isFrozen() || isBogus()) return *this;
      int32_t cp = getSingleCP(s);
      if (cp < 0) {
 -        if (!strings->contains((void*) &s)) {
 +        if (!stringsContains(s)) {
              _add(s);
              releasePattern();
          }
 @@ -1033,12 +998,16 @@ void UnicodeSet::_add(const UnicodeString& s) {
      if (isFrozen() || isBogus()) {
          return;
      }
 +    UErrorCode ec = U_ZERO_ERROR;
 +    if (strings == nullptr && !allocateStrings(ec)) {
 +        setToBogus();
 +        return;
 +    }
      UnicodeString* t = new UnicodeString(s);
      if (t == NULL) { // Check for memory allocation error.
          setToBogus();
          return;
      }
 -    UErrorCode ec = U_ZERO_ERROR;
      strings->sortedInsert(t, compareUnicodeString, ec);
      if (U_FAILURE(ec)) {
          setToBogus();
 @@ -1121,7 +1090,10 @@ UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
  }

  UnicodeSet& UnicodeSet::removeAllStrings() {
 -    strings->removeAllElements();
 +    if (!isFrozen() && hasStrings()) {
 +        strings->removeAllElements();
 +        releasePattern();
 +    }
      return *this;
  }

 @@ -1217,8 +1189,9 @@ UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
      if (s.length() == 0 || isFrozen() || isBogus()) return *this;
      int32_t cp = getSingleCP(s);
      if (cp < 0) {
 -        strings->removeElement((void*) &s);
 -        releasePattern();
 +        if (strings != nullptr && strings->removeElement((void*) &s)) {
 +            releasePattern();
 +        }
      } else {
          remove((UChar32)cp, (UChar32)cp);
      }
 @@ -1260,24 +1233,17 @@ UnicodeSet& UnicodeSet::complement(void) {
      if (isFrozen() || isBogus()) {
          return *this;
      }
 -    UErrorCode status = U_ZERO_ERROR;
      if (list[0] == UNICODESET_LOW) {
 -        ensureBufferCapacity(len-1, status);
 -        if (U_FAILURE(status)) {
 -            return *this;
 -        }
 -        uprv_memcpy(buffer, list + 1, (size_t)(len-1)*sizeof(UChar32));
 +        uprv_memmove(list, list + 1, (size_t)(len-1)*sizeof(UChar32));
          --len;
      } else {
 -        ensureBufferCapacity(len+1, status);
 -        if (U_FAILURE(status)) {
 +        if (!ensureCapacity(len+1)) {
              return *this;
          }
 -        uprv_memcpy(buffer + 1, list, (size_t)len*sizeof(UChar32));
 -        buffer[0] = UNICODESET_LOW;
 +        uprv_memmove(list + 1, list, (size_t)len*sizeof(UChar32));
 +        list[0] = UNICODESET_LOW;
          ++len;
      }
 -    swapBuffers();
      releasePattern();
      return *this;
  }
 @@ -1294,7 +1260,7 @@ UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
      if (s.length() == 0 || isFrozen() || isBogus()) return *this;
      int32_t cp = getSingleCP(s);
      if (cp < 0) {
 -        if (strings->contains((void*) &s)) {
 +        if (stringsContains(s)) {
              strings->removeElement((void*) &s);
          } else {
              _add(s);
 @@ -1325,7 +1291,7 @@ UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
      if ( c.strings!=NULL ) {
          for (int32_t i=0; i<c.strings->size(); ++i) {
              const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
 -            if (!strings->contains((void*) s)) {
 +            if (!stringsContains(*s)) {
                  _add(*s);
              }
          }
 @@ -1347,7 +1313,13 @@ UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
          return *this;
      }
      retain(c.list, c.len, 0);
 -    strings->retainAll(*c.strings);
 +    if (hasStrings()) {
 +        if (!c.hasStrings()) {
 +            strings->removeAllElements();
 +        } else {
 +            strings->retainAll(*c.strings);
 +        }
 +    }
      return *this;
  }

 @@ -1365,7 +1337,9 @@ UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
          return *this;
      }
      retain(c.list, c.len, 2);
 -    strings->removeAll(*c.strings);
 +    if (hasStrings() && c.hasStrings()) {
 +        strings->removeAll(*c.strings);
 +    }
      return *this;
  }

 @@ -1383,10 +1357,12 @@ UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
      }
      exclusiveOr(c.list, c.len, 0);

 -    for (int32_t i=0; i<c.strings->size(); ++i) {
 -        void* e = c.strings->elementAt(i);
 -        if (!strings->removeElement(e)) {
 -            _add(*(const UnicodeString*)e);
 +    if (c.strings != nullptr) {
 +        for (int32_t i=0; i<c.strings->size(); ++i) {
 +            void* e = c.strings->elementAt(i);
 +            if (strings == nullptr || !strings->removeElement(e)) {
 +                _add(*(const UnicodeString*)e);
 +            }
          }
      }
      return *this;
 @@ -1400,18 +1376,14 @@ UnicodeSet& UnicodeSet::clear(void) {
      if (isFrozen()) {
          return *this;
      }
 -    if (list != NULL) {
 -        list[0] = UNICODESET_HIGH;
 -    }
 +    list[0] = UNICODESET_HIGH;
      len = 1;
      releasePattern();
      if (strings != NULL) {
          strings->removeAllElements();
      }
 -    if (list != NULL && strings != NULL) {
 -        // Remove bogus
 -        fFlags = 0;
 -    }
 +    // Reset all flags; this also clears the bogus state.
 +    fFlags = 0;
      return *this;
  }

 @@ -1445,10 +1417,6 @@ UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
      return list[index*2 + 1] - 1;
  }

 -int32_t UnicodeSet::getStringCount() const {
 -    return strings->size();
 -}
 -
  const UnicodeString* UnicodeSet::getString(int32_t index) const {
      return (const UnicodeString*) strings->elementAt(index);
  }
 @@ -1462,22 +1430,32 @@ UnicodeSet& UnicodeSet::compact() {
          return *this;
      }
      // Delete buffer first to defragment memory less.
 -    if (buffer != NULL) {
 +    if (buffer != stackList) {
          uprv_free(buffer);
          buffer = NULL;
 -    }
 -    if (len < capacity) {
 -        // Make the capacity equal to len or 1.
 -        // We don't want to realloc of 0 size.
 -        int32_t newCapacity = len + (len == 0);
 -        UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity);
 +        bufferCapacity = 0;
 +    }
 +    if (list == stackList) {
 +        // list already points at stackList, so there is nothing to free or shrink.
 +    } else if (len <= INITIAL_CAPACITY) {
 +        uprv_memcpy(stackList, list, len * sizeof(UChar32));
 +        uprv_free(list);
 +        list = stackList;
 +        capacity = INITIAL_CAPACITY;
 +    } else if ((len + 7) < capacity) {
 +        // If we have more than a little unused capacity, shrink it to len.
 +        UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * len);
          if (temp) {
              list = temp;
 -            capacity = newCapacity;
 +            capacity = len;
          }
          // else what the heck happened?! We allocated less memory!
          // Oh well. We'll keep our original array.
      }
 +    if (strings != nullptr && strings->isEmpty()) {
 +        delete strings;
 +        strings = nullptr;
 +    }
      return *this;
  }

 @@ -1488,10 +1466,8 @@ UnicodeSet& UnicodeSet::compact() {
  /**
   * Deserialize constructor.
   */
 -UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, UErrorCode &ec)
 -  : len(1), capacity(1+START_EXTRA), list(0), bmpSet(0), buffer(0),
 -    bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
 -    fFlags(0) {
 +UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization,
 +                       UErrorCode &ec) {

    if(U_FAILURE(ec)) {
      setToBogus();
 @@ -1506,24 +1482,15 @@ UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization se
      return;
    }

 -  allocateStrings(ec);
 -  if (U_FAILURE(ec)) {
 -    setToBogus();
 -    return;
 -  }
 -
    // bmp?
    int32_t headerSize = ((data[0]&0x8000)) ?2:1;
    int32_t bmpLength = (headerSize==1)?data[0]:data[1];

 -  len = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength;
 +  int32_t newLength = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength;
  #ifdef DEBUG_SERIALIZE
 -  printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,len, data[0],data[1],data[2],data[3]);
 +  printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,newLength, data[0],data[1],data[2],data[3]);
  #endif
 -  capacity = len+1;
 -  list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 -  if(!list || U_FAILURE(ec)) {
 -    setToBogus();
 +  if(!ensureCapacity(newLength + 1)) {  // +1 for HIGH
      return;
    }
    // copy bmp
 @@ -1535,15 +1502,18 @@ UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization se
  #endif
    }
    // copy smp
 -  for(i=bmpLength;i<len;i++) {
 +  for(i=bmpLength;i<newLength;i++) {
      list[i] = ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+0] << 16) +
                ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+1]);
  #ifdef DEBUG_SERIALIZE
      printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]);
  #endif
    }
 -  // terminator
 -  list[len++]=UNICODESET_HIGH;
 +  U_ASSERT(i == newLength);
 +  if (i == 0 || list[i - 1] != UNICODESET_HIGH) {
 +    list[i++] = UNICODESET_HIGH;
 +  }
 +  len = i;
  }


 @@ -1664,33 +1634,65 @@ UBool UnicodeSet::allocateStrings(UErrorCode &status) {
      return TRUE;
  }

 -void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) {
 +int32_t UnicodeSet::nextCapacity(int32_t minCapacity) {
 +    // Grow exponentially to reduce the frequency of allocations.
 +    if (minCapacity < INITIAL_CAPACITY) {
 +        return minCapacity + INITIAL_CAPACITY;
 +    } else if (minCapacity <= 2500) {
 +        return 5 * minCapacity;
 +    } else {
 +        int32_t newCapacity = 2 * minCapacity;
 +        if (newCapacity > MAX_LENGTH) {
 +            newCapacity = MAX_LENGTH;
 +        }
 +        return newCapacity;
 +    }
 +}
 +
 +bool UnicodeSet::ensureCapacity(int32_t newLen) {
 +    if (newLen > MAX_LENGTH) {
 +        newLen = MAX_LENGTH;
 +    }
      if (newLen <= capacity) {
 -        return;
 +        return true;
      }
 -    UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA));
 +    int32_t newCapacity = nextCapacity(newLen);
 +    UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32));
      if (temp == NULL) {
 -        ec = U_MEMORY_ALLOCATION_ERROR;
          setToBogus(); // set the object to bogus state if an OOM failure occurred.
 -        return;
 +        return false;
 +    }
 +    // Copy only the actual contents.
 +    uprv_memcpy(temp, list, len * sizeof(UChar32));
 +    if (list != stackList) {
 +        uprv_free(list);
      }
      list = temp;
 -    capacity = newLen + GROW_EXTRA;
 -    // else we keep the original contents on the memory failure.
 +    capacity = newCapacity;
 +    return true;
  }

 -void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) {
 -    if (buffer != NULL && newLen <= bufferCapacity)
 -        return;
 -    UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA));
 +bool UnicodeSet::ensureBufferCapacity(int32_t newLen) {
 +    if (newLen > MAX_LENGTH) {
 +        newLen = MAX_LENGTH;
 +    }
 +    if (newLen <= bufferCapacity) {
 +        return true;
 +    }
 +    int32_t newCapacity = nextCapacity(newLen);
 +    UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32));
      if (temp == NULL) {
 -        ec = U_MEMORY_ALLOCATION_ERROR;
          setToBogus();
 -        return;
 +        return false;
 +    }
 +    // The buffer has no contents to be copied.
 +    // It is always filled from scratch after this call.
 +    if (buffer != stackList) {
 +        uprv_free(buffer);
      }
      buffer = temp;
 -    bufferCapacity = newLen + GROW_EXTRA;
 -    // else we keep the original contents on the memory failure.
 +    bufferCapacity = newCapacity;
 +    return true;
  }

  /**
 @@ -1727,9 +1729,7 @@ void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t pola
      if (isFrozen() || isBogus()) {
          return;
      }
 -    UErrorCode status = U_ZERO_ERROR;
 -    ensureBufferCapacity(len + otherLen, status);
 -    if (U_FAILURE(status)) {
 +    if (!ensureBufferCapacity(len + otherLen)) {
          return;
      }

 @@ -1777,9 +1777,7 @@ void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
      if (isFrozen() || isBogus() || other==NULL) {
          return;
      }
 -    UErrorCode status = U_ZERO_ERROR;
 -    ensureBufferCapacity(len + otherLen, status);
 -    if (U_FAILURE(status)) {
 +    if (!ensureBufferCapacity(len + otherLen)) {
          return;
      }

 @@ -1890,9 +1888,7 @@ void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity)
      if (isFrozen() || isBogus()) {
          return;
      }
 -    UErrorCode status = U_ZERO_ERROR;
 -    ensureBufferCapacity(len + otherLen, status);
 -    if (U_FAILURE(status)) {
 +    if (!ensureBufferCapacity(len + otherLen)) {
          return;
      }

 @@ -2138,12 +2134,14 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
          }
      }

 -    for (int32_t i = 0; i<strings->size(); ++i) {
 -        result.append(OPEN_BRACE);
 -        _appendToPat(result,
 -                     *(const UnicodeString*) strings->elementAt(i),
 -                     escapeUnprintable);
 -        result.append(CLOSE_BRACE);
 +    if (strings != nullptr) {
 +        for (int32_t i = 0; i<strings->size(); ++i) {
 +            result.append(OPEN_BRACE);
 +            _appendToPat(result,
 +                         *(const UnicodeString*) strings->elementAt(i),
 +                         escapeUnprintable);
 +            result.append(CLOSE_BRACE);
 +        }
      }
      return result.append(SET_CLOSE);
  }
 @@ -2162,13 +2160,12 @@ void UnicodeSet::releasePattern() {
  /**
  * Set the new pattern to cache.
  */
 -void UnicodeSet::setPattern(const UnicodeString& newPat) {
 +void UnicodeSet::setPattern(const char16_t *newPat, int32_t newPatLen) {
      releasePattern();
 -    int32_t newPatLen = newPat.length();
      pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar));
      if (pat) {
          patLen = newPatLen;
 -        newPat.extractBetween(0, patLen, pat);
 +        u_memcpy(pat, newPat, patLen);
          pat[patLen] = 0;
      }
      // else we don't care if malloc failed. This was just a nice cache.
 @@ -2177,30 +2174,15 @@ void UnicodeSet::setPattern(const UnicodeString& newPat) {

  UnicodeFunctor *UnicodeSet::freeze() {
      if(!isFrozen() && !isBogus()) {
 -        // Do most of what compact() does before freezing because
 -        // compact() will not work when the set is frozen.
 -        // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
 -
 -        // Delete buffer first to defragment memory less.
 -        if (buffer != NULL) {
 -            uprv_free(buffer);
 -            buffer = NULL;
 -        }
 -        if (capacity > (len + GROW_EXTRA)) {
 -            // Make the capacity equal to len or 1.
 -            // We don't want to realloc of 0 size.
 -            capacity = len + (len == 0);
 -            list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity);
 -            if (list == NULL) { // Check for memory allocation error.
 -                setToBogus();
 -                return this;
 -            }
 -        }
 +        compact();

          // Optimize contains() and span() and similar functions.
 -        if (!strings->isEmpty()) {
 +        if (hasStrings()) {
              stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
 -            if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) {
 +            if (stringSpan == nullptr) {
 +                setToBogus();
 +                return this;
 +            } else if (!stringSpan->needsStringSpanUTF16()) {
                  // All strings are irrelevant for span() etc. because
                  // all of each string's code points are contained in this set.
                  // Do not check needsStringSpanUTF8() because UTF-8 has at most as
 @@ -2233,7 +2215,7 @@ int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanC
      }
      if(stringSpan!=NULL) {
          return stringSpan->span(s, length, spanCondition);
 -    } else if(!strings->isEmpty()) {
 +    } else if(hasStrings()) {
          uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
                              UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
                              UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
 @@ -2270,7 +2252,7 @@ int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition s
      }
      if(stringSpan!=NULL) {
          return stringSpan->spanBack(s, length, spanCondition);
 -    } else if(!strings->isEmpty()) {
 +    } else if(hasStrings()) {
          uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
                              UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
                              UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
 @@ -2308,7 +2290,7 @@ int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition sp
      }
      if(stringSpan!=NULL) {
          return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
 -    } else if(!strings->isEmpty()) {
 +    } else if(hasStrings()) {
          uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
                              UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
                              UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
 @@ -2346,7 +2328,7 @@ int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanConditio
      }
      if(stringSpan!=NULL) {
          return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
 -    } else if(!strings->isEmpty()) {
 +    } else if(hasStrings()) {
          uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
                              UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
                              UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
 diff --git a/source/common/uniset_closure.cpp b/source/common/uniset_closure.cpp
 index 0b7da796..882231ba 100644
 --- a/source/common/uniset_closure.cpp
 +++ b/source/common/uniset_closure.cpp
 @@ -31,10 +31,6 @@
  #include "util.h"
  #include "uvector.h"

 -// initial storage. Must be >= 0
 -// *** same as in uniset.cpp ! ***
 -#define START_EXTRA 16
 -
  U_NAMESPACE_BEGIN

  // TODO memory debugging provided inside uniset.cpp
 @@ -49,42 +45,16 @@ U_NAMESPACE_BEGIN
  UnicodeSet::UnicodeSet(const UnicodeString& pattern,
                         uint32_t options,
                         const SymbolTable* symbols,
 -                       UErrorCode& status) :
 -    len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
 -    bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
 -    fFlags(0)
 -{
 -    if(U_SUCCESS(status)){
 -        list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 -        /* test for NULL */
 -        if(list == NULL) {
 -            status = U_MEMORY_ALLOCATION_ERROR;
 -        }else{
 -            allocateStrings(status);
 -            applyPattern(pattern, options, symbols, status);
 -        }
 -    }
 +                       UErrorCode& status) {
 +    applyPattern(pattern, options, symbols, status);
      _dbgct(this);
  }

  UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
                         uint32_t options,
                         const SymbolTable* symbols,
 -                       UErrorCode& status) :
 -    len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
 -    bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
 -    fFlags(0)
 -{
 -    if(U_SUCCESS(status)){
 -        list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 -        /* test for NULL */
 -        if(list == NULL) {
 -            status = U_MEMORY_ALLOCATION_ERROR;
 -        }else{
 -            allocateStrings(status);
 -            applyPattern(pattern, pos, options, symbols, status);
 -        }
 -    }
 +                       UErrorCode& status) {
 +    applyPattern(pattern, pos, options, symbols, status);
      _dbgct(this);
  }

 @@ -199,7 +169,7 @@ UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
              // start with input set to guarantee inclusion
              // USET_CASE: remove strings because the strings will actually be reduced (folded);
              //            therefore, start with no strings and add only those needed
 -            if (attribute & USET_CASE_INSENSITIVE) {
 +            if ((attribute & USET_CASE_INSENSITIVE) && foldSet.hasStrings()) {
                  foldSet.strings->removeAllElements();
              }

 @@ -234,7 +204,7 @@ UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
                      }
                  }
              }
 -            if (strings != NULL && strings->size() > 0) {
 +            if (hasStrings()) {
                  if (attribute & USET_CASE_INSENSITIVE) {
                      for (int32_t j=0; j<strings->size(); ++j) {
                          str = *(const UnicodeString *) strings->elementAt(j);
 diff --git a/source/common/uniset_props.cpp b/source/common/uniset_props.cpp
 index 6cfd80a7..e98c175f 100644
 --- a/source/common/uniset_props.cpp
 +++ b/source/common/uniset_props.cpp
 @@ -47,10 +47,6 @@

  U_NAMESPACE_USE

 -// initial storage. Must be >= 0
 -// *** same as in uniset.cpp ! ***
 -#define START_EXTRA 16
 -
  // Define UChar constants using hex for EBCDIC compatibility
  // Used #define to reduce private static exports and memory access time.
  #define SET_OPEN        ((UChar)0x005B) /*[*/
 @@ -185,21 +181,8 @@ isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
   * @param pattern a string specifying what characters are in the set
   */
  UnicodeSet::UnicodeSet(const UnicodeString& pattern,
 -                       UErrorCode& status) :
 -    len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
 -    bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
 -    fFlags(0)
 -{
 -    if(U_SUCCESS(status)){
 -        list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 -        /* test for NULL */
 -        if(list == NULL) {
 -            status = U_MEMORY_ALLOCATION_ERROR;
 -        }else{
 -            allocateStrings(status);
 -            applyPattern(pattern, status);
 -        }
 -    }
 +                       UErrorCode& status) {
 +    applyPattern(pattern, status);
      _dbgct(this);
  }

 @@ -713,6 +696,11 @@ static UBool numericValueFilter(UChar32 ch, void* context) {
      return u_getNumericValue(ch) == *(double*)context;
  }

 +static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
 +    int32_t value = *(int32_t*)context;
 +    return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
 +}
 +
  static UBool versionFilter(UChar32 ch, void* context) {
      static const UVersionInfo none = { 0, 0, 0, 0 };
      UVersionInfo v;
 @@ -721,6 +709,16 @@ static UBool versionFilter(UChar32 ch, void* context) {
      return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
  }

 +typedef struct {
 +    UProperty prop;
 +    int32_t value;
 +} IntPropertyContext;
 +
 +static UBool intPropertyFilter(UChar32 ch, void* context) {
 +    IntPropertyContext* c = (IntPropertyContext*)context;
 +    return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
 +}
 +
  static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
      return uscript_hasScript(ch, *(UScriptCode*)context);
  }
 @@ -781,43 +779,6 @@ void UnicodeSet::applyFilter(UnicodeSet::Filter filter,

  namespace {

 -/** Maps map values to 1 if the mask contains their value'th bit, all others to 0. */
 -uint32_t U_CALLCONV generalCategoryMaskFilter(const void *context, uint32_t value) {
 -    uint32_t mask = *(const uint32_t *)context;
 -    value = U_MASK(value) & mask;
 -    if (value != 0) { value = 1; }
 -    return value;
 -}
 -
 -/** Maps one map value to 1, all others to 0. */
 -uint32_t U_CALLCONV intValueFilter(const void *context, uint32_t value) {
 -    uint32_t v = *(const uint32_t *)context;
 -    return value == v ? 1 : 0;
 -}
 -
 -}  // namespace
 -
 -void UnicodeSet::applyIntPropertyValue(const UCPMap *map,
 -                                       UCPMapValueFilter *filter, const void *context,
 -                                       UErrorCode &errorCode) {
 -    if (U_FAILURE(errorCode)) { return; }
 -    clear();
 -    UChar32 start = 0, end;
 -    uint32_t value;
 -    while ((end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0,
 -                                  filter, context, &value)) >= 0) {
 -        if (value != 0) {
 -            add(start, end);
 -        }
 -        start = end + 1;
 -    }
 -    if (isBogus()) {
 -        errorCode = U_MEMORY_ALLOCATION_ERROR;
 -    }
 -}
 -
 -namespace {
 -
  static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
      /* Note: we use ' ' in compiler code page */
      int32_t j = 0;
 @@ -845,11 +806,10 @@ static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {

  UnicodeSet&
  UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
 -    if (U_FAILURE(ec)) { return *this; }
 -    // All of the following check isFrozen() before modifying this set.
 +    if (U_FAILURE(ec) || isFrozen()) { return *this; }
      if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
 -        const UCPMap *map = u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY, &ec);
 -        applyIntPropertyValue(map, generalCategoryMaskFilter, &value, ec);
 +        const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
 +        applyFilter(generalCategoryMaskFilter, &value, inclusions, ec);
      } else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
          const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
          UScriptCode script = (UScriptCode)value;
 @@ -866,14 +826,11 @@ UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec)
              clear();
          }
      } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
 -        const UCPMap *map = u_getIntPropertyMap(prop, &ec);
 -        applyIntPropertyValue(map, intValueFilter, &value, ec);
 +        const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
 +        IntPropertyContext c = {prop, value};
 +        applyFilter(intPropertyFilter, &c, inclusions, ec);
      } else {
 -        // This code used to always call getInclusions(property source)
 -        // which sets an error for an unsupported property.
          ec = U_ILLEGAL_ARGUMENT_ERROR;
 -        // Otherwise we would just clear() this set because
 -        // getIntPropertyValue(c, prop) returns 0 for all code points.
      }
      return *this;
  }
 diff --git a/source/common/uprops.h b/source/common/uprops.h
 index 1a8e4e84..34b3600b 100644
 --- a/source/common/uprops.h
 +++ b/source/common/uprops.h
 @@ -462,7 +462,6 @@ class UnicodeSet;
  class CharacterProperties {
  public:
      CharacterProperties() = delete;
 -    static void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode);
      static const UnicodeSet *getInclusionsForProperty(UProperty prop, UErrorCode &errorCode);
  };

 diff --git a/source/common/uset.cpp b/source/common/uset.cpp
 index 39ad0a34..eae7981d 100644
 --- a/source/common/uset.cpp
 +++ b/source/common/uset.cpp
 @@ -249,7 +249,7 @@ class USetAccess /* not : public UObject because all methods are static */ {
  public:
      /* Try to have the compiler inline these*/
      inline static int32_t getStringCount(const UnicodeSet& set) {
 -        return set.getStringCount();
 +        return set.stringsSize();
      }
      inline static const UnicodeString* getString(const UnicodeSet& set,
                                                   int32_t i) {
 diff --git a/source/common/usetiter.cpp b/source/common/usetiter.cpp
 index 93048ba2..79151690 100644
 --- a/source/common/usetiter.cpp
 +++ b/source/common/usetiter.cpp
 @@ -116,7 +116,7 @@ void UnicodeSetIterator::reset() {
          stringCount = 0;
      } else {
          endRange = set->getRangeCount() - 1;
 -        stringCount = set->strings->size();
 +        stringCount = set->stringsSize();
      }
      range = 0;
      endElement = -1;