| diff --git a/source/common/characterproperties.cpp b/source/common/characterproperties.cpp |
| index 3aff85b3..b416ef52 100644 |
| --- a/source/common/characterproperties.cpp |
| +++ b/source/common/characterproperties.cpp |
| @@ -23,6 +23,9 @@ |
| #include "umutex.h" |
| #include "uprops.h" |
| |
| +using icu::LocalPointer; |
| +using icu::Normalizer2Factory; |
| +using icu::Normalizer2Impl; |
| using icu::UInitOnce; |
| using icu::UnicodeSet; |
| |
| @@ -30,11 +33,13 @@ namespace { |
| |
| UBool U_CALLCONV characterproperties_cleanup(); |
| |
| +constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + UCHAR_INT_LIMIT - UCHAR_INT_START; |
| + |
| struct Inclusion { |
| UnicodeSet *fSet; |
| UInitOnce fInitOnce; |
| }; |
| -Inclusion gInclusions[UPROPS_SRC_COUNT]; // cached getInclusions() |
| +Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions() |
| |
| UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {}; |
| |
| @@ -80,35 +85,22 @@ UBool U_CALLCONV characterproperties_cleanup() { |
| return TRUE; |
| } |
| |
| -} // namespace |
| - |
| -U_NAMESPACE_BEGIN |
| - |
| -/* |
| -Reduce excessive reallocation, and make it easier to detect initialization problems. |
| -Usually you don't see smaller sets than this for Unicode 5.0. |
| -*/ |
| -constexpr int32_t DEFAULT_INCLUSION_CAPACITY = 3072; |
| - |
| -void U_CALLCONV CharacterProperties::initInclusion(UPropertySource src, UErrorCode &errorCode) { |
| +void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) { |
| // This function is invoked only via umtx_initOnce(). |
| - // This function is a friend of class UnicodeSet. |
| - |
| U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT); |
| if (src == UPROPS_SRC_NONE) { |
| errorCode = U_INTERNAL_PROGRAM_ERROR; |
| return; |
| } |
| - UnicodeSet * &incl = gInclusions[src].fSet; |
| - U_ASSERT(incl == nullptr); |
| + U_ASSERT(gInclusions[src].fSet == nullptr); |
| |
| - incl = new UnicodeSet(); |
| - if (incl == nullptr) { |
| + LocalPointer<UnicodeSet> incl(new UnicodeSet()); |
| + if (incl.isNull()) { |
| errorCode = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| USetAdder sa = { |
| - (USet *)incl, |
| + (USet *)incl.getAlias(), |
| _set_add, |
| _set_addRange, |
| _set_addString, |
| @@ -116,7 +108,6 @@ void U_CALLCONV CharacterProperties::initInclusion(UPropertySource src, UErrorCo |
| nullptr // don't need removeRange() |
| }; |
| |
| - incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, errorCode); |
| switch(src) { |
| case UPROPS_SRC_CHAR: |
| uchar_addPropertyStarts(&sa, &errorCode); |
| @@ -183,12 +174,15 @@ void U_CALLCONV CharacterProperties::initInclusion(UPropertySource src, UErrorCo |
| } |
| |
| if (U_FAILURE(errorCode)) { |
| - delete incl; |
| - incl = nullptr; |
| return; |
| } |
| - // Compact for caching |
| + if (incl->isBogus()) { |
| + errorCode = U_MEMORY_ALLOCATION_ERROR; |
| + return; |
| + } |
| + // Compact for caching. |
| incl->compact(); |
| + gInclusions[src].fSet = incl.orphan(); |
| ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup); |
| } |
| |
| @@ -199,15 +193,66 @@ const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorC |
| return nullptr; |
| } |
| Inclusion &i = gInclusions[src]; |
| - umtx_initOnce(i.fInitOnce, &CharacterProperties::initInclusion, src, errorCode); |
| + umtx_initOnce(i.fInitOnce, &initInclusion, src, errorCode); |
| return i.fSet; |
| } |
| |
| +void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) { |
| + // This function is invoked only via umtx_initOnce(). |
| + U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT); |
| + int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START; |
| + U_ASSERT(gInclusions[inclIndex].fSet == nullptr); |
| + UPropertySource src = uprops_getSource(prop); |
| + const UnicodeSet *incl = getInclusionsForSource(src, errorCode); |
| + if (U_FAILURE(errorCode)) { |
| + return; |
| + } |
| + |
| + LocalPointer<UnicodeSet> intPropIncl(new UnicodeSet(0, 0)); |
| + if (intPropIncl.isNull()) { |
| + errorCode = U_MEMORY_ALLOCATION_ERROR; |
| + return; |
| + } |
| + int32_t numRanges = incl->getRangeCount(); |
| + int32_t prevValue = 0; |
| + for (int32_t i = 0; i < numRanges; ++i) { |
| + UChar32 rangeEnd = incl->getRangeEnd(i); |
| + for (UChar32 c = incl->getRangeStart(i); c <= rangeEnd; ++c) { |
| + // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch. |
| + int32_t value = u_getIntPropertyValue(c, prop); |
| + if (value != prevValue) { |
| + intPropIncl->add(c); |
| + prevValue = value; |
| + } |
| + } |
| + } |
| + |
| + if (intPropIncl->isBogus()) { |
| + errorCode = U_MEMORY_ALLOCATION_ERROR; |
| + return; |
| + } |
| + // Compact for caching. |
| + intPropIncl->compact(); |
| + gInclusions[inclIndex].fSet = intPropIncl.orphan(); |
| + ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup); |
| +} |
| + |
| +} // namespace |
| + |
| +U_NAMESPACE_BEGIN |
| + |
| const UnicodeSet *CharacterProperties::getInclusionsForProperty( |
| UProperty prop, UErrorCode &errorCode) { |
| if (U_FAILURE(errorCode)) { return nullptr; } |
| - UPropertySource src = uprops_getSource(prop); |
| - return getInclusionsForSource(src, errorCode); |
| + if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) { |
| + int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START; |
| + Inclusion &i = gInclusions[inclIndex]; |
| + umtx_initOnce(i.fInitOnce, &initIntPropInclusion, prop, errorCode); |
| + return i.fSet; |
| + } else { |
| + UPropertySource src = uprops_getSource(prop); |
| + return getInclusionsForSource(src, errorCode); |
| + } |
| } |
| |
| U_NAMESPACE_END |
| @@ -216,7 +261,7 @@ namespace { |
| |
| UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) { |
| if (U_FAILURE(errorCode)) { return nullptr; } |
| - icu::LocalPointer<UnicodeSet> set(new UnicodeSet()); |
| + LocalPointer<UnicodeSet> set(new UnicodeSet()); |
| if (set.isNull()) { |
| errorCode = U_MEMORY_ALLOCATION_ERROR; |
| return nullptr; |
| diff --git a/source/common/ucptrie.cpp b/source/common/ucptrie.cpp |
| index 13496ad5..b72e3183 100644 |
| --- a/source/common/ucptrie.cpp |
| +++ b/source/common/ucptrie.cpp |
| @@ -280,7 +280,7 @@ UChar32 getRange(const void *t, UChar32 start, |
| int32_t prevI3Block = -1; |
| int32_t prevBlock = -1; |
| UChar32 c = start; |
| - uint32_t value; |
| + uint32_t trieValue, value; |
| bool haveValue = false; |
| do { |
| int32_t i3Block; |
| @@ -319,6 +319,7 @@ UChar32 getRange(const void *t, UChar32 start, |
| return c - 1; |
| } |
| } else { |
| + trieValue = trie->nullValue; |
| value = nullValue; |
| if (pValue != nullptr) { *pValue = nullValue; } |
| haveValue = true; |
| @@ -357,6 +358,7 @@ UChar32 getRange(const void *t, UChar32 start, |
| return c - 1; |
| } |
| } else { |
| + trieValue = trie->nullValue; |
| value = nullValue; |
| if (pValue != nullptr) { *pValue = nullValue; } |
| haveValue = true; |
| @@ -364,23 +366,32 @@ UChar32 getRange(const void *t, UChar32 start, |
| c = (c + dataBlockLength) & ~dataMask; |
| } else { |
| int32_t di = block + (c & dataMask); |
| - uint32_t value2 = getValue(trie->data, valueWidth, di); |
| - value2 = maybeFilterValue(value2, trie->nullValue, nullValue, |
| - filter, context); |
| + uint32_t trieValue2 = getValue(trie->data, valueWidth, di); |
| if (haveValue) { |
| - if (value2 != value) { |
| - return c - 1; |
| + if (trieValue2 != trieValue) { |
| + if (filter == nullptr || |
| + maybeFilterValue(trieValue2, trie->nullValue, nullValue, |
| + filter, context) != value) { |
| + return c - 1; |
| + } |
| + trieValue = trieValue2; // may or may not help |
| } |
| } else { |
| - value = value2; |
| + trieValue = trieValue2; |
| + value = maybeFilterValue(trieValue2, trie->nullValue, nullValue, |
| + filter, context); |
| if (pValue != nullptr) { *pValue = value; } |
| haveValue = true; |
| } |
| while ((++c & dataMask) != 0) { |
| - if (maybeFilterValue(getValue(trie->data, valueWidth, ++di), |
| - trie->nullValue, nullValue, |
| - filter, context) != value) { |
| - return c - 1; |
| + trieValue2 = getValue(trie->data, valueWidth, ++di); |
| + if (trieValue2 != trieValue) { |
| + if (filter == nullptr || |
| + maybeFilterValue(trieValue2, trie->nullValue, nullValue, |
| + filter, context) != value) { |
| + return c - 1; |
| + } |
| + trieValue = trieValue2; // may or may not help |
| } |
| } |
| } |
| diff --git a/source/common/umutablecptrie.cpp b/source/common/umutablecptrie.cpp |
| index 44af8309..926be468 100644 |
| --- a/source/common/umutablecptrie.cpp |
| +++ b/source/common/umutablecptrie.cpp |
| @@ -304,41 +304,56 @@ UChar32 MutableCodePointTrie::getRange( |
| uint32_t nullValue = initialValue; |
| if (filter != nullptr) { nullValue = filter(context, nullValue); } |
| UChar32 c = start; |
| - uint32_t value; |
| + uint32_t trieValue, value; |
| bool haveValue = false; |
| int32_t i = c >> UCPTRIE_SHIFT_3; |
| do { |
| if (flags[i] == ALL_SAME) { |
| - uint32_t value2 = maybeFilterValue(index[i], initialValue, nullValue, |
| - filter, context); |
| + uint32_t trieValue2 = index[i]; |
| if (haveValue) { |
| - if (value2 != value) { |
| - return c - 1; |
| + if (trieValue2 != trieValue) { |
| + if (filter == nullptr || |
| + maybeFilterValue(trieValue2, initialValue, nullValue, |
| + filter, context) != value) { |
| + return c - 1; |
| + } |
| + trieValue = trieValue2; // may or may not help |
| } |
| } else { |
| - value = value2; |
| + trieValue = trieValue2; |
| + value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context); |
| if (pValue != nullptr) { *pValue = value; } |
| haveValue = true; |
| } |
| c = (c + UCPTRIE_SMALL_DATA_BLOCK_LENGTH) & ~UCPTRIE_SMALL_DATA_MASK; |
| } else /* MIXED */ { |
| int32_t di = index[i] + (c & UCPTRIE_SMALL_DATA_MASK); |
| - uint32_t value2 = maybeFilterValue(data[di], initialValue, nullValue, |
| - filter, context); |
| + uint32_t trieValue2 = data[di]; |
| if (haveValue) { |
| - if (value2 != value) { |
| - return c - 1; |
| + if (trieValue2 != trieValue) { |
| + if (filter == nullptr || |
| + maybeFilterValue(trieValue2, initialValue, nullValue, |
| + filter, context) != value) { |
| + return c - 1; |
| + } |
| + trieValue = trieValue2; // may or may not help |
| } |
| } else { |
| - value = value2; |
| + trieValue = trieValue2; |
| + value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context); |
| if (pValue != nullptr) { *pValue = value; } |
| haveValue = true; |
| } |
| while ((++c & UCPTRIE_SMALL_DATA_MASK) != 0) { |
| - if (maybeFilterValue(data[++di], initialValue, nullValue, |
| - filter, context) != value) { |
| - return c - 1; |
| + trieValue2 = data[++di]; |
| + if (trieValue2 != trieValue) { |
| + if (filter == nullptr || |
| + maybeFilterValue(trieValue2, initialValue, nullValue, |
| + filter, context) != value) { |
| + return c - 1; |
| + } |
| } |
| + trieValue = trieValue2; // may or may not help |
| } |
| } |
| ++i; |
| diff --git a/source/common/unicode/uniset.h b/source/common/unicode/uniset.h |
| index 0abc7542..af56b872 100644 |
| --- a/source/common/unicode/uniset.h |
| +++ b/source/common/unicode/uniset.h |
| @@ -27,7 +27,6 @@ U_NAMESPACE_BEGIN |
| |
| // Forward Declarations. |
| class BMPSet; |
| -class CharacterProperties; |
| class ParsePosition; |
| class RBBIRuleScanner; |
| class SymbolTable; |
| @@ -276,14 +275,23 @@ class RuleCharacterIterator; |
| * @stable ICU 2.0 |
| */ |
| class U_COMMON_API UnicodeSet U_FINAL : public UnicodeFilter { |
| +private: |
| + /** |
| + * Enough for sets with few ranges. |
| + * For example, White_Space has 10 ranges, list length 21. |
| + */ |
| + static constexpr int32_t INITIAL_CAPACITY = 25; |
| + // fFlags constant |
| + static constexpr uint8_t kIsBogus = 1; // This set is bogus (i.e. not valid) |
| + |
| + UChar32* list = stackList; // MUST be terminated with HIGH |
| + int32_t capacity = INITIAL_CAPACITY; // capacity of list |
| + int32_t len = 1; // length of list used; 1 <= len <= capacity |
| + uint8_t fFlags = 0; // Bit flag (see constants above) |
| |
| - int32_t len; // length of list used; 0 <= len <= capacity |
| - int32_t capacity; // capacity of list |
| - UChar32* list; // MUST be terminated with HIGH |
| - BMPSet *bmpSet; // The set is frozen iff either bmpSet or stringSpan is not NULL. |
| - UChar32* buffer; // internal buffer, may be NULL |
| - int32_t bufferCapacity; // capacity of buffer |
| - int32_t patLen; |
| + BMPSet *bmpSet = nullptr; // The set is frozen iff either bmpSet or stringSpan is not NULL. |
| + UChar32* buffer = nullptr; // internal buffer, may be NULL |
| + int32_t bufferCapacity = 0; // capacity of buffer |
| |
| /** |
| * The pattern representation of this set. This may not be the |
| @@ -294,15 +302,19 @@ class U_COMMON_API UnicodeSet U_FINAL : public UnicodeFilter { |
| * indicating that toPattern() must generate a pattern |
| * representation from the inversion list. |
| */ |
| - char16_t *pat; |
| - UVector* strings; // maintained in sorted order |
| - UnicodeSetStringSpan *stringSpan; |
| + char16_t *pat = nullptr; |
| + int32_t patLen = 0; |
| + |
| + UVector* strings = nullptr; // maintained in sorted order |
| + UnicodeSetStringSpan *stringSpan = nullptr; |
| + |
| + /** |
| + * Initial list array. |
| + * Avoids some heap allocations, and list is never nullptr. |
| + * Increases the object size a bit. |
| + */ |
| + UChar32 stackList[INITIAL_CAPACITY]; |
| |
| -private: |
| - enum { // constants |
| - kIsBogus = 1 // This set is bogus (i.e. not valid) |
| - }; |
| - uint8_t fFlags; // Bit flag (see constants above) |
| public: |
| /** |
| * Determine if this object contains a valid set. |
| @@ -1480,8 +1492,6 @@ private: |
| |
| friend class USetAccess; |
| |
| - int32_t getStringCount() const; |
| - |
| const UnicodeString* getString(int32_t index) const; |
| |
| //---------------------------------------------------------------- |
| @@ -1528,13 +1538,18 @@ private: |
| // Implementation: Utility methods |
| //---------------------------------------------------------------- |
| |
| - void ensureCapacity(int32_t newLen, UErrorCode& ec); |
| + static int32_t nextCapacity(int32_t minCapacity); |
| + |
| + bool ensureCapacity(int32_t newLen); |
| |
| - void ensureBufferCapacity(int32_t newLen, UErrorCode& ec); |
| + bool ensureBufferCapacity(int32_t newLen); |
| |
| void swapBuffers(void); |
| |
| UBool allocateStrings(UErrorCode &status); |
| + UBool hasStrings() const; |
| + int32_t stringsSize() const; |
| + UBool stringsContains(const UnicodeString &s) const; |
| |
| UnicodeString& _toPattern(UnicodeString& result, |
| UBool escapeUnprintable) const; |
| @@ -1614,7 +1629,6 @@ private: |
| UnicodeString& rebuiltPat, |
| UErrorCode& ec); |
| |
| - friend class CharacterProperties; |
| static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status); |
| |
| /** |
| @@ -1646,7 +1660,10 @@ private: |
| /** |
| * Set the new pattern to cache. |
| */ |
| - void setPattern(const UnicodeString& newPat); |
| + void setPattern(const UnicodeString& newPat) { |
| + setPattern(newPat.getBuffer(), newPat.length()); |
| + } |
| + void setPattern(const char16_t *newPat, int32_t newPatLen); |
| /** |
| * Release existing cached pattern. |
| */ |
| diff --git a/source/common/uniset.cpp b/source/common/uniset.cpp |
| index e8378e0a..20242776 100644 |
| --- a/source/common/uniset.cpp |
| +++ b/source/common/uniset.cpp |
| @@ -14,6 +14,7 @@ |
| #include "unicode/parsepos.h" |
| #include "unicode/symtable.h" |
| #include "unicode/uniset.h" |
| +#include "unicode/ustring.h" |
| #include "unicode/utf8.h" |
| #include "unicode/utf16.h" |
| #include "ruleiter.h" |
| @@ -53,11 +54,8 @@ |
| // LOW <= all valid values. ZERO for codepoints |
| #define UNICODESET_LOW 0x000000 |
| |
| -// initial storage. Must be >= 0 |
| -#define START_EXTRA 16 |
| - |
| -// extra amount for growth. Must be >= 0 |
| -#define GROW_EXTRA START_EXTRA |
| +/** Max list [0, 1, 2, ..., max code point, HIGH] */ |
| +constexpr int32_t MAX_LENGTH = UNICODESET_HIGH + 1; |
| |
| U_NAMESPACE_BEGIN |
| |
| @@ -137,6 +135,18 @@ static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { |
| return a.compare(b); |
| } |
| |
| +UBool UnicodeSet::hasStrings() const { |
| + return strings != nullptr && !strings->isEmpty(); |
| +} |
| + |
| +int32_t UnicodeSet::stringsSize() const { |
| + return strings == nullptr ? 0 : strings->size(); |
| +} |
| + |
| +UBool UnicodeSet::stringsContains(const UnicodeString &s) const { |
| + return strings != nullptr && strings->contains((void*) &s); |
| +} |
| + |
| //---------------------------------------------------------------- |
| // Constructors &c |
| //---------------------------------------------------------------- |
| @@ -144,24 +154,8 @@ static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { |
| /** |
| * Constructs an empty set. |
| */ |
| -UnicodeSet::UnicodeSet() : |
| - len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0), |
| - bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), |
| - fFlags(0) |
| -{ |
| - UErrorCode status = U_ZERO_ERROR; |
| - allocateStrings(status); |
| - if (U_FAILURE(status)) { |
| - setToBogus(); // If memory allocation failed, set to bogus state. |
| - return; |
| - } |
| - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); |
| - if(list!=NULL){ |
| - list[0] = UNICODESET_HIGH; |
| - } else { // If memory allocation failed, set to bogus state. |
| - setToBogus(); |
| - return; |
| - } |
| +UnicodeSet::UnicodeSet() { |
| + list[0] = UNICODESET_HIGH; |
| _dbgct(this); |
| } |
| |
| @@ -172,89 +166,39 @@ UnicodeSet::UnicodeSet() : |
| * @param start first character, inclusive, of range |
| * @param end last character, inclusive, of range |
| */ |
| -UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) : |
| - len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0), |
| - bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), |
| - fFlags(0) |
| -{ |
| - UErrorCode status = U_ZERO_ERROR; |
| - allocateStrings(status); |
| - if (U_FAILURE(status)) { |
| - setToBogus(); // If memory allocation failed, set to bogus state. |
| - return; |
| - } |
| - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); |
| - if(list!=NULL){ |
| - list[0] = UNICODESET_HIGH; |
| - complement(start, end); |
| - } else { // If memory allocation failed, set to bogus state. |
| - setToBogus(); |
| - return; |
| - } |
| +UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) { |
| + list[0] = UNICODESET_HIGH; |
| + add(start, end); |
| _dbgct(this); |
| } |
| |
| /** |
| * Constructs a set that is identical to the given UnicodeSet. |
| */ |
| -UnicodeSet::UnicodeSet(const UnicodeSet& o) : |
| - UnicodeFilter(o), |
| - len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0), |
| - bmpSet(0), |
| - buffer(0), bufferCapacity(0), |
| - patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), |
| - fFlags(0) |
| -{ |
| - UErrorCode status = U_ZERO_ERROR; |
| - allocateStrings(status); |
| - if (U_FAILURE(status)) { |
| - setToBogus(); // If memory allocation failed, set to bogus state. |
| - return; |
| - } |
| - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); |
| - if(list!=NULL){ |
| - *this = o; |
| - } else { // If memory allocation failed, set to bogus state. |
| - setToBogus(); |
| - return; |
| - } |
| +UnicodeSet::UnicodeSet(const UnicodeSet& o) : UnicodeFilter(o) { |
| + *this = o; |
| _dbgct(this); |
| } |
| |
| // Copy-construct as thawed. |
| -UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : |
| - UnicodeFilter(o), |
| - len(0), capacity(o.len + GROW_EXTRA), list(0), |
| - bmpSet(0), |
| - buffer(0), bufferCapacity(0), |
| - patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), |
| - fFlags(0) |
| -{ |
| - UErrorCode status = U_ZERO_ERROR; |
| - allocateStrings(status); |
| - if (U_FAILURE(status)) { |
| - setToBogus(); // If memory allocation failed, set to bogus state. |
| - return; |
| - } |
| - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); |
| - if(list!=NULL){ |
| +UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : UnicodeFilter(o) { |
| + if (ensureCapacity(o.len)) { |
| // *this = o except for bmpSet and stringSpan |
| len = o.len; |
| uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32)); |
| - if (strings != NULL && o.strings != NULL) { |
| - strings->assign(*o.strings, cloneUnicodeString, status); |
| - } else { // Invalid strings. |
| - setToBogus(); |
| - return; |
| + if (o.hasStrings()) { |
| + UErrorCode status = U_ZERO_ERROR; |
| + if (!allocateStrings(status) || |
| + (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) { |
| + setToBogus(); |
| + return; |
| + } |
| } |
| if (o.pat) { |
| - setPattern(UnicodeString(o.pat, o.patLen)); |
| + setPattern(o.pat, o.patLen); |
| } |
| - } else { // If memory allocation failed, set to bogus state. |
| - setToBogus(); |
| - return; |
| + _dbgct(this); |
| } |
| - _dbgct(this); |
| } |
| |
| /** |
| @@ -262,9 +206,11 @@ UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : |
| */ |
| UnicodeSet::~UnicodeSet() { |
| _dbgdt(this); // first! |
| - uprv_free(list); |
| + if (list != stackList) { |
| + uprv_free(list); |
| + } |
| delete bmpSet; |
| - if (buffer) { |
| + if (buffer != stackList) { |
| uprv_free(buffer); |
| } |
| delete strings; |
| @@ -290,32 +236,30 @@ UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) { |
| setToBogus(); |
| return *this; |
| } |
| - UErrorCode ec = U_ZERO_ERROR; |
| - ensureCapacity(o.len, ec); |
| - if (U_FAILURE(ec)) { |
| + if (!ensureCapacity(o.len)) { |
| // ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens. |
| return *this; |
| } |
| len = o.len; |
| uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32)); |
| - if (o.bmpSet == NULL || asThawed) { |
| - bmpSet = NULL; |
| - } else { |
| + if (o.bmpSet != nullptr && !asThawed) { |
| bmpSet = new BMPSet(*o.bmpSet, list, len); |
| if (bmpSet == NULL) { // Check for memory allocation error. |
| setToBogus(); |
| return *this; |
| } |
| } |
| - if (strings != NULL && o.strings != NULL) { |
| - strings->assign(*o.strings, cloneUnicodeString, ec); |
| - } else { // Invalid strings. |
| - setToBogus(); |
| - return *this; |
| + if (o.hasStrings()) { |
| + UErrorCode status = U_ZERO_ERROR; |
| + if ((strings == nullptr && !allocateStrings(status)) || |
| + (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) { |
| + setToBogus(); |
| + return *this; |
| + } |
| + } else if (hasStrings()) { |
| + strings->removeAllElements(); |
| } |
| - if (o.stringSpan == NULL || asThawed) { |
| - stringSpan = NULL; |
| - } else { |
| + if (o.stringSpan != nullptr && !asThawed) { |
| stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings); |
| if (stringSpan == NULL) { // Check for memory allocation error. |
| setToBogus(); |
| @@ -324,7 +268,7 @@ UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) { |
| } |
| releasePattern(); |
| if (o.pat) { |
| - setPattern(UnicodeString(o.pat, o.patLen)); |
| + setPattern(o.pat, o.patLen); |
| } |
| return *this; |
| } |
| @@ -357,7 +301,8 @@ UBool UnicodeSet::operator==(const UnicodeSet& o) const { |
| for (int32_t i = 0; i < len; ++i) { |
| if (list[i] != o.list[i]) return FALSE; |
| } |
| - if (*strings != *o.strings) return FALSE; |
| + if (hasStrings() != o.hasStrings()) { return FALSE; } |
| + if (hasStrings() && *strings != *o.strings) return FALSE; |
| return TRUE; |
| } |
| |
| @@ -393,7 +338,7 @@ int32_t UnicodeSet::size(void) const { |
| for (int32_t i = 0; i < count; ++i) { |
| n += getRangeEnd(i) - getRangeStart(i) + 1; |
| } |
| - return n + strings->size(); |
| + return n + stringsSize(); |
| } |
| |
| /** |
| @@ -402,7 +347,7 @@ int32_t UnicodeSet::size(void) const { |
| * @return <tt>true</tt> if this set contains no elements. |
| */ |
| UBool UnicodeSet::isEmpty(void) const { |
| - return len == 1 && strings->size() == 0; |
| + return len == 1 && !hasStrings(); |
| } |
| |
| /** |
| @@ -502,7 +447,7 @@ UBool UnicodeSet::contains(const UnicodeString& s) const { |
| if (s.length() == 0) return FALSE; |
| int32_t cp = getSingleCP(s); |
| if (cp < 0) { |
| - return strings->contains((void*) &s); |
| + return stringsContains(s); |
| } else { |
| return contains((UChar32) cp); |
| } |
| @@ -524,8 +469,7 @@ UBool UnicodeSet::containsAll(const UnicodeSet& c) const { |
| return FALSE; |
| } |
| } |
| - if (!strings->containsAll(*c.strings)) return FALSE; |
| - return TRUE; |
| + return !c.hasStrings() || (strings != nullptr && strings->containsAll(*c.strings)); |
| } |
| |
| /** |
| @@ -571,8 +515,7 @@ UBool UnicodeSet::containsNone(const UnicodeSet& c) const { |
| return FALSE; |
| } |
| } |
| - if (!strings->containsNone(*c.strings)) return FALSE; |
| - return TRUE; |
| + return strings == nullptr || !c.hasStrings() || strings->containsNone(*c.strings); |
| } |
| |
| /** |
| @@ -613,7 +556,7 @@ UBool UnicodeSet::matchesIndexValue(uint8_t v) const { |
| return TRUE; |
| } |
| } |
| - if (strings->size() != 0) { |
| + if (hasStrings()) { |
| for (i=0; i<strings->size(); ++i) { |
| const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i); |
| //if (s.length() == 0) { |
| @@ -648,7 +591,7 @@ UMatchDegree UnicodeSet::matches(const Replaceable& text, |
| return U_MISMATCH; |
| } |
| } else { |
| - if (strings->size() != 0) { // try strings first |
| + if (hasStrings()) { // try strings first |
| |
| // might separate forward and backward loops later |
| // for now they are combined |
| @@ -849,7 +792,39 @@ UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) { |
| */ |
| UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) { |
| if (pinCodePoint(start) < pinCodePoint(end)) { |
| - UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; |
| + UChar32 limit = end + 1; |
| + // Fast path for adding a new range after the last one. |
| + // Odd list length: [..., lastStart, lastLimit, HIGH] |
| + if ((len & 1) != 0) { |
| + // If the list is empty, set lastLimit low enough to not be adjacent to 0. |
| + UChar32 lastLimit = len == 1 ? -2 : list[len - 2]; |
| + if (lastLimit <= start && !isFrozen() && !isBogus()) { |
| + if (lastLimit == start) { |
| + // Extend the last range. |
| + list[len - 2] = limit; |
| + if (limit == UNICODESET_HIGH) { |
| + --len; |
| + } |
| + } else { |
| + list[len - 1] = start; |
| + if (limit < UNICODESET_HIGH) { |
| + if (ensureCapacity(len + 2)) { |
| + list[len++] = limit; |
| + list[len++] = UNICODESET_HIGH; |
| + } |
| + } else { // limit == UNICODESET_HIGH |
| + if (ensureCapacity(len + 1)) { |
| + list[len++] = UNICODESET_HIGH; |
| + } |
| + } |
| + } |
| + releasePattern(); |
| + return *this; |
| + } |
| + } |
| + // This is slow. Could be much faster using findCodePoint(start) |
| + // and modifying the list, dealing with adjacent & overlapping ranges. |
| + UChar32 range[3] = { start, limit, UNICODESET_HIGH }; |
| add(range, 2, 0); |
| } else if (start == end) { |
| add(start); |
| @@ -918,9 +893,7 @@ UnicodeSet& UnicodeSet::add(UChar32 c) { |
| list[i] = c; |
| // if we touched the HIGH mark, then add a new one |
| if (c == (UNICODESET_HIGH - 1)) { |
| - UErrorCode status = U_ZERO_ERROR; |
| - ensureCapacity(len+1, status); |
| - if (U_FAILURE(status)) { |
| + if (!ensureCapacity(len+1)) { |
| // ensureCapacity will mark the object as Bogus if OOM failure happens. |
| return *this; |
| } |
| @@ -964,21 +937,13 @@ UnicodeSet& UnicodeSet::add(UChar32 c) { |
| // ^ |
| // list[i] |
| |
| - UErrorCode status = U_ZERO_ERROR; |
| - ensureCapacity(len+2, status); |
| - if (U_FAILURE(status)) { |
| + if (!ensureCapacity(len+2)) { |
| // ensureCapacity will mark the object as Bogus if OOM failure happens. |
| return *this; |
| } |
| |
| - //for (int32_t k=len-1; k>=i; --k) { |
| - // list[k+2] = list[k]; |
| - //} |
| - UChar32* src = list + len; |
| - UChar32* dst = src + 2; |
| - UChar32* srclimit = list + i; |
| - while (src > srclimit) *(--dst) = *(--src); |
| - |
| + UChar32 *p = list + i; |
| + uprv_memmove(p + 2, p, (len - i) * sizeof(*p)); |
| list[i] = c; |
| list[i+1] = c+1; |
| len += 2; |
| @@ -1014,7 +979,7 @@ UnicodeSet& UnicodeSet::add(const UnicodeString& s) { |
| if (s.length() == 0 || isFrozen() || isBogus()) return *this; |
| int32_t cp = getSingleCP(s); |
| if (cp < 0) { |
| - if (!strings->contains((void*) &s)) { |
| + if (!stringsContains(s)) { |
| _add(s); |
| releasePattern(); |
| } |
| @@ -1033,12 +998,16 @@ void UnicodeSet::_add(const UnicodeString& s) { |
| if (isFrozen() || isBogus()) { |
| return; |
| } |
| + UErrorCode ec = U_ZERO_ERROR; |
| + if (strings == nullptr && !allocateStrings(ec)) { |
| + setToBogus(); |
| + return; |
| + } |
| UnicodeString* t = new UnicodeString(s); |
| if (t == NULL) { // Check for memory allocation error. |
| setToBogus(); |
| return; |
| } |
| - UErrorCode ec = U_ZERO_ERROR; |
| strings->sortedInsert(t, compareUnicodeString, ec); |
| if (U_FAILURE(ec)) { |
| setToBogus(); |
| @@ -1121,7 +1090,10 @@ UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) { |
| } |
| |
| UnicodeSet& UnicodeSet::removeAllStrings() { |
| - strings->removeAllElements(); |
| + if (!isFrozen() && hasStrings()) { |
| + strings->removeAllElements(); |
| + releasePattern(); |
| + } |
| return *this; |
| } |
| |
| @@ -1217,8 +1189,9 @@ UnicodeSet& UnicodeSet::remove(const UnicodeString& s) { |
| if (s.length() == 0 || isFrozen() || isBogus()) return *this; |
| int32_t cp = getSingleCP(s); |
| if (cp < 0) { |
| - strings->removeElement((void*) &s); |
| - releasePattern(); |
| + if (strings != nullptr && strings->removeElement((void*) &s)) { |
| + releasePattern(); |
| + } |
| } else { |
| remove((UChar32)cp, (UChar32)cp); |
| } |
| @@ -1260,24 +1233,17 @@ UnicodeSet& UnicodeSet::complement(void) { |
| if (isFrozen() || isBogus()) { |
| return *this; |
| } |
| - UErrorCode status = U_ZERO_ERROR; |
| if (list[0] == UNICODESET_LOW) { |
| - ensureBufferCapacity(len-1, status); |
| - if (U_FAILURE(status)) { |
| - return *this; |
| - } |
| - uprv_memcpy(buffer, list + 1, (size_t)(len-1)*sizeof(UChar32)); |
| + uprv_memmove(list, list + 1, (size_t)(len-1)*sizeof(UChar32)); |
| --len; |
| } else { |
| - ensureBufferCapacity(len+1, status); |
| - if (U_FAILURE(status)) { |
| + if (!ensureCapacity(len+1)) { |
| return *this; |
| } |
| - uprv_memcpy(buffer + 1, list, (size_t)len*sizeof(UChar32)); |
| - buffer[0] = UNICODESET_LOW; |
| + uprv_memmove(list + 1, list, (size_t)len*sizeof(UChar32)); |
| + list[0] = UNICODESET_LOW; |
| ++len; |
| } |
| - swapBuffers(); |
| releasePattern(); |
| return *this; |
| } |
| @@ -1294,7 +1260,7 @@ UnicodeSet& UnicodeSet::complement(const UnicodeString& s) { |
| if (s.length() == 0 || isFrozen() || isBogus()) return *this; |
| int32_t cp = getSingleCP(s); |
| if (cp < 0) { |
| - if (strings->contains((void*) &s)) { |
| + if (stringsContains(s)) { |
| strings->removeElement((void*) &s); |
| } else { |
| _add(s); |
| @@ -1325,7 +1291,7 @@ UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) { |
| if ( c.strings!=NULL ) { |
| for (int32_t i=0; i<c.strings->size(); ++i) { |
| const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i); |
| - if (!strings->contains((void*) s)) { |
| + if (!stringsContains(*s)) { |
| _add(*s); |
| } |
| } |
| @@ -1347,7 +1313,13 @@ UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) { |
| return *this; |
| } |
| retain(c.list, c.len, 0); |
| - strings->retainAll(*c.strings); |
| + if (hasStrings()) { |
| + if (!c.hasStrings()) { |
| + strings->removeAllElements(); |
| + } else { |
| + strings->retainAll(*c.strings); |
| + } |
| + } |
| return *this; |
| } |
| |
| @@ -1365,7 +1337,9 @@ UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) { |
| return *this; |
| } |
| retain(c.list, c.len, 2); |
| - strings->removeAll(*c.strings); |
| + if (hasStrings() && c.hasStrings()) { |
| + strings->removeAll(*c.strings); |
| + } |
| return *this; |
| } |
| |
| @@ -1383,10 +1357,12 @@ UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) { |
| } |
| exclusiveOr(c.list, c.len, 0); |
| |
| - for (int32_t i=0; i<c.strings->size(); ++i) { |
| - void* e = c.strings->elementAt(i); |
| - if (!strings->removeElement(e)) { |
| - _add(*(const UnicodeString*)e); |
| + if (c.strings != nullptr) { |
| + for (int32_t i=0; i<c.strings->size(); ++i) { |
| + void* e = c.strings->elementAt(i); |
| + if (strings == nullptr || !strings->removeElement(e)) { |
| + _add(*(const UnicodeString*)e); |
| + } |
| } |
| } |
| return *this; |
| @@ -1400,18 +1376,14 @@ UnicodeSet& UnicodeSet::clear(void) { |
| if (isFrozen()) { |
| return *this; |
| } |
| - if (list != NULL) { |
| - list[0] = UNICODESET_HIGH; |
| - } |
| + list[0] = UNICODESET_HIGH; |
| len = 1; |
| releasePattern(); |
| if (strings != NULL) { |
| strings->removeAllElements(); |
| } |
| - if (list != NULL && strings != NULL) { |
| - // Remove bogus |
| - fFlags = 0; |
| - } |
| + // Remove bogus |
| + fFlags = 0; |
| return *this; |
| } |
| |
| @@ -1445,10 +1417,6 @@ UChar32 UnicodeSet::getRangeEnd(int32_t index) const { |
| return list[index*2 + 1] - 1; |
| } |
| |
| -int32_t UnicodeSet::getStringCount() const { |
| - return strings->size(); |
| -} |
| - |
| const UnicodeString* UnicodeSet::getString(int32_t index) const { |
| return (const UnicodeString*) strings->elementAt(index); |
| } |
| @@ -1462,22 +1430,32 @@ UnicodeSet& UnicodeSet::compact() { |
| return *this; |
| } |
| // Delete buffer first to defragment memory less. |
| - if (buffer != NULL) { |
| + if (buffer != stackList) { |
| uprv_free(buffer); |
| buffer = NULL; |
| - } |
| - if (len < capacity) { |
| - // Make the capacity equal to len or 1. |
| - // We don't want to realloc of 0 size. |
| - int32_t newCapacity = len + (len == 0); |
| - UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity); |
| + bufferCapacity = 0; |
| + } |
| + if (list == stackList) { |
| + // pass |
| + } else if (len <= INITIAL_CAPACITY) { |
| + uprv_memcpy(stackList, list, len * sizeof(UChar32)); |
| + uprv_free(list); |
| + list = stackList; |
| + capacity = INITIAL_CAPACITY; |
| + } else if ((len + 7) < capacity) { |
| + // If we have more than a little unused capacity, shrink it to len. |
| + UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * len); |
| if (temp) { |
| list = temp; |
| - capacity = newCapacity; |
| + capacity = len; |
| } |
| // else what the heck happened?! We allocated less memory! |
| // Oh well. We'll keep our original array. |
| } |
| + if (strings != nullptr && strings->isEmpty()) { |
| + delete strings; |
| + strings = nullptr; |
| + } |
| return *this; |
| } |
| |
| @@ -1488,10 +1466,8 @@ UnicodeSet& UnicodeSet::compact() { |
| /** |
| * Deserialize constructor. |
| */ |
| -UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, UErrorCode &ec) |
| - : len(1), capacity(1+START_EXTRA), list(0), bmpSet(0), buffer(0), |
| - bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), |
| - fFlags(0) { |
| +UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, |
| + UErrorCode &ec) { |
| |
| if(U_FAILURE(ec)) { |
| setToBogus(); |
| @@ -1506,24 +1482,15 @@ UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization se |
| return; |
| } |
| |
| - allocateStrings(ec); |
| - if (U_FAILURE(ec)) { |
| - setToBogus(); |
| - return; |
| - } |
| - |
| // bmp? |
| int32_t headerSize = ((data[0]&0x8000)) ?2:1; |
| int32_t bmpLength = (headerSize==1)?data[0]:data[1]; |
| |
| - len = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength; |
| + int32_t newLength = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength; |
| #ifdef DEBUG_SERIALIZE |
| - printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,len, data[0],data[1],data[2],data[3]); |
| + printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,newLength, data[0],data[1],data[2],data[3]); |
| #endif |
| - capacity = len+1; |
| - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); |
| - if(!list || U_FAILURE(ec)) { |
| - setToBogus(); |
| + if(!ensureCapacity(newLength + 1)) { // +1 for HIGH |
| return; |
| } |
| // copy bmp |
| @@ -1535,15 +1502,18 @@ UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization se |
| #endif |
| } |
| // copy smp |
| - for(i=bmpLength;i<len;i++) { |
| + for(i=bmpLength;i<newLength;i++) { |
| list[i] = ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+0] << 16) + |
| ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+1]); |
| #ifdef DEBUG_SERIALIZE |
| printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]); |
| #endif |
| } |
| - // terminator |
| - list[len++]=UNICODESET_HIGH; |
| + U_ASSERT(i == newLength); |
| + if (i == 0 || list[i - 1] != UNICODESET_HIGH) { |
| + list[i++] = UNICODESET_HIGH; |
| + } |
| + len = i; |
| } |
| |
| |
| @@ -1664,33 +1634,65 @@ UBool UnicodeSet::allocateStrings(UErrorCode &status) { |
| return TRUE; |
| } |
| |
| -void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) { |
| +int32_t UnicodeSet::nextCapacity(int32_t minCapacity) { |
| + // Grow exponentially to reduce the frequency of allocations. |
| + if (minCapacity < INITIAL_CAPACITY) { |
| + return minCapacity + INITIAL_CAPACITY; |
| + } else if (minCapacity <= 2500) { |
| + return 5 * minCapacity; |
| + } else { |
| + int32_t newCapacity = 2 * minCapacity; |
| + if (newCapacity > MAX_LENGTH) { |
| + newCapacity = MAX_LENGTH; |
| + } |
| + return newCapacity; |
| + } |
| +} |
| + |
| +bool UnicodeSet::ensureCapacity(int32_t newLen) { |
| + if (newLen > MAX_LENGTH) { |
| + newLen = MAX_LENGTH; |
| + } |
| if (newLen <= capacity) { |
| - return; |
| + return true; |
| } |
| - UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA)); |
| + int32_t newCapacity = nextCapacity(newLen); |
| + UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32)); |
| if (temp == NULL) { |
| - ec = U_MEMORY_ALLOCATION_ERROR; |
| setToBogus(); // set the object to bogus state if an OOM failure occurred. |
| - return; |
| + return false; |
| + } |
| + // Copy only the actual contents. |
| + uprv_memcpy(temp, list, len * sizeof(UChar32)); |
| + if (list != stackList) { |
| + uprv_free(list); |
| } |
| list = temp; |
| - capacity = newLen + GROW_EXTRA; |
| - // else we keep the original contents on the memory failure. |
| + capacity = newCapacity; |
| + return true; |
| } |
| |
| -void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) { |
| - if (buffer != NULL && newLen <= bufferCapacity) |
| - return; |
| - UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA)); |
| +bool UnicodeSet::ensureBufferCapacity(int32_t newLen) { |
| + if (newLen > MAX_LENGTH) { |
| + newLen = MAX_LENGTH; |
| + } |
| + if (newLen <= bufferCapacity) { |
| + return true; |
| + } |
| + int32_t newCapacity = nextCapacity(newLen); |
| + UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32)); |
| if (temp == NULL) { |
| - ec = U_MEMORY_ALLOCATION_ERROR; |
| setToBogus(); |
| - return; |
| + return false; |
| + } |
| + // The buffer has no contents to be copied. |
| + // It is always filled from scratch after this call. |
| + if (buffer != stackList) { |
| + uprv_free(buffer); |
| } |
| buffer = temp; |
| - bufferCapacity = newLen + GROW_EXTRA; |
| - // else we keep the original contents on the memory failure. |
| + bufferCapacity = newCapacity; |
| + return true; |
| } |
| |
| /** |
| @@ -1727,9 +1729,7 @@ void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t pola |
| if (isFrozen() || isBogus()) { |
| return; |
| } |
| - UErrorCode status = U_ZERO_ERROR; |
| - ensureBufferCapacity(len + otherLen, status); |
| - if (U_FAILURE(status)) { |
| + if (!ensureBufferCapacity(len + otherLen)) { |
| return; |
| } |
| |
| @@ -1777,9 +1777,7 @@ void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) { |
| if (isFrozen() || isBogus() || other==NULL) { |
| return; |
| } |
| - UErrorCode status = U_ZERO_ERROR; |
| - ensureBufferCapacity(len + otherLen, status); |
| - if (U_FAILURE(status)) { |
| + if (!ensureBufferCapacity(len + otherLen)) { |
| return; |
| } |
| |
| @@ -1890,9 +1888,7 @@ void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) |
| if (isFrozen() || isBogus()) { |
| return; |
| } |
| - UErrorCode status = U_ZERO_ERROR; |
| - ensureBufferCapacity(len + otherLen, status); |
| - if (U_FAILURE(status)) { |
| + if (!ensureBufferCapacity(len + otherLen)) { |
| return; |
| } |
| |
| @@ -2138,12 +2134,14 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result, |
| } |
| } |
| |
| - for (int32_t i = 0; i<strings->size(); ++i) { |
| - result.append(OPEN_BRACE); |
| - _appendToPat(result, |
| - *(const UnicodeString*) strings->elementAt(i), |
| - escapeUnprintable); |
| - result.append(CLOSE_BRACE); |
| + if (strings != nullptr) { |
| + for (int32_t i = 0; i<strings->size(); ++i) { |
| + result.append(OPEN_BRACE); |
| + _appendToPat(result, |
| + *(const UnicodeString*) strings->elementAt(i), |
| + escapeUnprintable); |
| + result.append(CLOSE_BRACE); |
| + } |
| } |
| return result.append(SET_CLOSE); |
| } |
| @@ -2162,13 +2160,12 @@ void UnicodeSet::releasePattern() { |
| /** |
| * Set the new pattern to cache. |
| */ |
| -void UnicodeSet::setPattern(const UnicodeString& newPat) { |
| +void UnicodeSet::setPattern(const char16_t *newPat, int32_t newPatLen) { |
| releasePattern(); |
| - int32_t newPatLen = newPat.length(); |
| pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar)); |
| if (pat) { |
| patLen = newPatLen; |
| - newPat.extractBetween(0, patLen, pat); |
| + u_memcpy(pat, newPat, patLen); |
| pat[patLen] = 0; |
| } |
| // else we don't care if malloc failed. This was just a nice cache. |
| @@ -2177,30 +2174,15 @@ void UnicodeSet::setPattern(const UnicodeString& newPat) { |
| |
| UnicodeFunctor *UnicodeSet::freeze() { |
| if(!isFrozen() && !isBogus()) { |
| - // Do most of what compact() does before freezing because |
| - // compact() will not work when the set is frozen. |
| - // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA). |
| - |
| - // Delete buffer first to defragment memory less. |
| - if (buffer != NULL) { |
| - uprv_free(buffer); |
| - buffer = NULL; |
| - } |
| - if (capacity > (len + GROW_EXTRA)) { |
| - // Make the capacity equal to len or 1. |
| - // We don't want to realloc of 0 size. |
| - capacity = len + (len == 0); |
| - list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity); |
| - if (list == NULL) { // Check for memory allocation error. |
| - setToBogus(); |
| - return this; |
| - } |
| - } |
| + compact(); |
| |
| // Optimize contains() and span() and similar functions. |
| - if (!strings->isEmpty()) { |
| + if (hasStrings()) { |
| stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL); |
| - if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) { |
| + if (stringSpan == nullptr) { |
| + setToBogus(); |
| + return this; |
| + } else if (!stringSpan->needsStringSpanUTF16()) { |
| // All strings are irrelevant for span() etc. because |
| // all of each string's code points are contained in this set. |
| // Do not check needsStringSpanUTF8() because UTF-8 has at most as |
| @@ -2233,7 +2215,7 @@ int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanC |
| } |
| if(stringSpan!=NULL) { |
| return stringSpan->span(s, length, spanCondition); |
| - } else if(!strings->isEmpty()) { |
| + } else if(hasStrings()) { |
| uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? |
| UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED : |
| UnicodeSetStringSpan::FWD_UTF16_CONTAINED; |
| @@ -2270,7 +2252,7 @@ int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition s |
| } |
| if(stringSpan!=NULL) { |
| return stringSpan->spanBack(s, length, spanCondition); |
| - } else if(!strings->isEmpty()) { |
| + } else if(hasStrings()) { |
| uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? |
| UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED : |
| UnicodeSetStringSpan::BACK_UTF16_CONTAINED; |
| @@ -2308,7 +2290,7 @@ int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition sp |
| } |
| if(stringSpan!=NULL) { |
| return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition); |
| - } else if(!strings->isEmpty()) { |
| + } else if(hasStrings()) { |
| uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? |
| UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED : |
| UnicodeSetStringSpan::FWD_UTF8_CONTAINED; |
| @@ -2346,7 +2328,7 @@ int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanConditio |
| } |
| if(stringSpan!=NULL) { |
| return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition); |
| - } else if(!strings->isEmpty()) { |
| + } else if(hasStrings()) { |
| uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? |
| UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED : |
| UnicodeSetStringSpan::BACK_UTF8_CONTAINED; |
| diff --git a/source/common/uniset_closure.cpp b/source/common/uniset_closure.cpp |
| index 0b7da796..882231ba 100644 |
| --- a/source/common/uniset_closure.cpp |
| +++ b/source/common/uniset_closure.cpp |
| @@ -31,10 +31,6 @@ |
| #include "util.h" |
| #include "uvector.h" |
| |
| -// initial storage. Must be >= 0 |
| -// *** same as in uniset.cpp ! *** |
| -#define START_EXTRA 16 |
| - |
| U_NAMESPACE_BEGIN |
| |
| // TODO memory debugging provided inside uniset.cpp |
| @@ -49,42 +45,16 @@ U_NAMESPACE_BEGIN |
| UnicodeSet::UnicodeSet(const UnicodeString& pattern, |
| uint32_t options, |
| const SymbolTable* symbols, |
| - UErrorCode& status) : |
| - len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), |
| - bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), |
| - fFlags(0) |
| -{ |
| - if(U_SUCCESS(status)){ |
| - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); |
| - /* test for NULL */ |
| - if(list == NULL) { |
| - status = U_MEMORY_ALLOCATION_ERROR; |
| - }else{ |
| - allocateStrings(status); |
| - applyPattern(pattern, options, symbols, status); |
| - } |
| - } |
| + UErrorCode& status) { |
| + applyPattern(pattern, options, symbols, status); |
| _dbgct(this); |
| } |
| |
| UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, |
| uint32_t options, |
| const SymbolTable* symbols, |
| - UErrorCode& status) : |
| - len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), |
| - bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), |
| - fFlags(0) |
| -{ |
| - if(U_SUCCESS(status)){ |
| - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); |
| - /* test for NULL */ |
| - if(list == NULL) { |
| - status = U_MEMORY_ALLOCATION_ERROR; |
| - }else{ |
| - allocateStrings(status); |
| - applyPattern(pattern, pos, options, symbols, status); |
| - } |
| - } |
| + UErrorCode& status) { |
| + applyPattern(pattern, pos, options, symbols, status); |
| _dbgct(this); |
| } |
| |
| @@ -199,7 +169,7 @@ UnicodeSet& UnicodeSet::closeOver(int32_t attribute) { |
| // start with input set to guarantee inclusion |
| // USET_CASE: remove strings because the strings will actually be reduced (folded); |
| // therefore, start with no strings and add only those needed |
| - if (attribute & USET_CASE_INSENSITIVE) { |
| + if ((attribute & USET_CASE_INSENSITIVE) && foldSet.hasStrings()) { |
| foldSet.strings->removeAllElements(); |
| } |
| |
| @@ -234,7 +204,7 @@ UnicodeSet& UnicodeSet::closeOver(int32_t attribute) { |
| } |
| } |
| } |
| - if (strings != NULL && strings->size() > 0) { |
| + if (hasStrings()) { |
| if (attribute & USET_CASE_INSENSITIVE) { |
| for (int32_t j=0; j<strings->size(); ++j) { |
| str = *(const UnicodeString *) strings->elementAt(j); |
| diff --git a/source/common/uniset_props.cpp b/source/common/uniset_props.cpp |
| index 6cfd80a7..e98c175f 100644 |
| --- a/source/common/uniset_props.cpp |
| +++ b/source/common/uniset_props.cpp |
| @@ -47,10 +47,6 @@ |
| |
| U_NAMESPACE_USE |
| |
| -// initial storage. Must be >= 0 |
| -// *** same as in uniset.cpp ! *** |
| -#define START_EXTRA 16 |
| - |
| // Define UChar constants using hex for EBCDIC compatibility |
| // Used #define to reduce private static exports and memory access time. |
| #define SET_OPEN ((UChar)0x005B) /*[*/ |
| @@ -185,21 +181,8 @@ isPOSIXClose(const UnicodeString &pattern, int32_t pos) { |
| * @param pattern a string specifying what characters are in the set |
| */ |
| UnicodeSet::UnicodeSet(const UnicodeString& pattern, |
| - UErrorCode& status) : |
| - len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), |
| - bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), |
| - fFlags(0) |
| -{ |
| - if(U_SUCCESS(status)){ |
| - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); |
| - /* test for NULL */ |
| - if(list == NULL) { |
| - status = U_MEMORY_ALLOCATION_ERROR; |
| - }else{ |
| - allocateStrings(status); |
| - applyPattern(pattern, status); |
| - } |
| - } |
| + UErrorCode& status) { |
| + applyPattern(pattern, status); |
| _dbgct(this); |
| } |
| |
| @@ -713,6 +696,11 @@ static UBool numericValueFilter(UChar32 ch, void* context) { |
| return u_getNumericValue(ch) == *(double*)context; |
| } |
| |
| +static UBool generalCategoryMaskFilter(UChar32 ch, void* context) { |
| + int32_t value = *(int32_t*)context; |
| + return (U_GET_GC_MASK((UChar32) ch) & value) != 0; |
| +} |
| + |
| static UBool versionFilter(UChar32 ch, void* context) { |
| static const UVersionInfo none = { 0, 0, 0, 0 }; |
| UVersionInfo v; |
| @@ -721,6 +709,16 @@ static UBool versionFilter(UChar32 ch, void* context) { |
| return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0; |
| } |
| |
| +typedef struct { |
| + UProperty prop; |
| + int32_t value; |
| +} IntPropertyContext; |
| + |
| +static UBool intPropertyFilter(UChar32 ch, void* context) { |
| + IntPropertyContext* c = (IntPropertyContext*)context; |
| + return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value; |
| +} |
| + |
| static UBool scriptExtensionsFilter(UChar32 ch, void* context) { |
| return uscript_hasScript(ch, *(UScriptCode*)context); |
| } |
| @@ -781,43 +779,6 @@ void UnicodeSet::applyFilter(UnicodeSet::Filter filter, |
| |
| namespace { |
| |
| -/** Maps map values to 1 if the mask contains their value'th bit, all others to 0. */ |
| -uint32_t U_CALLCONV generalCategoryMaskFilter(const void *context, uint32_t value) { |
| - uint32_t mask = *(const uint32_t *)context; |
| - value = U_MASK(value) & mask; |
| - if (value != 0) { value = 1; } |
| - return value; |
| -} |
| - |
| -/** Maps one map value to 1, all others to 0. */ |
| -uint32_t U_CALLCONV intValueFilter(const void *context, uint32_t value) { |
| - uint32_t v = *(const uint32_t *)context; |
| - return value == v ? 1 : 0; |
| -} |
| - |
| -} // namespace |
| - |
| -void UnicodeSet::applyIntPropertyValue(const UCPMap *map, |
| - UCPMapValueFilter *filter, const void *context, |
| - UErrorCode &errorCode) { |
| - if (U_FAILURE(errorCode)) { return; } |
| - clear(); |
| - UChar32 start = 0, end; |
| - uint32_t value; |
| - while ((end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0, |
| - filter, context, &value)) >= 0) { |
| - if (value != 0) { |
| - add(start, end); |
| - } |
| - start = end + 1; |
| - } |
| - if (isBogus()) { |
| - errorCode = U_MEMORY_ALLOCATION_ERROR; |
| - } |
| -} |
| - |
| -namespace { |
| - |
| static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) { |
| /* Note: we use ' ' in compiler code page */ |
| int32_t j = 0; |
| @@ -845,11 +806,10 @@ static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) { |
| |
| UnicodeSet& |
| UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) { |
| - if (U_FAILURE(ec)) { return *this; } |
| - // All of the following check isFrozen() before modifying this set. |
| + if (U_FAILURE(ec) || isFrozen()) { return *this; } |
| if (prop == UCHAR_GENERAL_CATEGORY_MASK) { |
| - const UCPMap *map = u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY, &ec); |
| - applyIntPropertyValue(map, generalCategoryMaskFilter, &value, ec); |
| + const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec); |
| + applyFilter(generalCategoryMaskFilter, &value, inclusions, ec); |
| } else if (prop == UCHAR_SCRIPT_EXTENSIONS) { |
| const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec); |
| UScriptCode script = (UScriptCode)value; |
| @@ -866,14 +826,11 @@ UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) |
| clear(); |
| } |
| } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) { |
| - const UCPMap *map = u_getIntPropertyMap(prop, &ec); |
| - applyIntPropertyValue(map, intValueFilter, &value, ec); |
| + const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec); |
| + IntPropertyContext c = {prop, value}; |
| + applyFilter(intPropertyFilter, &c, inclusions, ec); |
| } else { |
| - // This code used to always call getInclusions(property source) |
| - // which sets an error for an unsupported property. |
| ec = U_ILLEGAL_ARGUMENT_ERROR; |
| - // Otherwise we would just clear() this set because |
| - // getIntPropertyValue(c, prop) returns 0 for all code points. |
| } |
| return *this; |
| } |
| diff --git a/source/common/uprops.h b/source/common/uprops.h |
| index 1a8e4e84..34b3600b 100644 |
| --- a/source/common/uprops.h |
| +++ b/source/common/uprops.h |
| @@ -462,7 +462,6 @@ class UnicodeSet; |
| class CharacterProperties { |
| public: |
| CharacterProperties() = delete; |
| - static void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode); |
| static const UnicodeSet *getInclusionsForProperty(UProperty prop, UErrorCode &errorCode); |
| }; |
| |
| diff --git a/source/common/uset.cpp b/source/common/uset.cpp |
| index 39ad0a34..eae7981d 100644 |
| --- a/source/common/uset.cpp |
| +++ b/source/common/uset.cpp |
| @@ -249,7 +249,7 @@ class USetAccess /* not : public UObject because all methods are static */ { |
| public: |
| /* Try to have the compiler inline these*/ |
| inline static int32_t getStringCount(const UnicodeSet& set) { |
| - return set.getStringCount(); |
| + return set.stringsSize(); |
| } |
| inline static const UnicodeString* getString(const UnicodeSet& set, |
| int32_t i) { |
| diff --git a/source/common/usetiter.cpp b/source/common/usetiter.cpp |
| index 93048ba2..79151690 100644 |
| --- a/source/common/usetiter.cpp |
| +++ b/source/common/usetiter.cpp |
| @@ -116,7 +116,7 @@ void UnicodeSetIterator::reset() { |
| stringCount = 0; |
| } else { |
| endRange = set->getRangeCount() - 1; |
| - stringCount = set->strings->size(); |
| + stringCount = set->stringsSize(); |
| } |
| range = 0; |
| endElement = -1; |