X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/729e4ab9bc6618bc3d8a898e575df7f4019e29ca..a01113dcd0f39d5da295ef82785beff9ed86fe38:/icuSources/common/uniset.cpp diff --git a/icuSources/common/uniset.cpp b/icuSources/common/uniset.cpp index 4ff3f6b2..20242776 100644 --- a/icuSources/common/uniset.cpp +++ b/icuSources/common/uniset.cpp @@ -1,6 +1,8 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** -* Copyright (C) 1999-2009, International Business Machines +* Copyright (C) 1999-2015, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description @@ -9,19 +11,22 @@ */ #include "unicode/utypes.h" -#include "unicode/uniset.h" #include "unicode/parsepos.h" #include "unicode/symtable.h" +#include "unicode/uniset.h" +#include "unicode/ustring.h" +#include "unicode/utf8.h" +#include "unicode/utf16.h" #include "ruleiter.h" #include "cmemory.h" #include "cstring.h" -#include "uhash.h" +#include "patternprops.h" +#include "uelement.h" #include "util.h" #include "uvector.h" #include "charstr.h" #include "ustrfmt.h" #include "uassert.h" -#include "hash.h" #include "bmpset.h" #include "unisetspan.h" @@ -49,11 +54,8 @@ // LOW <= all valid values. ZERO for codepoints #define UNICODESET_LOW 0x000000 -// initial storage. Must be >= 0 -#define START_EXTRA 16 - -// extra amount for growth. Must be >= 0 -#define GROW_EXTRA START_EXTRA +/** Max list [0, 1, 2, ..., max code point, HIGH] */ +constexpr int32_t MAX_LENGTH = UNICODESET_HIGH + 1; U_NAMESPACE_BEGIN @@ -123,16 +125,28 @@ static inline void _dbgdt(UnicodeSet* set) { // UnicodeString in UVector support //---------------------------------------------------------------- -static void U_CALLCONV cloneUnicodeString(UHashTok *dst, UHashTok *src) { +static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) { dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer); } -static int8_t U_CALLCONV compareUnicodeString(UHashTok t1, UHashTok t2) { +static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { const UnicodeString &a = *(const UnicodeString*)t1.pointer; const UnicodeString &b = *(const UnicodeString*)t2.pointer; return a.compare(b); } +UBool UnicodeSet::hasStrings() const { + return strings != nullptr && !strings->isEmpty(); +} + +int32_t UnicodeSet::stringsSize() const { + return strings == nullptr ? 0 : strings->size(); +} + +UBool UnicodeSet::stringsContains(const UnicodeString &s) const { + return strings != nullptr && strings->contains((void*) &s); +} + //---------------------------------------------------------------- // Constructors &c //---------------------------------------------------------------- @@ -140,23 +154,8 @@ static int8_t U_CALLCONV compareUnicodeString(UHashTok t1, UHashTok t2) { /** * Constructs an empty set. */ -UnicodeSet::UnicodeSet() : - len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0), - bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), - fFlags(0) -{ - UErrorCode status = U_ZERO_ERROR; - allocateStrings(status); - if (U_FAILURE(status)) { - return; - } - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); - if(list!=NULL){ - list[0] = UNICODESET_HIGH; - } else { // If memory allocation failed, set to bogus state. - setToBogus(); - return; - } +UnicodeSet::UnicodeSet() { + list[0] = UNICODESET_HIGH; _dbgct(this); } @@ -167,86 +166,39 @@ UnicodeSet::UnicodeSet() : * @param start first character, inclusive, of range * @param end last character, inclusive, of range */ -UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) : - len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0), - bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), - fFlags(0) -{ - UErrorCode status = U_ZERO_ERROR; - allocateStrings(status); - if (U_FAILURE(status)) { - return; - } - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); - if(list!=NULL){ - list[0] = UNICODESET_HIGH; - complement(start, end); - } else { // If memory allocation failed, set to bogus state. - setToBogus(); - return; - } +UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) { + list[0] = UNICODESET_HIGH; + add(start, end); _dbgct(this); } /** * Constructs a set that is identical to the given UnicodeSet. */ -UnicodeSet::UnicodeSet(const UnicodeSet& o) : - UnicodeFilter(o), - len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0), - bmpSet(0), - buffer(0), bufferCapacity(0), - patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), - fFlags(0) -{ - UErrorCode status = U_ZERO_ERROR; - allocateStrings(status); - if (U_FAILURE(status)) { - return; - } - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); - if(list!=NULL){ - *this = o; - } else { // If memory allocation failed, set to bogus state. - setToBogus(); - return; - } +UnicodeSet::UnicodeSet(const UnicodeSet& o) : UnicodeFilter(o) { + *this = o; _dbgct(this); } // Copy-construct as thawed. -UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : - UnicodeFilter(o), - len(0), capacity(o.len + GROW_EXTRA), list(0), - bmpSet(0), - buffer(0), bufferCapacity(0), - patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), - fFlags(0) -{ - UErrorCode status = U_ZERO_ERROR; - allocateStrings(status); - if (U_FAILURE(status)) { - return; - } - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); - if(list!=NULL){ +UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : UnicodeFilter(o) { + if (ensureCapacity(o.len)) { // *this = o except for bmpSet and stringSpan len = o.len; - uprv_memcpy(list, o.list, len*sizeof(UChar32)); - if (strings != NULL && o.strings != NULL) { - strings->assign(*o.strings, cloneUnicodeString, status); - } else { // Invalid strings. - setToBogus(); - return; + uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32)); + if (o.hasStrings()) { + UErrorCode status = U_ZERO_ERROR; + if (!allocateStrings(status) || + (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) { + setToBogus(); + return; + } } if (o.pat) { - setPattern(UnicodeString(o.pat, o.patLen)); + setPattern(o.pat, o.patLen); } - } else { // If memory allocation failed, set to bogus state. - setToBogus(); - return; + _dbgct(this); } - _dbgct(this); } /** @@ -254,9 +206,11 @@ UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : */ UnicodeSet::~UnicodeSet() { _dbgdt(this); // first! - uprv_free(list); + if (list != stackList) { + uprv_free(list); + } delete bmpSet; - if (buffer) { + if (buffer != stackList) { uprv_free(buffer); } delete strings; @@ -268,6 +222,10 @@ UnicodeSet::~UnicodeSet() { * Assigns this object to be a copy of another. */ UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) { + return copyFrom(o, FALSE); +} + +UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) { if (this == &o) { return *this; } @@ -278,31 +236,30 @@ UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) { setToBogus(); return *this; } - UErrorCode ec = U_ZERO_ERROR; - ensureCapacity(o.len, ec); - if (U_FAILURE(ec)) { - return *this; // There is no way to report this error :-( + if (!ensureCapacity(o.len)) { + // ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens. + return *this; } len = o.len; - uprv_memcpy(list, o.list, len*sizeof(UChar32)); - if (o.bmpSet == NULL) { - bmpSet = NULL; - } else { + uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32)); + if (o.bmpSet != nullptr && !asThawed) { bmpSet = new BMPSet(*o.bmpSet, list, len); if (bmpSet == NULL) { // Check for memory allocation error. setToBogus(); return *this; } } - if (strings != NULL && o.strings != NULL) { - strings->assign(*o.strings, cloneUnicodeString, ec); - } else { // Invalid strings. - setToBogus(); - return *this; + if (o.hasStrings()) { + UErrorCode status = U_ZERO_ERROR; + if ((strings == nullptr && !allocateStrings(status)) || + (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) { + setToBogus(); + return *this; + } + } else if (hasStrings()) { + strings->removeAllElements(); } - if (o.stringSpan == NULL) { - stringSpan = NULL; - } else { + if (o.stringSpan != nullptr && !asThawed) { stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings); if (stringSpan == NULL) { // Check for memory allocation error. setToBogus(); @@ -311,7 +268,7 @@ UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) { } releasePattern(); if (o.pat) { - setPattern(UnicodeString(o.pat, o.patLen)); + setPattern(o.pat, o.patLen); } return *this; } @@ -344,7 +301,8 @@ UBool UnicodeSet::operator==(const UnicodeSet& o) const { for (int32_t i = 0; i < len; ++i) { if (list[i] != o.list[i]) return FALSE; } - if (*strings != *o.strings) return FALSE; + if (hasStrings() != o.hasStrings()) { return FALSE; } + if (hasStrings() && *strings != *o.strings) return FALSE; return TRUE; } @@ -355,12 +313,12 @@ UBool UnicodeSet::operator==(const UnicodeSet& o) const { * @see Object#hashCode() */ int32_t UnicodeSet::hashCode(void) const { - int32_t result = len; + uint32_t result = static_cast(len); for (int32_t i = 0; i < len; ++i) { - result *= 1000003; + result *= 1000003u; result += list[i]; } - return result; + return static_cast(result); } //---------------------------------------------------------------- @@ -380,7 +338,7 @@ int32_t UnicodeSet::size(void) const { for (int32_t i = 0; i < count; ++i) { n += getRangeEnd(i) - getRangeStart(i) + 1; } - return n + strings->size(); + return n + stringsSize(); } /** @@ -389,7 +347,7 @@ int32_t UnicodeSet::size(void) const { * @return true if this set contains no elements. */ UBool UnicodeSet::isEmpty(void) const { - return len == 1 && strings->size() == 0; + return len == 1 && !hasStrings(); } /** @@ -489,7 +447,7 @@ UBool UnicodeSet::contains(const UnicodeString& s) const { if (s.length() == 0) return FALSE; int32_t cp = getSingleCP(s); if (cp < 0) { - return strings->contains((void*) &s); + return stringsContains(s); } else { return contains((UChar32) cp); } @@ -511,8 +469,7 @@ UBool UnicodeSet::containsAll(const UnicodeSet& c) const { return FALSE; } } - if (!strings->containsAll(*c.strings)) return FALSE; - return TRUE; + return !c.hasStrings() || (strings != nullptr && strings->containsAll(*c.strings)); } /** @@ -558,8 +515,7 @@ UBool UnicodeSet::containsNone(const UnicodeSet& c) const { return FALSE; } } - if (!strings->containsNone(*c.strings)) return FALSE; - return TRUE; + return strings == nullptr || !c.hasStrings() || strings->containsNone(*c.strings); } /** @@ -600,7 +556,7 @@ UBool UnicodeSet::matchesIndexValue(uint8_t v) const { return TRUE; } } - if (strings->size() != 0) { + if (hasStrings()) { for (i=0; isize(); ++i) { const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i); //if (s.length() == 0) { @@ -635,7 +591,7 @@ UMatchDegree UnicodeSet::matches(const Replaceable& text, return U_MISMATCH; } } else { - if (strings->size() != 0) { // try strings first + if (hasStrings()) { // try strings first // might separate forward and backward loops later // for now they are combined @@ -836,7 +792,39 @@ UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) { */ UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) { if (pinCodePoint(start) < pinCodePoint(end)) { - UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; + UChar32 limit = end + 1; + // Fast path for adding a new range after the last one. + // Odd list length: [..., lastStart, lastLimit, HIGH] + if ((len & 1) != 0) { + // If the list is empty, set lastLimit low enough to not be adjacent to 0. + UChar32 lastLimit = len == 1 ? -2 : list[len - 2]; + if (lastLimit <= start && !isFrozen() && !isBogus()) { + if (lastLimit == start) { + // Extend the last range. + list[len - 2] = limit; + if (limit == UNICODESET_HIGH) { + --len; + } + } else { + list[len - 1] = start; + if (limit < UNICODESET_HIGH) { + if (ensureCapacity(len + 2)) { + list[len++] = limit; + list[len++] = UNICODESET_HIGH; + } + } else { // limit == UNICODESET_HIGH + if (ensureCapacity(len + 1)) { + list[len++] = UNICODESET_HIGH; + } + } + } + releasePattern(); + return *this; + } + } + // This is slow. Could be much faster using findCodePoint(start) + // and modifying the list, dealing with adjacent & overlapping ranges. + UChar32 range[3] = { start, limit, UNICODESET_HIGH }; add(range, 2, 0); } else if (start == end) { add(start); @@ -905,10 +893,9 @@ UnicodeSet& UnicodeSet::add(UChar32 c) { list[i] = c; // if we touched the HIGH mark, then add a new one if (c == (UNICODESET_HIGH - 1)) { - UErrorCode status = U_ZERO_ERROR; - ensureCapacity(len+1, status); - if (U_FAILURE(status)) { - return *this; // There is no way to report this error :-( + if (!ensureCapacity(len+1)) { + // ensureCapacity will mark the object as Bogus if OOM failure happens. + return *this; } list[len++] = UNICODESET_HIGH; } @@ -950,20 +937,13 @@ UnicodeSet& UnicodeSet::add(UChar32 c) { // ^ // list[i] - UErrorCode status = U_ZERO_ERROR; - ensureCapacity(len+2, status); - if (U_FAILURE(status)) { - return *this; // There is no way to report this error :-( + if (!ensureCapacity(len+2)) { + // ensureCapacity will mark the object as Bogus if OOM failure happens. + return *this; } - //for (int32_t k=len-1; k>=i; --k) { - // list[k+2] = list[k]; - //} - UChar32* src = list + len; - UChar32* dst = src + 2; - UChar32* srclimit = list + i; - while (src > srclimit) *(--dst) = *(--src); - + UChar32 *p = list + i; + uprv_memmove(p + 2, p, (len - i) * sizeof(*p)); list[i] = c; list[i+1] = c+1; len += 2; @@ -999,7 +979,7 @@ UnicodeSet& UnicodeSet::add(const UnicodeString& s) { if (s.length() == 0 || isFrozen() || isBogus()) return *this; int32_t cp = getSingleCP(s); if (cp < 0) { - if (!strings->contains((void*) &s)) { + if (!stringsContains(s)) { _add(s); releasePattern(); } @@ -1018,12 +998,16 @@ void UnicodeSet::_add(const UnicodeString& s) { if (isFrozen() || isBogus()) { return; } + UErrorCode ec = U_ZERO_ERROR; + if (strings == nullptr && !allocateStrings(ec)) { + setToBogus(); + return; + } UnicodeString* t = new UnicodeString(s); if (t == NULL) { // Check for memory allocation error. setToBogus(); return; } - UErrorCode ec = U_ZERO_ERROR; strings->sortedInsert(t, compareUnicodeString, ec); if (U_FAILURE(ec)) { setToBogus(); @@ -1059,7 +1043,7 @@ int32_t UnicodeSet::getSingleCP(const UnicodeString& s) { */ UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) { UChar32 cp; - for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) { + for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) { cp = s.char32At(i); add(cp); } @@ -1106,7 +1090,10 @@ UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) { } UnicodeSet& UnicodeSet::removeAllStrings() { - strings->removeAllElements(); + if (!isFrozen() && hasStrings()) { + strings->removeAllElements(); + releasePattern(); + } return *this; } @@ -1202,8 +1189,9 @@ UnicodeSet& UnicodeSet::remove(const UnicodeString& s) { if (s.length() == 0 || isFrozen() || isBogus()) return *this; int32_t cp = getSingleCP(s); if (cp < 0) { - strings->removeElement((void*) &s); - releasePattern(); + if (strings != nullptr && strings->removeElement((void*) &s)) { + releasePattern(); + } } else { remove((UChar32)cp, (UChar32)cp); } @@ -1245,24 +1233,17 @@ UnicodeSet& UnicodeSet::complement(void) { if (isFrozen() || isBogus()) { return *this; } - UErrorCode status = U_ZERO_ERROR; if (list[0] == UNICODESET_LOW) { - ensureBufferCapacity(len-1, status); - if (U_FAILURE(status)) { - return *this; - } - uprv_memcpy(buffer, list + 1, (len-1)*sizeof(UChar32)); + uprv_memmove(list, list + 1, (size_t)(len-1)*sizeof(UChar32)); --len; } else { - ensureBufferCapacity(len+1, status); - if (U_FAILURE(status)) { + if (!ensureCapacity(len+1)) { return *this; } - uprv_memcpy(buffer + 1, list, len*sizeof(UChar32)); - buffer[0] = UNICODESET_LOW; + uprv_memmove(list + 1, list, (size_t)len*sizeof(UChar32)); + list[0] = UNICODESET_LOW; ++len; } - swapBuffers(); releasePattern(); return *this; } @@ -1279,7 +1260,7 @@ UnicodeSet& UnicodeSet::complement(const UnicodeString& s) { if (s.length() == 0 || isFrozen() || isBogus()) return *this; int32_t cp = getSingleCP(s); if (cp < 0) { - if (strings->contains((void*) &s)) { + if (stringsContains(s)) { strings->removeElement((void*) &s); } else { _add(s); @@ -1310,7 +1291,7 @@ UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) { if ( c.strings!=NULL ) { for (int32_t i=0; isize(); ++i) { const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i); - if (!strings->contains((void*) s)) { + if (!stringsContains(*s)) { _add(*s); } } @@ -1332,7 +1313,13 @@ UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) { return *this; } retain(c.list, c.len, 0); - strings->retainAll(*c.strings); + if (hasStrings()) { + if (!c.hasStrings()) { + strings->removeAllElements(); + } else { + strings->retainAll(*c.strings); + } + } return *this; } @@ -1350,7 +1337,9 @@ UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) { return *this; } retain(c.list, c.len, 2); - strings->removeAll(*c.strings); + if (hasStrings() && c.hasStrings()) { + strings->removeAll(*c.strings); + } return *this; } @@ -1368,10 +1357,12 @@ UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) { } exclusiveOr(c.list, c.len, 0); - for (int32_t i=0; isize(); ++i) { - void* e = c.strings->elementAt(i); - if (!strings->removeElement(e)) { - _add(*(const UnicodeString*)e); + if (c.strings != nullptr) { + for (int32_t i=0; isize(); ++i) { + void* e = c.strings->elementAt(i); + if (strings == nullptr || !strings->removeElement(e)) { + _add(*(const UnicodeString*)e); + } } } return *this; @@ -1385,18 +1376,14 @@ UnicodeSet& UnicodeSet::clear(void) { if (isFrozen()) { return *this; } - if (list != NULL) { - list[0] = UNICODESET_HIGH; - } + list[0] = UNICODESET_HIGH; len = 1; releasePattern(); if (strings != NULL) { strings->removeAllElements(); } - if (list != NULL && strings != NULL) { - // Remove bogus - fFlags = 0; - } + // Remove bogus + fFlags = 0; return *this; } @@ -1430,10 +1417,6 @@ UChar32 UnicodeSet::getRangeEnd(int32_t index) const { return list[index*2 + 1] - 1; } -int32_t UnicodeSet::getStringCount() const { - return strings->size(); -} - const UnicodeString* UnicodeSet::getString(int32_t index) const { return (const UnicodeString*) strings->elementAt(index); } @@ -1447,25 +1430,93 @@ UnicodeSet& UnicodeSet::compact() { return *this; } // Delete buffer first to defragment memory less. - if (buffer != NULL) { + if (buffer != stackList) { uprv_free(buffer); buffer = NULL; - } - if (len < capacity) { - // Make the capacity equal to len or 1. - // We don't want to realloc of 0 size. - int32_t newCapacity = len + (len == 0); - UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity); + bufferCapacity = 0; + } + if (list == stackList) { + // pass + } else if (len <= INITIAL_CAPACITY) { + uprv_memcpy(stackList, list, len * sizeof(UChar32)); + uprv_free(list); + list = stackList; + capacity = INITIAL_CAPACITY; + } else if ((len + 7) < capacity) { + // If we have more than a little unused capacity, shrink it to len. + UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * len); if (temp) { list = temp; - capacity = newCapacity; + capacity = len; } // else what the heck happened?! We allocated less memory! // Oh well. We'll keep our original array. } + if (strings != nullptr && strings->isEmpty()) { + delete strings; + strings = nullptr; + } return *this; } +#ifdef DEBUG_SERIALIZE +#include +#endif + +/** + * Deserialize constructor. + */ +UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, + UErrorCode &ec) { + + if(U_FAILURE(ec)) { + setToBogus(); + return; + } + + if( (serialization != kSerialized) + || (data==NULL) + || (dataLen < 1)) { + ec = U_ILLEGAL_ARGUMENT_ERROR; + setToBogus(); + return; + } + + // bmp? + int32_t headerSize = ((data[0]&0x8000)) ?2:1; + int32_t bmpLength = (headerSize==1)?data[0]:data[1]; + + int32_t newLength = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength; +#ifdef DEBUG_SERIALIZE + printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,newLength, data[0],data[1],data[2],data[3]); +#endif + if(!ensureCapacity(newLength + 1)) { // +1 for HIGH + return; + } + // copy bmp + int32_t i; + for(i = 0; i< bmpLength;i++) { + list[i] = data[i+headerSize]; +#ifdef DEBUG_SERIALIZE + printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]); +#endif + } + // copy smp + for(i=bmpLength;ilist[bmpLength]<=0xffff; ++bmpLength) {} length=bmpLength+2*(length-bmpLength); } - +#ifdef DEBUG_SERIALIZE + printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len); +#endif /* length: number of 16-bit array units */ if (length>0x7fff) { /* there are only 15 bits for the length in the first serialized word */ @@ -1523,6 +1576,9 @@ int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& const UChar32 *p; int32_t i; +#ifdef DEBUG_SERIALIZE + printf("writeHdr\n"); +#endif *dest=(uint16_t)length; if (length>bmpLength) { *dest|=0x8000; @@ -1533,11 +1589,17 @@ int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& /* write the BMP part of the array */ p=this->list; for (i=0; i>16); *dest++=(uint16_t)*p++; } @@ -1558,7 +1620,7 @@ UBool UnicodeSet::allocateStrings(UErrorCode &status) { if (U_FAILURE(status)) { return FALSE; } - strings = new UVector(uhash_deleteUnicodeString, + strings = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, 1, status); if (strings == NULL) { // Check for memory allocation error. status = U_MEMORY_ALLOCATION_ERROR; @@ -1572,32 +1634,65 @@ UBool UnicodeSet::allocateStrings(UErrorCode &status) { return TRUE; } -void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) { - if (newLen <= capacity) - return; - UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA)); +int32_t UnicodeSet::nextCapacity(int32_t minCapacity) { + // Grow exponentially to reduce the frequency of allocations. + if (minCapacity < INITIAL_CAPACITY) { + return minCapacity + INITIAL_CAPACITY; + } else if (minCapacity <= 2500) { + return 5 * minCapacity; + } else { + int32_t newCapacity = 2 * minCapacity; + if (newCapacity > MAX_LENGTH) { + newCapacity = MAX_LENGTH; + } + return newCapacity; + } +} + +bool UnicodeSet::ensureCapacity(int32_t newLen) { + if (newLen > MAX_LENGTH) { + newLen = MAX_LENGTH; + } + if (newLen <= capacity) { + return true; + } + int32_t newCapacity = nextCapacity(newLen); + UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32)); if (temp == NULL) { - ec = U_MEMORY_ALLOCATION_ERROR; - setToBogus(); - return; + setToBogus(); // set the object to bogus state if an OOM failure occurred. + return false; + } + // Copy only the actual contents. + uprv_memcpy(temp, list, len * sizeof(UChar32)); + if (list != stackList) { + uprv_free(list); } list = temp; - capacity = newLen + GROW_EXTRA; - // else we keep the original contents on the memory failure. + capacity = newCapacity; + return true; } -void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) { - if (buffer != NULL && newLen <= bufferCapacity) - return; - UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA)); +bool UnicodeSet::ensureBufferCapacity(int32_t newLen) { + if (newLen > MAX_LENGTH) { + newLen = MAX_LENGTH; + } + if (newLen <= bufferCapacity) { + return true; + } + int32_t newCapacity = nextCapacity(newLen); + UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32)); if (temp == NULL) { - ec = U_MEMORY_ALLOCATION_ERROR; setToBogus(); - return; + return false; + } + // The buffer has no contents to be copied. + // It is always filled from scratch after this call. + if (buffer != stackList) { + uprv_free(buffer); } buffer = temp; - bufferCapacity = newLen + GROW_EXTRA; - // else we keep the original contents on the memory failure. + bufferCapacity = newCapacity; + return true; } /** @@ -1634,9 +1729,7 @@ void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t pola if (isFrozen() || isBogus()) { return; } - UErrorCode status = U_ZERO_ERROR; - ensureBufferCapacity(len + otherLen, status); - if (U_FAILURE(status)) { + if (!ensureBufferCapacity(len + otherLen)) { return; } @@ -1684,9 +1777,7 @@ void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) { if (isFrozen() || isBogus() || other==NULL) { return; } - UErrorCode status = U_ZERO_ERROR; - ensureBufferCapacity(len + otherLen, status); - if (U_FAILURE(status)) { + if (!ensureBufferCapacity(len + otherLen)) { return; } @@ -1797,9 +1888,7 @@ void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) if (isFrozen() || isBogus()) { return; } - UErrorCode status = U_ZERO_ERROR; - ensureBufferCapacity(len + otherLen, status); - if (U_FAILURE(status)) { + if (!ensureBufferCapacity(len + otherLen)) { return; } @@ -1892,7 +1981,7 @@ void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable) { UChar32 cp; - for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) { + for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) { _appendToPat(buf, cp = s.char32At(i), escapeUnprintable); } } @@ -1926,7 +2015,7 @@ escapeUnprintable) { break; default: // Escape whitespace - if (uprv_isRuleWhiteSpace(c)) { + if (PatternProps::isWhiteSpace(c)) { buf.append(BACKSLASH); } break; @@ -2045,12 +2134,14 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result, } } - for (int32_t i = 0; isize(); ++i) { - result.append(OPEN_BRACE); - _appendToPat(result, - *(const UnicodeString*) strings->elementAt(i), - escapeUnprintable); - result.append(CLOSE_BRACE); + if (strings != nullptr) { + for (int32_t i = 0; isize(); ++i) { + result.append(OPEN_BRACE); + _appendToPat(result, + *(const UnicodeString*) strings->elementAt(i), + escapeUnprintable); + result.append(CLOSE_BRACE); + } } return result.append(SET_CLOSE); } @@ -2069,13 +2160,12 @@ void UnicodeSet::releasePattern() { /** * Set the new pattern to cache. */ -void UnicodeSet::setPattern(const UnicodeString& newPat) { +void UnicodeSet::setPattern(const char16_t *newPat, int32_t newPatLen) { releasePattern(); - int32_t newPatLen = newPat.length(); pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar)); if (pat) { patLen = newPatLen; - newPat.extractBetween(0, patLen, pat); + u_memcpy(pat, newPat, patLen); pat[patLen] = 0; } // else we don't care if malloc failed. This was just a nice cache. @@ -2084,30 +2174,15 @@ void UnicodeSet::setPattern(const UnicodeString& newPat) { UnicodeFunctor *UnicodeSet::freeze() { if(!isFrozen() && !isBogus()) { - // Do most of what compact() does before freezing because - // compact() will not work when the set is frozen. - // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA). - - // Delete buffer first to defragment memory less. - if (buffer != NULL) { - uprv_free(buffer); - buffer = NULL; - } - if (capacity > (len + GROW_EXTRA)) { - // Make the capacity equal to len or 1. - // We don't want to realloc of 0 size. - capacity = len + (len == 0); - list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity); - if (list == NULL) { // Check for memory allocation error. - setToBogus(); - return this; - } - } + compact(); // Optimize contains() and span() and similar functions. - if (!strings->isEmpty()) { + if (hasStrings()) { stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL); - if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) { + if (stringSpan == nullptr) { + setToBogus(); + return this; + } else if (!stringSpan->needsStringSpanUTF16()) { // All strings are irrelevant for span() etc. because // all of each string's code points are contained in this set. // Do not check needsStringSpanUTF8() because UTF-8 has at most as @@ -2140,7 +2215,7 @@ int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanC } if(stringSpan!=NULL) { return stringSpan->span(s, length, spanCondition); - } else if(!strings->isEmpty()) { + } else if(hasStrings()) { uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED : UnicodeSetStringSpan::FWD_UTF16_CONTAINED; @@ -2177,7 +2252,7 @@ int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition s } if(stringSpan!=NULL) { return stringSpan->spanBack(s, length, spanCondition); - } else if(!strings->isEmpty()) { + } else if(hasStrings()) { uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED : UnicodeSetStringSpan::BACK_UTF16_CONTAINED; @@ -2215,7 +2290,7 @@ int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition sp } if(stringSpan!=NULL) { return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition); - } else if(!strings->isEmpty()) { + } else if(hasStrings()) { uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED : UnicodeSetStringSpan::FWD_UTF8_CONTAINED; @@ -2232,10 +2307,7 @@ int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition sp UChar32 c; int32_t start=0, prev=0; do { - U8_NEXT(s, start, length, c); - if(c<0) { - c=0xfffd; - } + U8_NEXT_OR_FFFD(s, start, length, c); if(spanCondition!=contains(c)) { break; } @@ -2256,7 +2328,7 @@ int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanConditio } if(stringSpan!=NULL) { return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition); - } else if(!strings->isEmpty()) { + } else if(hasStrings()) { uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED : UnicodeSetStringSpan::BACK_UTF8_CONTAINED; @@ -2273,10 +2345,7 @@ int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanConditio UChar32 c; int32_t prev=length; do { - U8_PREV(s, 0, length, c); - if(c<0) { - c=0xfffd; - } + U8_PREV_OR_FFFD(s, 0, length, c); if(spanCondition!=contains(c)) { break; }