X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/374ca955a76ecab1204ca8bfa63ff9238d998416..3d1f044b704633e2e541231cd17ae9ecf9ad5c7a:/icuSources/common/uniset.cpp diff --git a/icuSources/common/uniset.cpp b/icuSources/common/uniset.cpp index 07794e70..20242776 100644 --- a/icuSources/common/uniset.cpp +++ b/icuSources/common/uniset.cpp @@ -1,6 +1,8 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** -* Copyright (C) 1999-2004, International Business Machines +* Copyright (C) 1999-2015, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description @@ -9,19 +11,24 @@ */ #include "unicode/utypes.h" -#include "unicode/uniset.h" #include "unicode/parsepos.h" #include "unicode/symtable.h" +#include "unicode/uniset.h" +#include "unicode/ustring.h" +#include "unicode/utf8.h" +#include "unicode/utf16.h" #include "ruleiter.h" #include "cmemory.h" -#include "uhash.h" +#include "cstring.h" +#include "patternprops.h" +#include "uelement.h" #include "util.h" #include "uvector.h" #include "charstr.h" #include "ustrfmt.h" -#include "mutex.h" #include "uassert.h" -#include "hash.h" +#include "bmpset.h" +#include "unisetspan.h" // Define UChar constants using hex for EBCDIC compatibility // Used #define to reduce private static exports and memory access time. @@ -47,26 +54,13 @@ // LOW <= all valid values. ZERO for codepoints #define UNICODESET_LOW 0x000000 -// initial storage. Must be >= 0 -#define START_EXTRA 16 - -// extra amount for growth. Must be >= 0 -#define GROW_EXTRA START_EXTRA +/** Max list [0, 1, 2, ..., max code point, HIGH] */ +constexpr int32_t MAX_LENGTH = UNICODESET_HIGH + 1; U_NAMESPACE_BEGIN SymbolTable::~SymbolTable() {} -/** - * Minimum value that can be stored in a UnicodeSet. - */ -const UChar32 UnicodeSet::MIN_VALUE = UNICODESET_LOW; - -/** - * Maximum value that can be stored in a UnicodeSet. - */ -const UChar32 UnicodeSet::MAX_VALUE = UNICODESET_HIGH - 1; - UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet) /** @@ -131,16 +125,28 @@ static inline void _dbgdt(UnicodeSet* set) { // UnicodeString in UVector support //---------------------------------------------------------------- -static void U_CALLCONV cloneUnicodeString(UHashTok *dst, UHashTok *src) { +static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) { dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer); } -static int8_t U_CALLCONV compareUnicodeString(UHashTok t1, UHashTok t2) { +static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { const UnicodeString &a = *(const UnicodeString*)t1.pointer; const UnicodeString &b = *(const UnicodeString*)t2.pointer; return a.compare(b); } +UBool UnicodeSet::hasStrings() const { + return strings != nullptr && !strings->isEmpty(); +} + +int32_t UnicodeSet::stringsSize() const { + return strings == nullptr ? 0 : strings->size(); +} + +UBool UnicodeSet::stringsContains(const UnicodeString &s) const { + return strings != nullptr && strings->contains((void*) &s); +} + //---------------------------------------------------------------- // Constructors &c //---------------------------------------------------------------- @@ -148,15 +154,8 @@ static int8_t U_CALLCONV compareUnicodeString(UHashTok t1, UHashTok t2) { /** * Constructs an empty set. */ -UnicodeSet::UnicodeSet() : - len(1), capacity(1 + START_EXTRA), bufferCapacity(0), - list(0), buffer(0), strings(0) -{ - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); - if(list!=NULL){ - list[0] = UNICODESET_HIGH; - } - allocateStrings(); +UnicodeSet::UnicodeSet() { + list[0] = UNICODESET_HIGH; _dbgct(this); } @@ -167,60 +166,126 @@ UnicodeSet::UnicodeSet() : * @param start first character, inclusive, of range * @param end last character, inclusive, of range */ -UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) : - len(1), capacity(1 + START_EXTRA), bufferCapacity(0), - list(0), buffer(0), strings(0) -{ - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); - if(list!=NULL){ - list[0] = UNICODESET_HIGH; - } - allocateStrings(); - complement(start, end); +UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) { + list[0] = UNICODESET_HIGH; + add(start, end); _dbgct(this); } /** * Constructs a set that is identical to the given UnicodeSet. */ -UnicodeSet::UnicodeSet(const UnicodeSet& o) : - UnicodeFilter(o), - len(0), capacity(o.len + GROW_EXTRA), bufferCapacity(0), - list(0), buffer(0), strings(0) -{ - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); - if(list!=NULL){ - allocateStrings(); - *this = o; - } +UnicodeSet::UnicodeSet(const UnicodeSet& o) : UnicodeFilter(o) { + *this = o; _dbgct(this); } +// Copy-construct as thawed. +UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : UnicodeFilter(o) { + if (ensureCapacity(o.len)) { + // *this = o except for bmpSet and stringSpan + len = o.len; + uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32)); + if (o.hasStrings()) { + UErrorCode status = U_ZERO_ERROR; + if (!allocateStrings(status) || + (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) { + setToBogus(); + return; + } + } + if (o.pat) { + setPattern(o.pat, o.patLen); + } + _dbgct(this); + } +} + /** * Destructs the set. */ UnicodeSet::~UnicodeSet() { _dbgdt(this); // first! - uprv_free(list); - if (buffer) { + if (list != stackList) { + uprv_free(list); + } + delete bmpSet; + if (buffer != stackList) { uprv_free(buffer); } delete strings; + delete stringSpan; + releasePattern(); } /** * Assigns this object to be a copy of another. */ UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) { - ensureCapacity(o.len); + return copyFrom(o, FALSE); +} + +UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) { + if (this == &o) { + return *this; + } + if (isFrozen()) { + return *this; + } + if (o.isBogus()) { + setToBogus(); + return *this; + } + if (!ensureCapacity(o.len)) { + // ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens. + return *this; + } len = o.len; - uprv_memcpy(list, o.list, len*sizeof(UChar32)); - UErrorCode ec = U_ZERO_ERROR; - strings->assign(*o.strings, cloneUnicodeString, ec); - pat = o.pat; + uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32)); + if (o.bmpSet != nullptr && !asThawed) { + bmpSet = new BMPSet(*o.bmpSet, list, len); + if (bmpSet == NULL) { // Check for memory allocation error. + setToBogus(); + return *this; + } + } + if (o.hasStrings()) { + UErrorCode status = U_ZERO_ERROR; + if ((strings == nullptr && !allocateStrings(status)) || + (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) { + setToBogus(); + return *this; + } + } else if (hasStrings()) { + strings->removeAllElements(); + } + if (o.stringSpan != nullptr && !asThawed) { + stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings); + if (stringSpan == NULL) { // Check for memory allocation error. + setToBogus(); + return *this; + } + } + releasePattern(); + if (o.pat) { + setPattern(o.pat, o.patLen); + } return *this; } +/** + * Returns a copy of this object. All UnicodeMatcher objects have + * to support cloning in order to allow classes using + * UnicodeMatchers, such as Transliterator, to implement cloning. + */ +UnicodeFunctor* UnicodeSet::clone() const { + return new UnicodeSet(*this); +} + +UnicodeFunctor *UnicodeSet::cloneAsThawed() const { + return new UnicodeSet(*this, TRUE); +} + /** * Compares the specified object with this set for equality. Returns * true if the two sets @@ -236,19 +301,11 @@ UBool UnicodeSet::operator==(const UnicodeSet& o) const { for (int32_t i = 0; i < len; ++i) { if (list[i] != o.list[i]) return FALSE; } - if (*strings != *o.strings) return FALSE; + if (hasStrings() != o.hasStrings()) { return FALSE; } + if (hasStrings() && *strings != *o.strings) return FALSE; return TRUE; } -/** - * Returns a copy of this object. All UnicodeMatcher objects have - * to support cloning in order to allow classes using - * UnicodeMatchers, such as Transliterator, to implement cloning. - */ -UnicodeFunctor* UnicodeSet::clone() const { - return new UnicodeSet(*this); -} - /** * Returns the hash code value for this set. * @@ -256,32 +313,18 @@ UnicodeFunctor* UnicodeSet::clone() const { * @see Object#hashCode() */ int32_t UnicodeSet::hashCode(void) const { - int32_t result = len; + uint32_t result = static_cast(len); for (int32_t i = 0; i < len; ++i) { - result *= 1000003; + result *= 1000003u; result += list[i]; } - return result; + return static_cast(result); } //---------------------------------------------------------------- // Public API //---------------------------------------------------------------- -/** - * Make this object represent the range start - end. - * If end > start then this object is set to an - * an empty range. - * - * @param start first character in the set, inclusive - * @rparam end last character in the set, inclusive - */ -UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) { - clear(); - complement(start, end); - return *this; -} - /** * Returns the number of elements in this set (its cardinality), * Note than the elements of a set may include both individual @@ -295,7 +338,7 @@ int32_t UnicodeSet::size(void) const { for (int32_t i = 0; i < count; ++i) { n += getRangeEnd(i) - getRangeStart(i) + 1; } - return n + strings->size(); + return n + stringsSize(); } /** @@ -304,7 +347,7 @@ int32_t UnicodeSet::size(void) const { * @return true if this set contains no elements. */ UBool UnicodeSet::isEmpty(void) const { - return len == 1 && strings->size() == 0; + return len == 1 && !hasStrings(); } /** @@ -320,11 +363,17 @@ UBool UnicodeSet::contains(UChar32 c) const { //for (;;) { // if (c < list[++i]) break; //} + if (bmpSet != NULL) { + return bmpSet->contains(c); + } + if (stringSpan != NULL) { + return stringSpan->contains(c); + } if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound return FALSE; } int32_t i = findCodePoint(c); - return ((i & 1) != 0); // return true if odd + return (UBool)(i & 1); // return true if odd } /** @@ -349,24 +398,27 @@ int32_t UnicodeSet::findCodePoint(UChar32 c) const { // Return the smallest i such that c < list[i]. Assume // list[len - 1] == HIGH and that c is legal (0..HIGH-1). - if (c < list[0]) return 0; + if (c < list[0]) + return 0; // High runner test. c is often after the last range, so an // initial check for this condition pays off. - if (len >= 2 && c >= list[len-2]) return len-1; int32_t lo = 0; int32_t hi = len - 1; + if (lo >= hi || c >= list[hi-1]) + return hi; // invariant: c >= list[lo] // invariant: c < list[hi] for (;;) { int32_t i = (lo + hi) >> 1; - if (i == lo) return hi; - if (c < list[i]) { + if (i == lo) { + break; // Found! + } else if (c < list[i]) { hi = i; } else { lo = i; } } - return 0; // To make compiler happy; never reached + return hi; } /** @@ -395,7 +447,7 @@ UBool UnicodeSet::contains(const UnicodeString& s) const { if (s.length() == 0) return FALSE; int32_t cp = getSingleCP(s); if (cp < 0) { - return strings->contains((void*) &s); + return stringsContains(s); } else { return contains((UChar32) cp); } @@ -417,8 +469,7 @@ UBool UnicodeSet::containsAll(const UnicodeSet& c) const { return FALSE; } } - if (!strings->containsAll(*c.strings)) return FALSE; - return TRUE; + return !c.hasStrings() || (strings != nullptr && strings->containsAll(*c.strings)); } /** @@ -428,12 +479,8 @@ UBool UnicodeSet::containsAll(const UnicodeSet& c) const { * @return true if the test condition is met */ UBool UnicodeSet::containsAll(const UnicodeString& s) const { - UChar32 cp; - for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) { - cp = s.char32At(i); - if (!contains(cp)) return FALSE; - } - return TRUE; + return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) == + s.length()); } /** @@ -468,8 +515,7 @@ UBool UnicodeSet::containsNone(const UnicodeSet& c) const { return FALSE; } } - if (!strings->containsNone(*c.strings)) return FALSE; - return TRUE; + return strings == nullptr || !c.hasStrings() || strings->containsNone(*c.strings); } /** @@ -479,12 +525,8 @@ UBool UnicodeSet::containsNone(const UnicodeSet& c) const { * @return true if the test condition is met */ UBool UnicodeSet::containsNone(const UnicodeString& s) const { - UChar32 cp; - for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) { - cp = s.char32At(i); - if (contains(cp)) return FALSE; - } - return TRUE; + return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) == + s.length()); } /** @@ -502,7 +544,8 @@ UBool UnicodeSet::matchesIndexValue(uint8_t v) const { * time zone month containment logic.) */ int32_t i; - for (i=0; isize() != 0) { + if (hasStrings()) { for (i=0; isize(); ++i) { const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i); //if (s.length() == 0) { @@ -548,7 +591,7 @@ UMatchDegree UnicodeSet::matches(const Replaceable& text, return U_MISMATCH; } } else { - if (strings->size() != 0) { // try strings first + if (hasStrings()) { // try strings first // might separate forward and backward loops later // for now they are combined @@ -722,6 +765,20 @@ UChar32 UnicodeSet::charAt(int32_t index) const { return (UChar32)-1; } +/** + * Make this object represent the range start - end. + * If end > start then this object is set to an + * an empty range. + * + * @param start first character in the set, inclusive + * @rparam end last character in the set, inclusive + */ +UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) { + clear(); + complement(start, end); + return *this; +} + /** * Adds the specified range to this set if it is not already * present. If this set already contains the specified range, @@ -735,7 +792,39 @@ UChar32 UnicodeSet::charAt(int32_t index) const { */ UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) { if (pinCodePoint(start) < pinCodePoint(end)) { - UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; + UChar32 limit = end + 1; + // Fast path for adding a new range after the last one. + // Odd list length: [..., lastStart, lastLimit, HIGH] + if ((len & 1) != 0) { + // If the list is empty, set lastLimit low enough to not be adjacent to 0. + UChar32 lastLimit = len == 1 ? -2 : list[len - 2]; + if (lastLimit <= start && !isFrozen() && !isBogus()) { + if (lastLimit == start) { + // Extend the last range. + list[len - 2] = limit; + if (limit == UNICODESET_HIGH) { + --len; + } + } else { + list[len - 1] = start; + if (limit < UNICODESET_HIGH) { + if (ensureCapacity(len + 2)) { + list[len++] = limit; + list[len++] = UNICODESET_HIGH; + } + } else { // limit == UNICODESET_HIGH + if (ensureCapacity(len + 1)) { + list[len++] = UNICODESET_HIGH; + } + } + } + releasePattern(); + return *this; + } + } + // This is slow. Could be much faster using findCodePoint(start) + // and modifying the list, dealing with adjacent & overlapping ranges. + UChar32 range[3] = { start, limit, UNICODESET_HIGH }; add(range, 2, 0); } else if (start == end) { add(start); @@ -776,7 +865,7 @@ UnicodeSet& UnicodeSet::add(UChar32 c) { int32_t i = findCodePoint(pinCodePoint(c)); // already in set? - if ((i & 1) != 0) return *this; + if ((i & 1) != 0 || isFrozen() || isBogus()) return *this; // HIGH is 0x110000 // assert(list[len-1] == HIGH); @@ -804,7 +893,10 @@ UnicodeSet& UnicodeSet::add(UChar32 c) { list[i] = c; // if we touched the HIGH mark, then add a new one if (c == (UNICODESET_HIGH - 1)) { - ensureCapacity(len+1); + if (!ensureCapacity(len+1)) { + // ensureCapacity will mark the object as Bogus if OOM failure happens. + return *this; + } list[len++] = UNICODESET_HIGH; } if (i > 0 && c == list[i-1]) { @@ -845,16 +937,13 @@ UnicodeSet& UnicodeSet::add(UChar32 c) { // ^ // list[i] - ensureCapacity(len+2); - - //for (int32_t k=len-1; k>=i; --k) { - // list[k+2] = list[k]; - //} - UChar32* src = list + len; - UChar32* dst = src + 2; - UChar32* srclimit = list + i; - while (src > srclimit) *(--dst) = *(--src); + if (!ensureCapacity(len+2)) { + // ensureCapacity will mark the object as Bogus if OOM failure happens. + return *this; + } + UChar32 *p = list + i; + uprv_memmove(p + 2, p, (len - i) * sizeof(*p)); list[i] = c; list[i+1] = c+1; len += 2; @@ -873,7 +962,7 @@ UnicodeSet& UnicodeSet::add(UChar32 c) { } #endif - pat.truncate(0); + releasePattern(); return *this; } @@ -887,15 +976,15 @@ UnicodeSet& UnicodeSet::add(UChar32 c) { * @return the modified set, for chaining */ UnicodeSet& UnicodeSet::add(const UnicodeString& s) { - if (s.length() == 0) return *this; + if (s.length() == 0 || isFrozen() || isBogus()) return *this; int32_t cp = getSingleCP(s); if (cp < 0) { - if (!strings->contains((void*) &s)) { + if (!stringsContains(s)) { _add(s); - pat.truncate(0); + releasePattern(); } } else { - add((UChar32)cp, (UChar32)cp); + add((UChar32)cp); } return *this; } @@ -906,9 +995,24 @@ UnicodeSet& UnicodeSet::add(const UnicodeString& s) { * already be in 'strings'. */ void UnicodeSet::_add(const UnicodeString& s) { - UnicodeString* t = new UnicodeString(s); + if (isFrozen() || isBogus()) { + return; + } UErrorCode ec = U_ZERO_ERROR; + if (strings == nullptr && !allocateStrings(ec)) { + setToBogus(); + return; + } + UnicodeString* t = new UnicodeString(s); + if (t == NULL) { // Check for memory allocation error. + setToBogus(); + return; + } strings->sortedInsert(t, compareUnicodeString, ec); + if (U_FAILURE(ec)) { + setToBogus(); + delete t; + } } /** @@ -939,9 +1043,9 @@ int32_t UnicodeSet::getSingleCP(const UnicodeString& s) { */ UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) { UChar32 cp; - for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) { + for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) { cp = s.char32At(i); - add(cp, cp); + add(cp); } return *this; } @@ -985,6 +1089,15 @@ UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) { return *this; } +UnicodeSet& UnicodeSet::removeAllStrings() { + if (!isFrozen() && hasStrings()) { + strings->removeAllElements(); + releasePattern(); + } + return *this; +} + + /** * Makes a set from a multicharacter string. Thus "ch" => {"ch"} *
Warning: you cannot add an empty string ("") to a UnicodeSet. @@ -993,7 +1106,9 @@ UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) { */ UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) { UnicodeSet *set = new UnicodeSet(); - set->add(s); + if (set != NULL) { // Check for memory allocation error. + set->add(s); + } return set; } @@ -1005,7 +1120,9 @@ UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) { */ UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) { UnicodeSet *set = new UnicodeSet(); - set->addAll(s); + if (set != NULL) { // Check for memory allocation error. + set->addAll(s); + } return set; } @@ -1069,11 +1186,12 @@ UnicodeSet& UnicodeSet::remove(UChar32 c) { * @return the modified set, for chaining */ UnicodeSet& UnicodeSet::remove(const UnicodeString& s) { - if (s.length() == 0) return *this; + if (s.length() == 0 || isFrozen() || isBogus()) return *this; int32_t cp = getSingleCP(s); if (cp < 0) { - strings->removeElement((void*) &s); - pat.truncate(0); + if (strings != nullptr && strings->removeElement((void*) &s)) { + releasePattern(); + } } else { remove((UChar32)cp, (UChar32)cp); } @@ -1092,11 +1210,14 @@ UnicodeSet& UnicodeSet::remove(const UnicodeString& s) { * from this set. */ UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) { + if (isFrozen() || isBogus()) { + return *this; + } if (pinCodePoint(start) <= pinCodePoint(end)) { UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; exclusiveOr(range, 2, 0); } - pat.truncate(0); + releasePattern(); return *this; } @@ -1109,18 +1230,21 @@ UnicodeSet& UnicodeSet::complement(UChar32 c) { * complement(MIN_VALUE, MAX_VALUE). */ UnicodeSet& UnicodeSet::complement(void) { + if (isFrozen() || isBogus()) { + return *this; + } if (list[0] == UNICODESET_LOW) { - ensureBufferCapacity(len-1); - uprv_memcpy(buffer, list + 1, (len-1)*sizeof(UChar32)); + uprv_memmove(list, list + 1, (size_t)(len-1)*sizeof(UChar32)); --len; } else { - ensureBufferCapacity(len+1); - uprv_memcpy(buffer + 1, list, len*sizeof(UChar32)); - buffer[0] = UNICODESET_LOW; + if (!ensureCapacity(len+1)) { + return *this; + } + uprv_memmove(list + 1, list, (size_t)len*sizeof(UChar32)); + list[0] = UNICODESET_LOW; ++len; } - swapBuffers(); - pat.truncate(0); + releasePattern(); return *this; } @@ -1133,15 +1257,15 @@ UnicodeSet& UnicodeSet::complement(void) { * @return this object, for chaining */ UnicodeSet& UnicodeSet::complement(const UnicodeString& s) { - if (s.length() == 0) return *this; + if (s.length() == 0 || isFrozen() || isBogus()) return *this; int32_t cp = getSingleCP(s); if (cp < 0) { - if (strings->contains((void*) &s)) { + if (stringsContains(s)) { strings->removeElement((void*) &s); } else { _add(s); } - pat.truncate(0); + releasePattern(); } else { complement((UChar32)cp, (UChar32)cp); } @@ -1159,13 +1283,17 @@ UnicodeSet& UnicodeSet::complement(const UnicodeString& s) { * @see #add(char, char) */ UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) { - add(c.list, c.len, 0); + if ( c.len>0 && c.list!=NULL ) { + add(c.list, c.len, 0); + } // Add strings in order - for (int32_t i=0; isize(); ++i) { - const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i); - if (!strings->contains((void*) s)) { - _add(*s); + if ( c.strings!=NULL ) { + for (int32_t i=0; isize(); ++i) { + const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i); + if (!stringsContains(*s)) { + _add(*s); + } } } return *this; @@ -1181,8 +1309,17 @@ UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) { * @param c set that defines which elements this set will retain. */ UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) { + if (isFrozen() || isBogus()) { + return *this; + } retain(c.list, c.len, 0); - strings->retainAll(*c.strings); + if (hasStrings()) { + if (!c.hasStrings()) { + strings->removeAllElements(); + } else { + strings->retainAll(*c.strings); + } + } return *this; } @@ -1196,8 +1333,13 @@ UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) { * this set. */ UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) { + if (isFrozen() || isBogus()) { + return *this; + } retain(c.list, c.len, 2); - strings->removeAll(*c.strings); + if (hasStrings() && c.hasStrings()) { + strings->removeAll(*c.strings); + } return *this; } @@ -1210,12 +1352,17 @@ UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) { * this set. */ UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) { + if (isFrozen() || isBogus()) { + return *this; + } exclusiveOr(c.list, c.len, 0); - for (int32_t i=0; isize(); ++i) { - void* e = c.strings->elementAt(i); - if (!strings->removeElement(e)) { - _add(*(const UnicodeString*)e); + if (c.strings != nullptr) { + for (int32_t i=0; isize(); ++i) { + void* e = c.strings->elementAt(i); + if (strings == nullptr || !strings->removeElement(e)) { + _add(*(const UnicodeString*)e); + } } } return *this; @@ -1226,10 +1373,17 @@ UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) { * empty after this call returns. */ UnicodeSet& UnicodeSet::clear(void) { + if (isFrozen()) { + return *this; + } list[0] = UNICODESET_HIGH; len = 1; - pat.truncate(0); - strings->removeAllElements(); + releasePattern(); + if (strings != NULL) { + strings->removeAllElements(); + } + // Remove bogus + fFlags = 0; return *this; } @@ -1263,10 +1417,6 @@ UChar32 UnicodeSet::getRangeEnd(int32_t index) const { return list[index*2 + 1] - 1; } -int32_t UnicodeSet::getStringCount() const { - return strings->size(); -} - const UnicodeString* UnicodeSet::getString(int32_t index) const { return (const UnicodeString*) strings->elementAt(index); } @@ -1276,18 +1426,97 @@ const UnicodeString* UnicodeSet::getString(int32_t index) const { * possible space, without changing this object's value. */ UnicodeSet& UnicodeSet::compact() { - if (len != capacity) { - capacity = len; - UChar32* temp = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); - uprv_memcpy(temp, list, len*sizeof(UChar32)); + if (isFrozen() || isBogus()) { + return *this; + } + // Delete buffer first to defragment memory less. + if (buffer != stackList) { + uprv_free(buffer); + buffer = NULL; + bufferCapacity = 0; + } + if (list == stackList) { + // pass + } else if (len <= INITIAL_CAPACITY) { + uprv_memcpy(stackList, list, len * sizeof(UChar32)); uprv_free(list); - list = temp; + list = stackList; + capacity = INITIAL_CAPACITY; + } else if ((len + 7) < capacity) { + // If we have more than a little unused capacity, shrink it to len. + UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * len); + if (temp) { + list = temp; + capacity = len; + } + // else what the heck happened?! We allocated less memory! + // Oh well. We'll keep our original array. + } + if (strings != nullptr && strings->isEmpty()) { + delete strings; + strings = nullptr; } - uprv_free(buffer); - buffer = NULL; return *this; } +#ifdef DEBUG_SERIALIZE +#include +#endif + +/** + * Deserialize constructor. + */ +UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, + UErrorCode &ec) { + + if(U_FAILURE(ec)) { + setToBogus(); + return; + } + + if( (serialization != kSerialized) + || (data==NULL) + || (dataLen < 1)) { + ec = U_ILLEGAL_ARGUMENT_ERROR; + setToBogus(); + return; + } + + // bmp? + int32_t headerSize = ((data[0]&0x8000)) ?2:1; + int32_t bmpLength = (headerSize==1)?data[0]:data[1]; + + int32_t newLength = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength; +#ifdef DEBUG_SERIALIZE + printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,newLength, data[0],data[1],data[2],data[3]); +#endif + if(!ensureCapacity(newLength + 1)) { // +1 for HIGH + return; + } + // copy bmp + int32_t i; + for(i = 0; i< bmpLength;i++) { + list[i] = data[i+headerSize]; +#ifdef DEBUG_SERIALIZE + printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]); +#endif + } + // copy smp + for(i=bmpLength;ilist[bmpLength]<=0xffff; ++bmpLength) {} length=bmpLength+2*(length-bmpLength); } - +#ifdef DEBUG_SERIALIZE + printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len); +#endif /* length: number of 16-bit array units */ if (length>0x7fff) { /* there are only 15 bits for the length in the first serialized word */ @@ -1345,6 +1576,9 @@ int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& const UChar32 *p; int32_t i; +#ifdef DEBUG_SERIALIZE + printf("writeHdr\n"); +#endif *dest=(uint16_t)length; if (length>bmpLength) { *dest|=0x8000; @@ -1355,11 +1589,17 @@ int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& /* write the BMP part of the array */ p=this->list; for (i=0; i>16); *dest++=(uint16_t)*p++; } @@ -1376,36 +1616,83 @@ int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& /** * Allocate our strings vector and return TRUE if successful. */ -UBool UnicodeSet::allocateStrings() { - UErrorCode ec = U_ZERO_ERROR; - strings = new UVector(uhash_deleteUnicodeString, - uhash_compareUnicodeString, ec); - if (U_FAILURE(ec)) { +UBool UnicodeSet::allocateStrings(UErrorCode &status) { + if (U_FAILURE(status)) { + return FALSE; + } + strings = new UVector(uprv_deleteUObject, + uhash_compareUnicodeString, 1, status); + if (strings == NULL) { // Check for memory allocation error. + status = U_MEMORY_ALLOCATION_ERROR; + return FALSE; + } + if (U_FAILURE(status)) { delete strings; strings = NULL; return FALSE; - } + } return TRUE; } -void UnicodeSet::ensureCapacity(int32_t newLen) { - if (newLen <= capacity) - return; - capacity = newLen + GROW_EXTRA; - UChar32* temp = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); - uprv_memcpy(temp, list, len*sizeof(UChar32)); - uprv_free(list); +int32_t UnicodeSet::nextCapacity(int32_t minCapacity) { + // Grow exponentially to reduce the frequency of allocations. + if (minCapacity < INITIAL_CAPACITY) { + return minCapacity + INITIAL_CAPACITY; + } else if (minCapacity <= 2500) { + return 5 * minCapacity; + } else { + int32_t newCapacity = 2 * minCapacity; + if (newCapacity > MAX_LENGTH) { + newCapacity = MAX_LENGTH; + } + return newCapacity; + } +} + +bool UnicodeSet::ensureCapacity(int32_t newLen) { + if (newLen > MAX_LENGTH) { + newLen = MAX_LENGTH; + } + if (newLen <= capacity) { + return true; + } + int32_t newCapacity = nextCapacity(newLen); + UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32)); + if (temp == NULL) { + setToBogus(); // set the object to bogus state if an OOM failure occurred. + return false; + } + // Copy only the actual contents. + uprv_memcpy(temp, list, len * sizeof(UChar32)); + if (list != stackList) { + uprv_free(list); + } list = temp; + capacity = newCapacity; + return true; } -void UnicodeSet::ensureBufferCapacity(int32_t newLen) { - if (buffer != NULL && newLen <= bufferCapacity) - return; - if (buffer) { +bool UnicodeSet::ensureBufferCapacity(int32_t newLen) { + if (newLen > MAX_LENGTH) { + newLen = MAX_LENGTH; + } + if (newLen <= bufferCapacity) { + return true; + } + int32_t newCapacity = nextCapacity(newLen); + UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32)); + if (temp == NULL) { + setToBogus(); + return false; + } + // The buffer has no contents to be copied. + // It is always filled from scratch after this call. + if (buffer != stackList) { uprv_free(buffer); } - bufferCapacity = newLen + GROW_EXTRA; - buffer = (UChar32*) uprv_malloc(sizeof(UChar32) * bufferCapacity); + buffer = temp; + bufferCapacity = newCapacity; + return true; } /** @@ -1422,6 +1709,11 @@ void UnicodeSet::swapBuffers(void) { bufferCapacity = c; } +void UnicodeSet::setToBogus() { + clear(); // Remove everything in the set. + fFlags = kIsBogus; +} + //---------------------------------------------------------------- // Implementation: Fundamental operators //---------------------------------------------------------------- @@ -1434,7 +1726,13 @@ static inline UChar32 max(UChar32 a, UChar32 b) { // polarity = 1, 2: x xor ~y == x === y void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) { - ensureBufferCapacity(len + otherLen); + if (isFrozen() || isBogus()) { + return; + } + if (!ensureBufferCapacity(len + otherLen)) { + return; + } + int32_t i = 0, j = 0, k = 0; UChar32 a = list[i++]; UChar32 b; @@ -1467,7 +1765,7 @@ void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t pola } } swapBuffers(); - pat.truncate(0); + releasePattern(); } // polarity = 0 is normal: x union y @@ -1476,7 +1774,13 @@ void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t pola // polarity = 3: ~x union ~y void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) { - ensureBufferCapacity(len + otherLen); + if (isFrozen() || isBogus() || other==NULL) { + return; + } + if (!ensureBufferCapacity(len + otherLen)) { + return; + } + int32_t i = 0, j = 0, k = 0; UChar32 a = list[i++]; UChar32 b = other[j++]; @@ -1572,7 +1876,7 @@ void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) { buffer[k++] = UNICODESET_HIGH; // terminate len = k; swapBuffers(); - pat.truncate(0); + releasePattern(); } // polarity = 0 is normal: x intersect y @@ -1581,7 +1885,13 @@ void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) { // polarity = 3: ~x intersect ~y void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) { - ensureBufferCapacity(len + otherLen); + if (isFrozen() || isBogus()) { + return; + } + if (!ensureBufferCapacity(len + otherLen)) { + return; + } + int32_t i = 0, j = 0, k = 0; UChar32 a = list[i++]; UChar32 b = other[j++]; @@ -1661,7 +1971,7 @@ void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) buffer[k++] = UNICODESET_HIGH; // terminate len = k; swapBuffers(); - pat.truncate(0); + releasePattern(); } /** @@ -1671,7 +1981,7 @@ void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable) { UChar32 cp; - for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) { + for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) { _appendToPat(buf, cp = s.char32At(i), escapeUnprintable); } } @@ -1705,7 +2015,7 @@ escapeUnprintable) { break; default: // Escape whitespace - if (uprv_isRuleWhiteSpace(c)) { + if (PatternProps::isWhiteSpace(c)) { buf.append(BACKSLASH); } break; @@ -1719,13 +2029,14 @@ escapeUnprintable) { * is one. Otherwise it will be generated. */ UnicodeString& UnicodeSet::_toPattern(UnicodeString& result, - UBool escapeUnprintable) const { - if (pat.length() > 0) { + UBool escapeUnprintable) const +{ + if (pat != NULL) { int32_t i; int32_t backslashCount = 0; - for (i=0; isize(); ++i) { - result.append(OPEN_BRACE); - _appendToPat(result, - *(const UnicodeString*) strings->elementAt(i), - escapeUnprintable); - result.append(CLOSE_BRACE); + if (strings != nullptr) { + for (int32_t i = 0; isize(); ++i) { + result.append(OPEN_BRACE); + _appendToPat(result, + *(const UnicodeString*) strings->elementAt(i), + escapeUnprintable); + result.append(CLOSE_BRACE); + } } return result.append(SET_CLOSE); } +/** +* Release existing cached pattern +*/ +void UnicodeSet::releasePattern() { + if (pat) { + uprv_free(pat); + pat = NULL; + patLen = 0; + } +} + +/** +* Set the new pattern to cache. +*/ +void UnicodeSet::setPattern(const char16_t *newPat, int32_t newPatLen) { + releasePattern(); + pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar)); + if (pat) { + patLen = newPatLen; + u_memcpy(pat, newPat, patLen); + pat[patLen] = 0; + } + // else we don't care if malloc failed. This was just a nice cache. + // We can regenerate an equivalent pattern later when requested. +} + +UnicodeFunctor *UnicodeSet::freeze() { + if(!isFrozen() && !isBogus()) { + compact(); + + // Optimize contains() and span() and similar functions. + if (hasStrings()) { + stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL); + if (stringSpan == nullptr) { + setToBogus(); + return this; + } else if (!stringSpan->needsStringSpanUTF16()) { + // All strings are irrelevant for span() etc. because + // all of each string's code points are contained in this set. + // Do not check needsStringSpanUTF8() because UTF-8 has at most as + // many relevant strings as UTF-16. + // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().) + delete stringSpan; + stringSpan = NULL; + } + } + if (stringSpan == NULL) { + // No span-relevant strings: Optimize for code point spans. + bmpSet=new BMPSet(list, len); + if (bmpSet == NULL) { // Check for memory allocation error. + setToBogus(); + } + } + } + return this; +} + +int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { + if(length>0 && bmpSet!=NULL) { + return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s); + } + if(length<0) { + length=u_strlen(s); + } + if(length==0) { + return 0; + } + if(stringSpan!=NULL) { + return stringSpan->span(s, length, spanCondition); + } else if(hasStrings()) { + uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? + UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED : + UnicodeSetStringSpan::FWD_UTF16_CONTAINED; + UnicodeSetStringSpan strSpan(*this, *strings, which); + if(strSpan.needsStringSpanUTF16()) { + return strSpan.span(s, length, spanCondition); + } + } + + if(spanCondition!=USET_SPAN_NOT_CONTAINED) { + spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. + } + + UChar32 c; + int32_t start=0, prev=0; + do { + U16_NEXT(s, start, length, c); + if(spanCondition!=contains(c)) { + break; + } + } while((prev=start)0 && bmpSet!=NULL) { + return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s); + } + if(length<0) { + length=u_strlen(s); + } + if(length==0) { + return 0; + } + if(stringSpan!=NULL) { + return stringSpan->spanBack(s, length, spanCondition); + } else if(hasStrings()) { + uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? + UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED : + UnicodeSetStringSpan::BACK_UTF16_CONTAINED; + UnicodeSetStringSpan strSpan(*this, *strings, which); + if(strSpan.needsStringSpanUTF16()) { + return strSpan.spanBack(s, length, spanCondition); + } + } + + if(spanCondition!=USET_SPAN_NOT_CONTAINED) { + spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. + } + + UChar32 c; + int32_t prev=length; + do { + U16_PREV(s, 0, length, c); + if(spanCondition!=contains(c)) { + break; + } + } while((prev=length)>0); + return prev; +} + +int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const { + if(length>0 && bmpSet!=NULL) { + const uint8_t *s0=(const uint8_t *)s; + return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0); + } + if(length<0) { + length=(int32_t)uprv_strlen(s); + } + if(length==0) { + return 0; + } + if(stringSpan!=NULL) { + return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition); + } else if(hasStrings()) { + uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? + UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED : + UnicodeSetStringSpan::FWD_UTF8_CONTAINED; + UnicodeSetStringSpan strSpan(*this, *strings, which); + if(strSpan.needsStringSpanUTF8()) { + return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition); + } + } + + if(spanCondition!=USET_SPAN_NOT_CONTAINED) { + spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. + } + + UChar32 c; + int32_t start=0, prev=0; + do { + U8_NEXT_OR_FFFD(s, start, length, c); + if(spanCondition!=contains(c)) { + break; + } + } while((prev=start)0 && bmpSet!=NULL) { + const uint8_t *s0=(const uint8_t *)s; + return bmpSet->spanBackUTF8(s0, length, spanCondition); + } + if(length<0) { + length=(int32_t)uprv_strlen(s); + } + if(length==0) { + return 0; + } + if(stringSpan!=NULL) { + return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition); + } else if(hasStrings()) { + uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? + UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED : + UnicodeSetStringSpan::BACK_UTF8_CONTAINED; + UnicodeSetStringSpan strSpan(*this, *strings, which); + if(strSpan.needsStringSpanUTF8()) { + return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition); + } + } + + if(spanCondition!=USET_SPAN_NOT_CONTAINED) { + spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. + } + + UChar32 c; + int32_t prev=length; + do { + U8_PREV_OR_FFFD(s, 0, length, c); + if(spanCondition!=contains(c)) { + break; + } + } while((prev=length)>0); + return prev; +} U_NAMESPACE_END