icuSources/common/uniset.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 **********************************************************************
   5 *   Copyright (C) 1999-2015, International Business Machines
   6 *   Corporation and others.  All Rights Reserved.
   7 **********************************************************************
   8 *   Date        Name        Description
   9 *   10/20/99    alan        Creation.
  10 **********************************************************************
  11 */
  12
  13 #include "unicode/utypes.h"
  14 #include "unicode/parsepos.h"
  15 #include "unicode/symtable.h"
  16 #include "unicode/uniset.h"
  17 #include "unicode/ustring.h"
  18 #include "unicode/utf8.h"
  19 #include "unicode/utf16.h"
  20 #include "ruleiter.h"
  21 #include "cmemory.h"
  22 #include "cstring.h"
  23 #include "patternprops.h"
  24 #include "uelement.h"
  25 #include "util.h"
  26 #include "uvector.h"
  27 #include "charstr.h"
  28 #include "ustrfmt.h"
  29 #include "uassert.h"
  30 #include "bmpset.h"
  31 #include "unisetspan.h"
  32
  33 // Define UChar constants using hex for EBCDIC compatibility
  34 // Used #define to reduce private static exports and memory access time.
  35 #define SET_OPEN        ((UChar)0x005B) /*[*/
  36 #define SET_CLOSE       ((UChar)0x005D) /*]*/
  37 #define HYPHEN          ((UChar)0x002D) /*-*/
  38 #define COMPLEMENT      ((UChar)0x005E) /*^*/
  39 #define COLON           ((UChar)0x003A) /*:*/
  40 #define BACKSLASH       ((UChar)0x005C) /*\*/
  41 #define INTERSECTION    ((UChar)0x0026) /*&*/
  42 #define UPPER_U         ((UChar)0x0055) /*U*/
  43 #define LOWER_U         ((UChar)0x0075) /*u*/
  44 #define OPEN_BRACE      ((UChar)123)    /*{*/
  45 #define CLOSE_BRACE     ((UChar)125)    /*}*/
  46 #define UPPER_P         ((UChar)0x0050) /*P*/
  47 #define LOWER_P         ((UChar)0x0070) /*p*/
  48 #define UPPER_N         ((UChar)78)     /*N*/
  49 #define EQUALS          ((UChar)0x003D) /*=*/
  50
// UNICODESET_HIGH is greater than every valid value.
// For code points it is 0x110000.
#define UNICODESET_HIGH 0x0110000

// UNICODESET_LOW is less than or equal to every valid value.
// For code points it is zero.
#define UNICODESET_LOW 0x000000

/** Length of the longest possible list: [0, 1, 2, ..., max code point, HIGH] */
constexpr int32_t MAX_LENGTH = UNICODESET_HIGH + 1;
  59
  60 U_NAMESPACE_BEGIN
  61
// Out-of-line definition of the SymbolTable destructor; the body is empty.
SymbolTable::~SymbolTable() {}
  63
  64 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
  65
  66 /**
  67  * Modify the given UChar32 variable so that it is in range, by
  68  * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
  69  * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
  70  * It modifies its argument in-place and also returns it.
  71  */
  72 static inline UChar32 pinCodePoint(UChar32& c) {
  73     if (c < UNICODESET_LOW) {
  74         c = UNICODESET_LOW;
  75     } else if (c > (UNICODESET_HIGH-1)) {
  76         c = (UNICODESET_HIGH-1);
  77     }
  78     return c;
  79 }
  80
  81 //----------------------------------------------------------------
  82 // Debugging
  83 //----------------------------------------------------------------
  84
  85 // DO NOT DELETE THIS CODE.  This code is used to debug memory leaks.
  86 // To enable the debugging, define the symbol DEBUG_MEM in the line
  87 // below.  This will result in text being sent to stdout that looks
  88 // like this:
  89 //   DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
  90 //   DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
  91 // Each line lists a construction (ct) or destruction (dt) event, the
  92 // object address, the number of outstanding objects after the event,
  93 // and the pattern of the object in question.
  94
  95 // #define DEBUG_MEM
  96
  97 #ifdef DEBUG_MEM
  98 #include <stdio.h>
  99 static int32_t _dbgCount = 0;
 100
 101 static inline void _dbgct(UnicodeSet* set) {
 102     UnicodeString str;
 103     set->toPattern(str, TRUE);
 104     char buf[40];
 105     str.extract(0, 39, buf, "");
 106     printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
 107 }
 108
 109 static inline void _dbgdt(UnicodeSet* set) {
 110     UnicodeString str;
 111     set->toPattern(str, TRUE);
 112     char buf[40];
 113     str.extract(0, 39, buf, "");
 114     printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
 115 }
 116
 117 #else
 118
 119 #define _dbgct(set)
 120 #define _dbgdt(set)
 121
 122 #endif
 123
 124 //----------------------------------------------------------------
 125 // UnicodeString in UVector support
 126 //----------------------------------------------------------------
 127
 128 static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) {
 129     dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
 130 }
 131
 132 static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
 133     const UnicodeString &a = *(const UnicodeString*)t1.pointer;
 134     const UnicodeString &b = *(const UnicodeString*)t2.pointer;
 135     return a.compare(b);
 136 }
 137
 138 UBool UnicodeSet::hasStrings() const {
 139     return strings != nullptr && !strings->isEmpty();
 140 }
 141
 142 int32_t UnicodeSet::stringsSize() const {
 143     return strings == nullptr ? 0 : strings->size();
 144 }
 145
 146 UBool UnicodeSet::stringsContains(const UnicodeString &s) const {
 147     return strings != nullptr && strings->contains((void*) &s);
 148 }
 149
 150 //----------------------------------------------------------------
 151 // Constructors &c
 152 //----------------------------------------------------------------
 153
 154 /**
 155  * Constructs an empty set.
 156  */
 157 UnicodeSet::UnicodeSet() {
 158     list[0] = UNICODESET_HIGH;
 159     _dbgct(this);
 160 }
 161
 162 /**
 163  * Constructs a set containing the given range. If <code>end >
 164  * start</code> then an empty set is created.
 165  *
 166  * @param start first character, inclusive, of range
 167  * @param end last character, inclusive, of range
 168  */
 169 UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) {
 170     list[0] = UNICODESET_HIGH;
 171     add(start, end);
 172     _dbgct(this);
 173 }
 174
 175 /**
 176  * Constructs a set that is identical to the given UnicodeSet.
 177  */
 178 UnicodeSet::UnicodeSet(const UnicodeSet& o) : UnicodeFilter(o) {
 179     *this = o;
 180     _dbgct(this);
 181 }
 182
 183 // Copy-construct as thawed.
 184 UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : UnicodeFilter(o) {
 185     if (ensureCapacity(o.len)) {
 186         // *this = o except for bmpSet and stringSpan
 187         len = o.len;
 188         uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
 189         if (o.hasStrings()) {
 190             UErrorCode status = U_ZERO_ERROR;
 191             if (!allocateStrings(status) ||
 192                     (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
 193                 setToBogus();
 194                 return;
 195             }
 196         }
 197         if (o.pat) {
 198             setPattern(o.pat, o.patLen);
 199         }
 200         _dbgct(this);
 201     }
 202 }
 203
 204 /**
 205  * Destructs the set.
 206  */
 207 UnicodeSet::~UnicodeSet() {
 208     _dbgdt(this); // first!
 209     if (list != stackList) {
 210         uprv_free(list);
 211     }
 212     delete bmpSet;
 213     if (buffer != stackList) {
 214         uprv_free(buffer);
 215     }
 216     delete strings;
 217     delete stringSpan;
 218     releasePattern();
 219 }
 220
 221 /**
 222  * Assigns this object to be a copy of another.
 223  */
 224 UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
 225     return copyFrom(o, FALSE);
 226 }
 227
 228 UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) {
 229     if (this == &o) {
 230         return *this;
 231     }
 232     if (isFrozen()) {
 233         return *this;
 234     }
 235     if (o.isBogus()) {
 236         setToBogus();
 237         return *this;
 238     }
 239     if (!ensureCapacity(o.len)) {
 240         // ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens.
 241         return *this;
 242     }
 243     len = o.len;
 244     uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
 245     if (o.bmpSet != nullptr && !asThawed) {
 246         bmpSet = new BMPSet(*o.bmpSet, list, len);
 247         if (bmpSet == NULL) { // Check for memory allocation error.
 248             setToBogus();
 249             return *this;
 250         }
 251     }
 252     if (o.hasStrings()) {
 253         UErrorCode status = U_ZERO_ERROR;
 254         if ((strings == nullptr && !allocateStrings(status)) ||
 255                 (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
 256             setToBogus();
 257             return *this;
 258         }
 259     } else if (hasStrings()) {
 260         strings->removeAllElements();
 261     }
 262     if (o.stringSpan != nullptr && !asThawed) {
 263         stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
 264         if (stringSpan == NULL) { // Check for memory allocation error.
 265             setToBogus();
 266             return *this;
 267         }
 268     }
 269     releasePattern();
 270     if (o.pat) {
 271         setPattern(o.pat, o.patLen);
 272     }
 273     return *this;
 274 }
 275
 276 /**
 277  * Returns a copy of this object.  All UnicodeMatcher objects have
 278  * to support cloning in order to allow classes using
 279  * UnicodeMatchers, such as Transliterator, to implement cloning.
 280  */
 281 UnicodeSet* UnicodeSet::clone() const {
 282     return new UnicodeSet(*this);
 283 }
 284
 285 UnicodeSet *UnicodeSet::cloneAsThawed() const {
 286     return new UnicodeSet(*this, TRUE);
 287 }
 288
 289 /**
 290  * Compares the specified object with this set for equality.  Returns
 291  * <tt>true</tt> if the two sets
 292  * have the same size, and every member of the specified set is
 293  * contained in this set (or equivalently, every member of this set is
 294  * contained in the specified set).
 295  *
 296  * @param o set to be compared for equality with this set.
 297  * @return <tt>true</tt> if the specified set is equal to this set.
 298  */
 299 UBool UnicodeSet::operator==(const UnicodeSet& o) const {
 300     if (len != o.len) return FALSE;
 301     for (int32_t i = 0; i < len; ++i) {
 302         if (list[i] != o.list[i]) return FALSE;
 303     }
 304     if (hasStrings() != o.hasStrings()) { return FALSE; }
 305     if (hasStrings() && *strings != *o.strings) return FALSE;
 306     return TRUE;
 307 }
 308
 309 /**
 310  * Returns the hash code value for this set.
 311  *
 312  * @return the hash code value for this set.
 313  * @see Object#hashCode()
 314  */
 315 int32_t UnicodeSet::hashCode(void) const {
 316     uint32_t result = static_cast<uint32_t>(len);
 317     for (int32_t i = 0; i < len; ++i) {
 318         result *= 1000003u;
 319         result += list[i];
 320     }
 321     return static_cast<int32_t>(result);
 322 }
 323
 324 //----------------------------------------------------------------
 325 // Public API
 326 //----------------------------------------------------------------
 327
 328 /**
 329  * Returns the number of elements in this set (its cardinality),
 330  * Note than the elements of a set may include both individual
 331  * codepoints and strings.
 332  *
 333  * @return the number of elements in this set (its cardinality).
 334  */
 335 int32_t UnicodeSet::size(void) const {
 336     int32_t n = 0;
 337     int32_t count = getRangeCount();
 338     for (int32_t i = 0; i < count; ++i) {
 339         n += getRangeEnd(i) - getRangeStart(i) + 1;
 340     }
 341     return n + stringsSize();
 342 }
 343
 344 /**
 345  * Returns <tt>true</tt> if this set contains no elements.
 346  *
 347  * @return <tt>true</tt> if this set contains no elements.
 348  */
 349 UBool UnicodeSet::isEmpty(void) const {
 350     return len == 1 && !hasStrings();
 351 }
 352
 353 /**
 354  * Returns true if this set contains the given character.
 355  * @param c character to be checked for containment
 356  * @return true if the test condition is met
 357  */
 358 UBool UnicodeSet::contains(UChar32 c) const {
 359     // Set i to the index of the start item greater than ch
 360     // We know we will terminate without length test!
 361     // LATER: for large sets, add binary search
 362     //int32_t i = -1;
 363     //for (;;) {
 364     //    if (c < list[++i]) break;
 365     //}
 366     if (bmpSet != NULL) {
 367         return bmpSet->contains(c);
 368     }
 369     if (stringSpan != NULL) {
 370         return stringSpan->contains(c);
 371     }
 372     if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
 373         return FALSE;
 374     }
 375     int32_t i = findCodePoint(c);
 376     return (UBool)(i & 1); // return true if odd
 377 }
 378
 379 /**
 380  * Returns the smallest value i such that c < list[i].  Caller
 381  * must ensure that c is a legal value or this method will enter
 382  * an infinite loop.  This method performs a binary search.
 383  * @param c a character in the range MIN_VALUE..MAX_VALUE
 384  * inclusive
 385  * @return the smallest integer i in the range 0..len-1,
 386  * inclusive, such that c < list[i]
 387  */
 388 int32_t UnicodeSet::findCodePoint(UChar32 c) const {
 389     /* Examples:
 390                                        findCodePoint(c)
 391        set              list[]         c=0 1 3 4 7 8
 392        ===              ==============   ===========
 393        []               [110000]         0 0 0 0 0 0
 394        [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
 395        [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
 396        [:Any:]          [0, 110000]      1 1 1 1 1 1
 397      */
 398
 399     // Return the smallest i such that c < list[i].  Assume
 400     // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
 401     if (c < list[0])
 402         return 0;
 403     // High runner test.  c is often after the last range, so an
 404     // initial check for this condition pays off.
 405     int32_t lo = 0;
 406     int32_t hi = len - 1;
 407     if (lo >= hi || c >= list[hi-1])
 408         return hi;
 409     // invariant: c >= list[lo]
 410     // invariant: c < list[hi]
 411     for (;;) {
 412         int32_t i = (lo + hi) >> 1;
 413         if (i == lo) {
 414             break; // Found!
 415         } else if (c < list[i]) {
 416             hi = i;
 417         } else {
 418             lo = i;
 419         }
 420     }
 421     return hi;
 422 }
 423
 424 /**
 425  * Returns true if this set contains every character
 426  * of the given range.
 427  * @param start first character, inclusive, of the range
 428  * @param end last character, inclusive, of the range
 429  * @return true if the test condition is met
 430  */
 431 UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
 432     //int32_t i = -1;
 433     //for (;;) {
 434     //    if (start < list[++i]) break;
 435     //}
 436     int32_t i = findCodePoint(start);
 437     return ((i & 1) != 0 && end < list[i]);
 438 }
 439
 440 /**
 441  * Returns <tt>true</tt> if this set contains the given
 442  * multicharacter string.
 443  * @param s string to be checked for containment
 444  * @return <tt>true</tt> if this set contains the specified string
 445  */
 446 UBool UnicodeSet::contains(const UnicodeString& s) const {
 447     if (s.length() == 0) return FALSE;
 448     int32_t cp = getSingleCP(s);
 449     if (cp < 0) {
 450         return stringsContains(s);
 451     } else {
 452         return contains((UChar32) cp);
 453     }
 454 }
 455
 456 /**
 457  * Returns true if this set contains all the characters and strings
 458  * of the given set.
 459  * @param c set to be checked for containment
 460  * @return true if the test condition is met
 461  */
 462 UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
 463     // The specified set is a subset if all of its pairs are contained in
 464     // this set.  It's possible to code this more efficiently in terms of
 465     // direct manipulation of the inversion lists if the need arises.
 466     int32_t n = c.getRangeCount();
 467     for (int i=0; i<n; ++i) {
 468         if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
 469             return FALSE;
 470         }
 471     }
 472     return !c.hasStrings() || (strings != nullptr && strings->containsAll(*c.strings));
 473 }
 474
 475 /**
 476  * Returns true if this set contains all the characters
 477  * of the given string.
 478  * @param s string containing characters to be checked for containment
 479  * @return true if the test condition is met
 480  */
 481 UBool UnicodeSet::containsAll(const UnicodeString& s) const {
 482     return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
 483                    s.length());
 484 }
 485
 486 /**
 487  * Returns true if this set contains none of the characters
 488  * of the given range.
 489  * @param start first character, inclusive, of the range
 490  * @param end last character, inclusive, of the range
 491  * @return true if the test condition is met
 492  */
 493 UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
 494     //int32_t i = -1;
 495     //for (;;) {
 496     //    if (start < list[++i]) break;
 497     //}
 498     int32_t i = findCodePoint(start);
 499     return ((i & 1) == 0 && end < list[i]);
 500 }
 501
 502 /**
 503  * Returns true if this set contains none of the characters and strings
 504  * of the given set.
 505  * @param c set to be checked for containment
 506  * @return true if the test condition is met
 507  */
 508 UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
 509     // The specified set is a subset if all of its pairs are contained in
 510     // this set.  It's possible to code this more efficiently in terms of
 511     // direct manipulation of the inversion lists if the need arises.
 512     int32_t n = c.getRangeCount();
 513     for (int32_t i=0; i<n; ++i) {
 514         if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
 515             return FALSE;
 516         }
 517     }
 518     return strings == nullptr || !c.hasStrings() || strings->containsNone(*c.strings);
 519 }
 520
 521 /**
 522  * Returns true if this set contains none of the characters
 523  * of the given string.
 524  * @param s string containing characters to be checked for containment
 525  * @return true if the test condition is met
 526  */
 527 UBool UnicodeSet::containsNone(const UnicodeString& s) const {
 528     return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
 529                    s.length());
 530 }
 531
 532 /**
 533  * Returns <tt>true</tt> if this set contains any character whose low byte
 534  * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
 535  * indexing.
 536  */
 537 UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
 538     /* The index value v, in the range [0,255], is contained in this set if
 539      * it is contained in any pair of this set.  Pairs either have the high
 540      * bytes equal, or unequal.  If the high bytes are equal, then we have
 541      * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
 542      * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
 543      * Then v is contained if xx <= v || v <= yy.  (This is identical to the
 544      * time zone month containment logic.)
 545      */
 546     int32_t i;
 547     int32_t rangeCount=getRangeCount();
 548     for (i=0; i<rangeCount; ++i) {
 549         UChar32 low = getRangeStart(i);
 550         UChar32 high = getRangeEnd(i);
 551         if ((low & ~0xFF) == (high & ~0xFF)) {
 552             if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
 553                 return TRUE;
 554             }
 555         } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
 556             return TRUE;
 557         }
 558     }
 559     if (hasStrings()) {
 560         for (i=0; i<strings->size(); ++i) {
 561             const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
 562             //if (s.length() == 0) {
 563             //    // Empty strings match everything
 564             //    return TRUE;
 565             //}
 566             // assert(s.length() != 0); // We enforce this elsewhere
 567             UChar32 c = s.char32At(0);
 568             if ((c & 0xFF) == v) {
 569                 return TRUE;
 570             }
 571         }
 572     }
 573     return FALSE;
 574 }
 575
 576 /**
 577  * Implementation of UnicodeMatcher::matches().  Always matches the
 578  * longest possible multichar string.
 579  */
 580 UMatchDegree UnicodeSet::matches(const Replaceable& text,
 581                                  int32_t& offset,
 582                                  int32_t limit,
 583                                  UBool incremental) {
 584     if (offset == limit) {
 585         // Strings, if any, have length != 0, so we don't worry
 586         // about them here.  If we ever allow zero-length strings
 587         // we much check for them here.
 588         if (contains(U_ETHER)) {
 589             return incremental ? U_PARTIAL_MATCH : U_MATCH;
 590         } else {
 591             return U_MISMATCH;
 592         }
 593     } else {
 594         if (hasStrings()) { // try strings first
 595
 596             // might separate forward and backward loops later
 597             // for now they are combined
 598
 599             // TODO Improve efficiency of this, at least in the forward
 600             // direction, if not in both.  In the forward direction we
 601             // can assume the strings are sorted.
 602
 603             int32_t i;
 604             UBool forward = offset < limit;
 605
 606             // firstChar is the leftmost char to match in the
 607             // forward direction or the rightmost char to match in
 608             // the reverse direction.
 609             UChar firstChar = text.charAt(offset);
 610
 611             // If there are multiple strings that can match we
 612             // return the longest match.
 613             int32_t highWaterLength = 0;
 614
 615             for (i=0; i<strings->size(); ++i) {
 616                 const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
 617
 618                 //if (trial.length() == 0) {
 619                 //    return U_MATCH; // null-string always matches
 620                 //}
 621                 // assert(trial.length() != 0); // We ensure this elsewhere
 622
 623                 UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
 624
 625                 // Strings are sorted, so we can optimize in the
 626                 // forward direction.
 627                 if (forward && c > firstChar) break;
 628                 if (c != firstChar) continue;
 629
 630                 int32_t matchLen = matchRest(text, offset, limit, trial);
 631
 632                 if (incremental) {
 633                     int32_t maxLen = forward ? limit-offset : offset-limit;
 634                     if (matchLen == maxLen) {
 635                         // We have successfully matched but only up to limit.
 636                         return U_PARTIAL_MATCH;
 637                     }
 638                 }
 639
 640                 if (matchLen == trial.length()) {
 641                     // We have successfully matched the whole string.
 642                     if (matchLen > highWaterLength) {
 643                         highWaterLength = matchLen;
 644                     }
 645                     // In the forward direction we know strings
 646                     // are sorted so we can bail early.
 647                     if (forward && matchLen < highWaterLength) {
 648                         break;
 649                     }
 650                     continue;
 651                 }
 652             }
 653
 654             // We've checked all strings without a partial match.
 655             // If we have full matches, return the longest one.
 656             if (highWaterLength != 0) {
 657                 offset += forward ? highWaterLength : -highWaterLength;
 658                 return U_MATCH;
 659             }
 660         }
 661         return UnicodeFilter::matches(text, offset, limit, incremental);
 662     }
 663 }
 664
 665 /**
 666  * Returns the longest match for s in text at the given position.
 667  * If limit > start then match forward from start+1 to limit
 668  * matching all characters except s.charAt(0).  If limit < start,
 669  * go backward starting from start-1 matching all characters
 670  * except s.charAt(s.length()-1).  This method assumes that the
 671  * first character, text.charAt(start), matches s, so it does not
 672  * check it.
 673  * @param text the text to match
 674  * @param start the first character to match.  In the forward
 675  * direction, text.charAt(start) is matched against s.charAt(0).
 676  * In the reverse direction, it is matched against
 677  * s.charAt(s.length()-1).
 678  * @param limit the limit offset for matching, either last+1 in
 679  * the forward direction, or last-1 in the reverse direction,
 680  * where last is the index of the last character to match.
 681  * @return If part of s matches up to the limit, return |limit -
 682  * start|.  If all of s matches before reaching the limit, return
 683  * s.length().  If there is a mismatch between s and text, return
 684  * 0
 685  */
 686 int32_t UnicodeSet::matchRest(const Replaceable& text,
 687                               int32_t start, int32_t limit,
 688                               const UnicodeString& s) {
 689     int32_t i;
 690     int32_t maxLen;
 691     int32_t slen = s.length();
 692     if (start < limit) {
 693         maxLen = limit - start;
 694         if (maxLen > slen) maxLen = slen;
 695         for (i = 1; i < maxLen; ++i) {
 696             if (text.charAt(start + i) != s.charAt(i)) return 0;
 697         }
 698     } else {
 699         maxLen = start - limit;
 700         if (maxLen > slen) maxLen = slen;
 701         --slen; // <=> slen = s.length() - 1;
 702         for (i = 1; i < maxLen; ++i) {
 703             if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
 704         }
 705     }
 706     return maxLen;
 707 }
 708
 709 /**
 710  * Implement of UnicodeMatcher
 711  */
 712 void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
 713     toUnionTo.addAll(*this);
 714 }
 715
 716 /**
 717  * Returns the index of the given character within this set, where
 718  * the set is ordered by ascending code point.  If the character
 719  * is not in this set, return -1.  The inverse of this method is
 720  * <code>charAt()</code>.
 721  * @return an index from 0..size()-1, or -1
 722  */
 723 int32_t UnicodeSet::indexOf(UChar32 c) const {
 724     if (c < MIN_VALUE || c > MAX_VALUE) {
 725         return -1;
 726     }
 727     int32_t i = 0;
 728     int32_t n = 0;
 729     for (;;) {
 730         UChar32 start = list[i++];
 731         if (c < start) {
 732             return -1;
 733         }
 734         UChar32 limit = list[i++];
 735         if (c < limit) {
 736             return n + c - start;
 737         }
 738         n += limit - start;
 739     }
 740 }
 741
 742 /**
 743  * Returns the character at the given index within this set, where
 744  * the set is ordered by ascending code point.  If the index is
 745  * out of range, return (UChar32)-1.  The inverse of this method is
 746  * <code>indexOf()</code>.
 747  * @param index an index from 0..size()-1
 748  * @return the character at the given index, or (UChar32)-1.
 749  */
 750 UChar32 UnicodeSet::charAt(int32_t index) const {
 751     if (index >= 0) {
 752         // len2 is the largest even integer <= len, that is, it is len
 753         // for even values and len-1 for odd values.  With odd values
 754         // the last entry is UNICODESET_HIGH.
 755         int32_t len2 = len & ~1;
 756         for (int32_t i=0; i < len2;) {
 757             UChar32 start = list[i++];
 758             int32_t count = list[i++] - start;
 759             if (index < count) {
 760                 return (UChar32)(start + index);
 761             }
 762             index -= count;
 763         }
 764     }
 765     return (UChar32)-1;
 766 }
 767
/**
 * Make this object represent the range <code>start - end</code>.
 * The set is first cleared, then complement(start, end) is applied to
 * the now-empty set.
 * NOTE(review): earlier documentation said the result is an empty set
 * for an inverted range (presumably <code>start > end</code>); confirm
 * against complement().
 *
 * @param start first character in the set, inclusive
 * @param end last character in the set, inclusive
 */
UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
    clear();
    complement(start, end);
    return *this;
}
 781
 782 /**
 783  * Adds the specified range to this set if it is not already
 784  * present.  If this set already contains the specified range,
 785  * the call leaves this set unchanged.  If <code>end > start</code>
 786  * then an empty range is added, leaving the set unchanged.
 787  *
 788  * @param start first character, inclusive, of range to be added
 789  * to this set.
 790  * @param end last character, inclusive, of range to be added
 791  * to this set.
 792  */
 793 UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
 794     if (pinCodePoint(start) < pinCodePoint(end)) {
 795         UChar32 limit = end + 1;
 796         // Fast path for adding a new range after the last one.
 797         // Odd list length: [..., lastStart, lastLimit, HIGH]
 798         if ((len & 1) != 0) {
 799             // If the list is empty, set lastLimit low enough to not be adjacent to 0.
 800             UChar32 lastLimit = len == 1 ? -2 : list[len - 2];
 801             if (lastLimit <= start && !isFrozen() && !isBogus()) {
 802                 if (lastLimit == start) {
 803                     // Extend the last range.
 804                     list[len - 2] = limit;
 805                     if (limit == UNICODESET_HIGH) {
 806                         --len;
 807                     }
 808                 } else {
 809                     list[len - 1] = start;
 810                     if (limit < UNICODESET_HIGH) {
 811                         if (ensureCapacity(len + 2)) {
 812                             list[len++] = limit;
 813                             list[len++] = UNICODESET_HIGH;
 814                         }
 815                     } else {  // limit == UNICODESET_HIGH
 816                         if (ensureCapacity(len + 1)) {
 817                             list[len++] = UNICODESET_HIGH;
 818                         }
 819                     }
 820                 }
 821                 releasePattern();
 822                 return *this;
 823             }
 824         }
 825         // This is slow. Could be much faster using findCodePoint(start)
 826         // and modifying the list, dealing with adjacent & overlapping ranges.
 827         UChar32 range[3] = { start, limit, UNICODESET_HIGH };
 828         add(range, 2, 0);
 829     } else if (start == end) {
 830         add(start);
 831     }
 832     return *this;
 833 }
 834
 835 // #define DEBUG_US_ADD
 836
 837 #ifdef DEBUG_US_ADD
 838 #include <stdio.h>
 839 void dump(UChar32 c) {
 840     if (c <= 0xFF) {
 841         printf("%c", (char)c);
 842     } else {
 843         printf("U+%04X", c);
 844     }
 845 }
 846 void dump(const UChar32* list, int32_t len) {
 847     printf("[");
 848     for (int32_t i=0; i<len; ++i) {
 849         if (i != 0) printf(", ");
 850         dump(list[i]);
 851     }
 852     printf("]");
 853 }
 854 #endif
 855
 856 /**
 857  * Adds the specified character to this set if it is not already
 858  * present.  If this set already contains the specified character,
 859  * the call leaves this set unchanged.
 860  */
 861 UnicodeSet& UnicodeSet::add(UChar32 c) {
 862     // find smallest i such that c < list[i]
 863     // if odd, then it is IN the set
 864     // if even, then it is OUT of the set
 865     int32_t i = findCodePoint(pinCodePoint(c));
 866
 867     // already in set?
 868     if ((i & 1) != 0  || isFrozen() || isBogus()) return *this;
 869
 870     // HIGH is 0x110000
 871     // assert(list[len-1] == HIGH);
 872
 873     // empty = [HIGH]
 874     // [start_0, limit_0, start_1, limit_1, HIGH]
 875
 876     // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
 877     //                             ^
 878     //                             list[i]
 879
 880     // i == 0 means c is before the first range
 881
 882 #ifdef DEBUG_US_ADD
 883     printf("Add of ");
 884     dump(c);
 885     printf(" found at %d", i);
 886     printf(": ");
 887     dump(list, len);
 888     printf(" => ");
 889 #endif
 890
 891     if (c == list[i]-1) {
 892         // c is before start of next range
 893         list[i] = c;
 894         // if we touched the HIGH mark, then add a new one
 895         if (c == (UNICODESET_HIGH - 1)) {
 896             if (!ensureCapacity(len+1)) {
 897                 // ensureCapacity will mark the object as Bogus if OOM failure happens.
 898                 return *this;
 899             }
 900             list[len++] = UNICODESET_HIGH;
 901         }
 902         if (i > 0 && c == list[i-1]) {
 903             // collapse adjacent ranges
 904
 905             // [..., start_k-1, c, c, limit_k, ..., HIGH]
 906             //                     ^
 907             //                     list[i]
 908
 909             //for (int32_t k=i-1; k<len-2; ++k) {
 910             //    list[k] = list[k+2];
 911             //}
 912             UChar32* dst = list + i - 1;
 913             UChar32* src = dst + 2;
 914             UChar32* srclimit = list + len;
 915             while (src < srclimit) *(dst++) = *(src++);
 916
 917             len -= 2;
 918         }
 919     }
 920
 921     else if (i > 0 && c == list[i-1]) {
 922         // c is after end of prior range
 923         list[i-1]++;
 924         // no need to check for collapse here
 925     }
 926
 927     else {
 928         // At this point we know the new char is not adjacent to
 929         // any existing ranges, and it is not 10FFFF.
 930
 931
 932         // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
 933         //                             ^
 934         //                             list[i]
 935
 936         // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
 937         //                             ^
 938         //                             list[i]
 939
 940         if (!ensureCapacity(len+2)) {
 941             // ensureCapacity will mark the object as Bogus if OOM failure happens.
 942             return *this;
 943         }
 944
 945         UChar32 *p = list + i;
 946         uprv_memmove(p + 2, p, (len - i) * sizeof(*p));
 947         list[i] = c;
 948         list[i+1] = c+1;
 949         len += 2;
 950     }
 951
 952 #ifdef DEBUG_US_ADD
 953     dump(list, len);
 954     printf("\n");
 955
 956     for (i=1; i<len; ++i) {
 957         if (list[i] <= list[i-1]) {
 958             // Corrupt array!
 959             printf("ERROR: list has been corrupted\n");
 960             exit(1);
 961         }
 962     }
 963 #endif
 964
 965     releasePattern();
 966     return *this;
 967 }
 968
 969 /**
 970  * Adds the specified multicharacter to this set if it is not already
 971  * present.  If this set already contains the multicharacter,
 972  * the call leaves this set unchanged.
 973  * Thus "ch" => {"ch"}
 974  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
 975  * @param s the source string
 976  * @return the modified set, for chaining
 977  */
 978 UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
 979     if (s.length() == 0 || isFrozen() || isBogus()) return *this;
 980     int32_t cp = getSingleCP(s);
 981     if (cp < 0) {
 982         if (!stringsContains(s)) {
 983             _add(s);
 984             releasePattern();
 985         }
 986     } else {
 987         add((UChar32)cp);
 988     }
 989     return *this;
 990 }
 991
 992 /**
 993  * Adds the given string, in order, to 'strings'.  The given string
 994  * must have been checked by the caller to not be empty and to not
 995  * already be in 'strings'.
 996  */
 997 void UnicodeSet::_add(const UnicodeString& s) {
 998     if (isFrozen() || isBogus()) {
 999         return;
1000     }
1001     UErrorCode ec = U_ZERO_ERROR;
1002     if (strings == nullptr && !allocateStrings(ec)) {
1003         setToBogus();
1004         return;
1005     }
1006     UnicodeString* t = new UnicodeString(s);
1007     if (t == NULL) { // Check for memory allocation error.
1008         setToBogus();
1009         return;
1010     }
1011     strings->sortedInsert(t, compareUnicodeString, ec);
1012     if (U_FAILURE(ec)) {
1013         setToBogus();
1014         delete t;
1015     }
1016 }
1017
1018 /**
1019  * @return a code point IF the string consists of a single one.
1020  * otherwise returns -1.
1021  * @param string to test
1022  */
1023 int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
1024     //if (s.length() < 1) {
1025     //    throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
1026     //}
1027     if (s.length() > 2) return -1;
1028     if (s.length() == 1) return s.charAt(0);
1029
1030     // at this point, len = 2
1031     UChar32 cp = s.char32At(0);
1032     if (cp > 0xFFFF) { // is surrogate pair
1033         return cp;
1034     }
1035     return -1;
1036 }
1037
1038 /**
1039  * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1040  * If this set already any particular character, it has no effect on that character.
1041  * @param the source string
1042  * @return the modified set, for chaining
1043  */
1044 UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
1045     UChar32 cp;
1046     for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1047         cp = s.char32At(i);
1048         add(cp);
1049     }
1050     return *this;
1051 }
1052
1053 /**
1054  * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1055  * If this set already any particular character, it has no effect on that character.
1056  * @param the source string
1057  * @return the modified set, for chaining
1058  */
1059 UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
1060     UnicodeSet set;
1061     set.addAll(s);
1062     retainAll(set);
1063     return *this;
1064 }
1065
1066 /**
1067  * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1068  * If this set already any particular character, it has no effect on that character.
1069  * @param the source string
1070  * @return the modified set, for chaining
1071  */
1072 UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
1073     UnicodeSet set;
1074     set.addAll(s);
1075     complementAll(set);
1076     return *this;
1077 }
1078
1079 /**
1080  * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1081  * If this set already any particular character, it has no effect on that character.
1082  * @param the source string
1083  * @return the modified set, for chaining
1084  */
1085 UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
1086     UnicodeSet set;
1087     set.addAll(s);
1088     removeAll(set);
1089     return *this;
1090 }
1091
1092 UnicodeSet& UnicodeSet::removeAllStrings() {
1093     if (!isFrozen() && hasStrings()) {
1094         strings->removeAllElements();
1095         releasePattern();
1096     }
1097     return *this;
1098 }
1099
1100
1101 /**
1102  * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1103  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1104  * @param the source string
1105  * @return a newly created set containing the given string
1106  */
1107 UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
1108     UnicodeSet *set = new UnicodeSet();
1109     if (set != NULL) { // Check for memory allocation error.
1110         set->add(s);
1111     }
1112     return set;
1113 }
1114
1115
1116 /**
1117  * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1118  * @param the source string
1119  * @return a newly created set containing the given characters
1120  */
1121 UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
1122     UnicodeSet *set = new UnicodeSet();
1123     if (set != NULL) { // Check for memory allocation error.
1124         set->addAll(s);
1125     }
1126     return set;
1127 }
1128
1129 /**
1130  * Retain only the elements in this set that are contained in the
1131  * specified range.  If <code>end > start</code> then an empty range is
1132  * retained, leaving the set empty.
1133  *
1134  * @param start first character, inclusive, of range to be retained
1135  * to this set.
1136  * @param end last character, inclusive, of range to be retained
1137  * to this set.
1138  */
1139 UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
1140     if (pinCodePoint(start) <= pinCodePoint(end)) {
1141         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1142         retain(range, 2, 0);
1143     } else {
1144         clear();
1145     }
1146     return *this;
1147 }
1148
1149 UnicodeSet& UnicodeSet::retain(UChar32 c) {
1150     return retain(c, c);
1151 }
1152
1153 /**
1154  * Removes the specified range from this set if it is present.
1155  * The set will not contain the specified range once the call
1156  * returns.  If <code>end > start</code> then an empty range is
1157  * removed, leaving the set unchanged.
1158  *
1159  * @param start first character, inclusive, of range to be removed
1160  * from this set.
1161  * @param end last character, inclusive, of range to be removed
1162  * from this set.
1163  */
1164 UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
1165     if (pinCodePoint(start) <= pinCodePoint(end)) {
1166         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1167         retain(range, 2, 2);
1168     }
1169     return *this;
1170 }
1171
1172 /**
1173  * Removes the specified character from this set if it is present.
1174  * The set will not contain the specified range once the call
1175  * returns.
1176  */
1177 UnicodeSet& UnicodeSet::remove(UChar32 c) {
1178     return remove(c, c);
1179 }
1180
1181 /**
1182  * Removes the specified string from this set if it is present.
1183  * The set will not contain the specified character once the call
1184  * returns.
1185  * @param the source string
1186  * @return the modified set, for chaining
1187  */
1188 UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
1189     if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1190     int32_t cp = getSingleCP(s);
1191     if (cp < 0) {
1192         if (strings != nullptr && strings->removeElement((void*) &s)) {
1193             releasePattern();
1194         }
1195     } else {
1196         remove((UChar32)cp, (UChar32)cp);
1197     }
1198     return *this;
1199 }
1200
1201 /**
1202  * Complements the specified range in this set.  Any character in
1203  * the range will be removed if it is in this set, or will be
1204  * added if it is not in this set.  If <code>end > start</code>
1205  * then an empty range is xor'ed, leaving the set unchanged.
1206  *
1207  * @param start first character, inclusive, of range to be removed
1208  * from this set.
1209  * @param end last character, inclusive, of range to be removed
1210  * from this set.
1211  */
1212 UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
1213     if (isFrozen() || isBogus()) {
1214         return *this;
1215     }
1216     if (pinCodePoint(start) <= pinCodePoint(end)) {
1217         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1218         exclusiveOr(range, 2, 0);
1219     }
1220     releasePattern();
1221     return *this;
1222 }
1223
1224 UnicodeSet& UnicodeSet::complement(UChar32 c) {
1225     return complement(c, c);
1226 }
1227
1228 /**
1229  * This is equivalent to
1230  * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1231  */
1232 UnicodeSet& UnicodeSet::complement(void) {
1233     if (isFrozen() || isBogus()) {
1234         return *this;
1235     }
1236     if (list[0] == UNICODESET_LOW) {
1237         uprv_memmove(list, list + 1, (size_t)(len-1)*sizeof(UChar32));
1238         --len;
1239     } else {
1240         if (!ensureCapacity(len+1)) {
1241             return *this;
1242         }
1243         uprv_memmove(list + 1, list, (size_t)len*sizeof(UChar32));
1244         list[0] = UNICODESET_LOW;
1245         ++len;
1246     }
1247     releasePattern();
1248     return *this;
1249 }
1250
1251 /**
1252  * Complement the specified string in this set.
1253  * The set will not contain the specified string once the call
1254  * returns.
1255  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1256  * @param s the string to complement
1257  * @return this object, for chaining
1258  */
1259 UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
1260     if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1261     int32_t cp = getSingleCP(s);
1262     if (cp < 0) {
1263         if (stringsContains(s)) {
1264             strings->removeElement((void*) &s);
1265         } else {
1266             _add(s);
1267         }
1268         releasePattern();
1269     } else {
1270         complement((UChar32)cp, (UChar32)cp);
1271     }
1272     return *this;
1273 }
1274
1275 /**
1276  * Adds all of the elements in the specified set to this set if
1277  * they're not already present.  This operation effectively
1278  * modifies this set so that its value is the <i>union</i> of the two
1279  * sets.  The behavior of this operation is unspecified if the specified
1280  * collection is modified while the operation is in progress.
1281  *
1282  * @param c set whose elements are to be added to this set.
1283  * @see #add(char, char)
1284  */
1285 UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
1286     if ( c.len>0 && c.list!=NULL ) {
1287         add(c.list, c.len, 0);
1288     }
1289
1290     // Add strings in order
1291     if ( c.strings!=NULL ) {
1292         for (int32_t i=0; i<c.strings->size(); ++i) {
1293             const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
1294             if (!stringsContains(*s)) {
1295                 _add(*s);
1296             }
1297         }
1298     }
1299     return *this;
1300 }
1301
1302 /**
1303  * Retains only the elements in this set that are contained in the
1304  * specified set.  In other words, removes from this set all of
1305  * its elements that are not contained in the specified set.  This
1306  * operation effectively modifies this set so that its value is
1307  * the <i>intersection</i> of the two sets.
1308  *
1309  * @param c set that defines which elements this set will retain.
1310  */
1311 UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
1312     if (isFrozen() || isBogus()) {
1313         return *this;
1314     }
1315     retain(c.list, c.len, 0);
1316     if (hasStrings()) {
1317         if (!c.hasStrings()) {
1318             strings->removeAllElements();
1319         } else {
1320             strings->retainAll(*c.strings);
1321         }
1322     }
1323     return *this;
1324 }
1325
1326 /**
1327  * Removes from this set all of its elements that are contained in the
1328  * specified set.  This operation effectively modifies this
1329  * set so that its value is the <i>asymmetric set difference</i> of
1330  * the two sets.
1331  *
1332  * @param c set that defines which elements will be removed from
1333  *          this set.
1334  */
1335 UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
1336     if (isFrozen() || isBogus()) {
1337         return *this;
1338     }
1339     retain(c.list, c.len, 2);
1340     if (hasStrings() && c.hasStrings()) {
1341         strings->removeAll(*c.strings);
1342     }
1343     return *this;
1344 }
1345
1346 /**
1347  * Complements in this set all elements contained in the specified
1348  * set.  Any character in the other set will be removed if it is
1349  * in this set, or will be added if it is not in this set.
1350  *
1351  * @param c set that defines which elements will be xor'ed from
1352  *          this set.
1353  */
1354 UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
1355     if (isFrozen() || isBogus()) {
1356         return *this;
1357     }
1358     exclusiveOr(c.list, c.len, 0);
1359
1360     if (c.strings != nullptr) {
1361         for (int32_t i=0; i<c.strings->size(); ++i) {
1362             void* e = c.strings->elementAt(i);
1363             if (strings == nullptr || !strings->removeElement(e)) {
1364                 _add(*(const UnicodeString*)e);
1365             }
1366         }
1367     }
1368     return *this;
1369 }
1370
1371 /**
1372  * Removes all of the elements from this set.  This set will be
1373  * empty after this call returns.
1374  */
1375 UnicodeSet& UnicodeSet::clear(void) {
1376     if (isFrozen()) {
1377         return *this;
1378     }
1379     list[0] = UNICODESET_HIGH;
1380     len = 1;
1381     releasePattern();
1382     if (strings != NULL) {
1383         strings->removeAllElements();
1384     }
1385     // Remove bogus
1386     fFlags = 0;
1387     return *this;
1388 }
1389
1390 /**
1391  * Iteration method that returns the number of ranges contained in
1392  * this set.
1393  * @see #getRangeStart
1394  * @see #getRangeEnd
1395  */
1396 int32_t UnicodeSet::getRangeCount() const {
1397     return len/2;
1398 }
1399
1400 /**
1401  * Iteration method that returns the first character in the
1402  * specified range of this set.
1403  * @see #getRangeCount
1404  * @see #getRangeEnd
1405  */
1406 UChar32 UnicodeSet::getRangeStart(int32_t index) const {
1407     return list[index*2];
1408 }
1409
1410 /**
1411  * Iteration method that returns the last character in the
1412  * specified range of this set.
1413  * @see #getRangeStart
1414  * @see #getRangeEnd
1415  */
1416 UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
1417     return list[index*2 + 1] - 1;
1418 }
1419
1420 const UnicodeString* UnicodeSet::getString(int32_t index) const {
1421     return (const UnicodeString*) strings->elementAt(index);
1422 }
1423
1424 /**
1425  * Reallocate this objects internal structures to take up the least
1426  * possible space, without changing this object's value.
1427  */
1428 UnicodeSet& UnicodeSet::compact() {
1429     if (isFrozen() || isBogus()) {
1430         return *this;
1431     }
1432     // Delete buffer first to defragment memory less.
1433     if (buffer != stackList) {
1434         uprv_free(buffer);
1435         buffer = NULL;
1436         bufferCapacity = 0;
1437     }
1438     if (list == stackList) {
1439         // pass
1440     } else if (len <= INITIAL_CAPACITY) {
1441         uprv_memcpy(stackList, list, len * sizeof(UChar32));
1442         uprv_free(list);
1443         list = stackList;
1444         capacity = INITIAL_CAPACITY;
1445     } else if ((len + 7) < capacity) {
1446         // If we have more than a little unused capacity, shrink it to len.
1447         UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * len);
1448         if (temp) {
1449             list = temp;
1450             capacity = len;
1451         }
1452         // else what the heck happened?! We allocated less memory!
1453         // Oh well. We'll keep our original array.
1454     }
1455     if (strings != nullptr && strings->isEmpty()) {
1456         delete strings;
1457         strings = nullptr;
1458     }
1459     return *this;
1460 }
1461
1462 #ifdef DEBUG_SERIALIZE
1463 #include <stdio.h>
1464 #endif
1465
1466 /**
1467  * Deserialize constructor.
1468  */
1469 UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization,
1470                        UErrorCode &ec) {
1471
1472   if(U_FAILURE(ec)) {
1473     setToBogus();
1474     return;
1475   }
1476
1477   if( (serialization != kSerialized)
1478       || (data==NULL)
1479       || (dataLen < 1)) {
1480     ec = U_ILLEGAL_ARGUMENT_ERROR;
1481     setToBogus();
1482     return;
1483   }
1484
1485   // bmp?
1486   int32_t headerSize = ((data[0]&0x8000)) ?2:1;
1487   int32_t bmpLength = (headerSize==1)?data[0]:data[1];
1488
1489   int32_t newLength = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength;
1490 #ifdef DEBUG_SERIALIZE
1491   printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,newLength, data[0],data[1],data[2],data[3]);
1492 #endif
1493   if(!ensureCapacity(newLength + 1)) {  // +1 for HIGH
1494     return;
1495   }
1496   // copy bmp
1497   int32_t i;
1498   for(i = 0; i< bmpLength;i++) {
1499     list[i] = data[i+headerSize];
1500 #ifdef DEBUG_SERIALIZE
1501     printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]);
1502 #endif
1503   }
1504   // copy smp
1505   for(i=bmpLength;i<newLength;i++) {
1506     list[i] = ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+0] << 16) +
1507               ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+1]);
1508 #ifdef DEBUG_SERIALIZE
1509     printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]);
1510 #endif
1511   }
1512   U_ASSERT(i == newLength);
1513   if (i == 0 || list[i - 1] != UNICODESET_HIGH) {
1514     list[i++] = UNICODESET_HIGH;
1515   }
1516   len = i;
1517 }
1518
1519
1520 int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
1521     int32_t bmpLength, length, destLength;
1522
1523     if (U_FAILURE(ec)) {
1524         return 0;
1525     }
1526
1527     if (destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1528         ec=U_ILLEGAL_ARGUMENT_ERROR;
1529         return 0;
1530     }
1531
1532     /* count necessary 16-bit units */
1533     length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
1534     // assert(length>=0);
1535     if (length==0) {
1536         /* empty set */
1537         if (destCapacity>0) {
1538             *dest=0;
1539         } else {
1540             ec=U_BUFFER_OVERFLOW_ERROR;
1541         }
1542         return 1;
1543     }
1544     /* now length>0 */
1545
1546     if (this->list[length-1]<=0xffff) {
1547         /* all BMP */
1548         bmpLength=length;
1549     } else if (this->list[0]>=0x10000) {
1550         /* all supplementary */
1551         bmpLength=0;
1552         length*=2;
1553     } else {
1554         /* some BMP, some supplementary */
1555         for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
1556         length=bmpLength+2*(length-bmpLength);
1557     }
1558 #ifdef DEBUG_SERIALIZE
1559     printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len);
1560 #endif
1561     /* length: number of 16-bit array units */
1562     if (length>0x7fff) {
1563         /* there are only 15 bits for the length in the first serialized word */
1564         ec=U_INDEX_OUTOFBOUNDS_ERROR;
1565         return 0;
1566     }
1567
1568     /*
1569      * total serialized length:
1570      * number of 16-bit array units (length) +
1571      * 1 length unit (always) +
1572      * 1 bmpLength unit (if there are supplementary values)
1573      */
1574     destLength=length+((length>bmpLength)?2:1);
1575     if (destLength<=destCapacity) {
1576         const UChar32 *p;
1577         int32_t i;
1578
1579 #ifdef DEBUG_SERIALIZE
1580         printf("writeHdr\n");
1581 #endif
1582         *dest=(uint16_t)length;
1583         if (length>bmpLength) {
1584             *dest|=0x8000;
1585             *++dest=(uint16_t)bmpLength;
1586         }
1587         ++dest;
1588
1589         /* write the BMP part of the array */
1590         p=this->list;
1591         for (i=0; i<bmpLength; ++i) {
1592 #ifdef DEBUG_SERIALIZE
1593           printf("writebmp: %x\n", (int)*p);
1594 #endif
1595             *dest++=(uint16_t)*p++;
1596         }
1597
1598         /* write the supplementary part of the array */
1599         for (; i<length; i+=2) {
1600 #ifdef DEBUG_SERIALIZE
1601           printf("write32: %x\n", (int)*p);
1602 #endif
1603             *dest++=(uint16_t)(*p>>16);
1604             *dest++=(uint16_t)*p++;
1605         }
1606     } else {
1607         ec=U_BUFFER_OVERFLOW_ERROR;
1608     }
1609     return destLength;
1610 }
1611
1612 //----------------------------------------------------------------
1613 // Implementation: Utility methods
1614 //----------------------------------------------------------------
1615
1616 /**
1617  * Allocate our strings vector and return TRUE if successful.
1618  */
1619 UBool UnicodeSet::allocateStrings(UErrorCode &status) {
1620     if (U_FAILURE(status)) {
1621         return FALSE;
1622     }
1623     strings = new UVector(uprv_deleteUObject,
1624                           uhash_compareUnicodeString, 1, status);
1625     if (strings == NULL) { // Check for memory allocation error.
1626         status = U_MEMORY_ALLOCATION_ERROR;
1627         return FALSE;
1628     }
1629     if (U_FAILURE(status)) {
1630         delete strings;
1631         strings = NULL;
1632         return FALSE;
1633     }
1634     return TRUE;
1635 }
1636
1637 int32_t UnicodeSet::nextCapacity(int32_t minCapacity) {
1638     // Grow exponentially to reduce the frequency of allocations.
1639     if (minCapacity < INITIAL_CAPACITY) {
1640         return minCapacity + INITIAL_CAPACITY;
1641     } else if (minCapacity <= 2500) {
1642         return 5 * minCapacity;
1643     } else {
1644         int32_t newCapacity = 2 * minCapacity;
1645         if (newCapacity > MAX_LENGTH) {
1646             newCapacity = MAX_LENGTH;
1647         }
1648         return newCapacity;
1649     }
1650 }
1651
1652 bool UnicodeSet::ensureCapacity(int32_t newLen) {
1653     if (newLen > MAX_LENGTH) {
1654         newLen = MAX_LENGTH;
1655     }
1656     if (newLen <= capacity) {
1657         return true;
1658     }
1659     int32_t newCapacity = nextCapacity(newLen);
1660     UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32));
1661     if (temp == NULL) {
1662         setToBogus(); // set the object to bogus state if an OOM failure occurred.
1663         return false;
1664     }
1665     // Copy only the actual contents.
1666     uprv_memcpy(temp, list, len * sizeof(UChar32));
1667     if (list != stackList) {
1668         uprv_free(list);
1669     }
1670     list = temp;
1671     capacity = newCapacity;
1672     return true;
1673 }
1674
1675 bool UnicodeSet::ensureBufferCapacity(int32_t newLen) {
1676     if (newLen > MAX_LENGTH) {
1677         newLen = MAX_LENGTH;
1678     }
1679     if (newLen <= bufferCapacity) {
1680         return true;
1681     }
1682     int32_t newCapacity = nextCapacity(newLen);
1683     UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32));
1684     if (temp == NULL) {
1685         setToBogus();
1686         return false;
1687     }
1688     // The buffer has no contents to be copied.
1689     // It is always filled from scratch after this call.
1690     if (buffer != stackList) {
1691         uprv_free(buffer);
1692     }
1693     buffer = temp;
1694     bufferCapacity = newCapacity;
1695     return true;
1696 }
1697
1698 /**
1699  * Swap list and buffer.
1700  */
1701 void UnicodeSet::swapBuffers(void) {
1702     // swap list and buffer
1703     UChar32* temp = list;
1704     list = buffer;
1705     buffer = temp;
1706
1707     int32_t c = capacity;
1708     capacity = bufferCapacity;
1709     bufferCapacity = c;
1710 }
1711
1712 void UnicodeSet::setToBogus() {
1713     clear(); // Remove everything in the set.
1714     fFlags = kIsBogus;
1715 }
1716
1717 //----------------------------------------------------------------
1718 // Implementation: Fundamental operators
1719 //----------------------------------------------------------------
1720
1721 static inline UChar32 max(UChar32 a, UChar32 b) {
1722     return (a > b) ? a : b;
1723 }
1724
1725 // polarity = 0, 3 is normal: x xor y
1726 // polarity = 1, 2: x xor ~y == x === y
1727
1728 void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
1729     if (isFrozen() || isBogus()) {
1730         return;
1731     }
1732     if (!ensureBufferCapacity(len + otherLen)) {
1733         return;
1734     }
1735
1736     int32_t i = 0, j = 0, k = 0;
1737     UChar32 a = list[i++];
1738     UChar32 b;
1739     if (polarity == 1 || polarity == 2) {
1740         b = UNICODESET_LOW;
1741         if (other[j] == UNICODESET_LOW) { // skip base if already LOW
1742             ++j;
1743             b = other[j];
1744         }
1745     } else {
1746         b = other[j++];
1747     }
1748     // simplest of all the routines
1749     // sort the values, discarding identicals!
1750     for (;;) {
1751         if (a < b) {
1752             buffer[k++] = a;
1753             a = list[i++];
1754         } else if (b < a) {
1755             buffer[k++] = b;
1756             b = other[j++];
1757         } else if (a != UNICODESET_HIGH) { // at this point, a == b
1758             // discard both values!
1759             a = list[i++];
1760             b = other[j++];
1761         } else { // DONE!
1762             buffer[k++] = UNICODESET_HIGH;
1763             len = k;
1764             break;
1765         }
1766     }
1767     swapBuffers();
1768     releasePattern();
1769 }
1770
1771 // polarity = 0 is normal: x union y
1772 // polarity = 2: x union ~y
1773 // polarity = 1: ~x union y
1774 // polarity = 3: ~x union ~y
1775
/**
 * Merges the boundary list <code>other</code> into this set's
 * <code>list</code>.  The merged result is written to <code>buffer</code>,
 * terminated with UNICODESET_HIGH, and then swapped in via swapBuffers().
 * Any cached pattern is released afterwards.
 *
 * The call is a no-op if this set is frozen or bogus, if
 * <code>other</code> is NULL, or if ensureBufferCapacity() fails.
 *
 * @param other    list to merge in.  The loop only stops once it reads
 *                 UNICODESET_HIGH, so both <code>list</code> and
 *                 <code>other</code> are presumably terminated by that
 *                 value -- TODO confirm against callers.
 * @param otherLen length of <code>other</code>; used here only to size
 *                 <code>buffer</code> (len + otherLen), not to bound the loop.
 * @param polarity initial state of the merge, values 0..3; the comment
 *                 block preceding this function lists what each value means.
 */
void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
    // Immutable/invalid sets and a missing input list leave this set untouched.
    if (isFrozen() || isBogus() || other==NULL) {
        return;
    }
    // The merged list is built in buffer[]; make room for the sum of both lengths.
    if (!ensureBufferCapacity(len + otherLen)) {
        return;
    }

    // i indexes list[], j indexes other[], k is the write position in buffer[].
    // a and b hold the current (already fetched) values from list[] and other[].
    int32_t i = 0, j = 0, k = 0;
    UChar32 a = list[i++];
    UChar32 b = other[j++];
    // change from xor is that we have to check overlapping pairs
    // polarity bit 1 means a is second, bit 2 means b is.
    for (;;) {
        switch (polarity) {
          case 0: // both first; take lower if unequal
            if (a < b) { // take a
                // Back up over overlapping ranges in buffer[]
                if (k > 0 && a <= buffer[k-1]) {
                    // Pick latter end value in buffer[] vs. list[]
                    a = max(list[i], buffer[--k]);
                } else {
                    // No overlap
                    buffer[k++] = a;
                    a = list[i];
                }
                i++; // Common if/else code factored out
                polarity ^= 1;
            } else if (b < a) { // take b
                // Same backtracking as above, mirrored for other[]
                if (k > 0 && b <= buffer[k-1]) {
                    b = max(other[j], buffer[--k]);
                } else {
                    buffer[k++] = b;
                    b = other[j];
                }
                j++;
                polarity ^= 2;
            } else { // a == b, take a, drop b
                // Both lists reached the terminator at the same time: done.
                if (a == UNICODESET_HIGH) goto loop_end;
                // This is symmetrical; it doesn't matter if
                // we backtrack with a or b. - liu
                if (k > 0 && a <= buffer[k-1]) {
                    a = max(list[i], buffer[--k]);
                } else {
                    // No overlap
                    buffer[k++] = a;
                    a = list[i];
                }
                i++;
                polarity ^= 1;
                b = other[j++];
                polarity ^= 2;
            }
            break;
          case 3: // both second; take higher if unequal, and drop other
            if (b <= a) { // take a
                if (a == UNICODESET_HIGH) goto loop_end;
                buffer[k++] = a;
            } else { // take b
                if (b == UNICODESET_HIGH) goto loop_end;
                buffer[k++] = b;
            }
            // Both inputs advance regardless of which value was emitted.
            a = list[i++];
            polarity ^= 1;   // factored common code
            b = other[j++];
            polarity ^= 2;
            break;
          case 1: // a second, b first; if b < a, overlap
            if (a < b) { // no overlap, take a
                buffer[k++] = a; a = list[i++]; polarity ^= 1;
            } else if (b < a) { // OVERLAP, drop b
                b = other[j++];
                polarity ^= 2;
            } else { // a == b, drop both!
                if (a == UNICODESET_HIGH) goto loop_end;
                a = list[i++];
                polarity ^= 1;
                b = other[j++];
                polarity ^= 2;
            }
            break;
          case 2: // a first, b second; if a < b, overlap
            if (b < a) { // no overlap, take b
                buffer[k++] = b;
                b = other[j++];
                polarity ^= 2;
            } else  if (a < b) { // OVERLAP, drop a
                a = list[i++];
                polarity ^= 1;
            } else { // a == b, drop both!
                if (a == UNICODESET_HIGH) goto loop_end;
                a = list[i++];
                polarity ^= 1;
                b = other[j++];
                polarity ^= 2;
            }
            break;
        }
    }
 loop_end:
    buffer[k++] = UNICODESET_HIGH;    // terminate
    len = k;
    // Make the freshly built buffer the current list.
    swapBuffers();
    // The contents may have changed, so drop any cached pattern.
    releasePattern();
}
1881
1882 // polarity = 0 is normal: x intersect y
1883 // polarity = 2: x intersect ~y == set-minus
1884 // polarity = 1: ~x intersect y
1885 // polarity = 3: ~x intersect ~y
1886
/**
 * Intersects this set's <code>list</code> with the boundary list
 * <code>other</code> (see the polarity table above for the variants).
 * The result is written to <code>buffer</code>, terminated with
 * UNICODESET_HIGH, and then swapped in via swapBuffers().  Any cached
 * pattern is released afterwards.
 *
 * The call is a no-op if this set is frozen or bogus, or if
 * ensureBufferCapacity() fails.
 *
 * NOTE(review): unlike add(), this function does not check
 * <code>other</code> for NULL before dereferencing it -- confirm that no
 * caller can pass NULL.
 *
 * @param other    list to intersect with.  The loop only stops once it
 *                 reads UNICODESET_HIGH, so both <code>list</code> and
 *                 <code>other</code> are presumably terminated by that
 *                 value -- TODO confirm against callers.
 * @param otherLen length of <code>other</code>; used here only to size
 *                 <code>buffer</code> (len + otherLen), not to bound the loop.
 * @param polarity initial state of the merge, values 0..3.
 */
void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
    // Immutable or invalid sets are left untouched.
    if (isFrozen() || isBogus()) {
        return;
    }
    // The result is built in buffer[]; make room for the sum of both lengths.
    if (!ensureBufferCapacity(len + otherLen)) {
        return;
    }

    // i indexes list[], j indexes other[], k is the write position in buffer[].
    // a and b hold the current (already fetched) values from list[] and other[].
    int32_t i = 0, j = 0, k = 0;
    UChar32 a = list[i++];
    UChar32 b = other[j++];
    // change from xor is that we have to check overlapping pairs
    // polarity bit 1 means a is second, bit 2 means b is.
    for (;;) {
        switch (polarity) {
          case 0: // both first; drop the smaller
            if (a < b) { // drop a
                a = list[i++];
                polarity ^= 1;
            } else if (b < a) { // drop b
                b = other[j++];
                polarity ^= 2;
            } else { // a == b, take one, drop other
                // Both lists reached the terminator at the same time: done.
                if (a == UNICODESET_HIGH) goto loop_end;
                buffer[k++] = a;
                a = list[i++];
                polarity ^= 1;
                b = other[j++];
                polarity ^= 2;
            }
            break;
          case 3: // both second; take lower if unequal
            if (a < b) { // take a
                buffer[k++] = a;
                a = list[i++];
                polarity ^= 1;
            } else if (b < a) { // take b
                buffer[k++] = b;
                b = other[j++];
                polarity ^= 2;
            } else { // a == b, take one, drop other
                if (a == UNICODESET_HIGH) goto loop_end;
                buffer[k++] = a;
                a = list[i++];
                polarity ^= 1;
                b = other[j++];
                polarity ^= 2;
            }
            break;
          case 1: // a second, b first;
            if (a < b) { // NO OVERLAP, drop a
                a = list[i++];
                polarity ^= 1;
            } else if (b < a) { // OVERLAP, take b
                buffer[k++] = b;
                b = other[j++];
                polarity ^= 2;
            } else { // a == b, drop both!
                if (a == UNICODESET_HIGH) goto loop_end;
                a = list[i++];
                polarity ^= 1;
                b = other[j++];
                polarity ^= 2;
            }
            break;
          case 2: // a first, b second; if a < b, overlap
            if (b < a) { // no overlap, drop b
                b = other[j++];
                polarity ^= 2;
            } else  if (a < b) { // OVERLAP, take a
                buffer[k++] = a;
                a = list[i++];
                polarity ^= 1;
            } else { // a == b, drop both!
                if (a == UNICODESET_HIGH) goto loop_end;
                a = list[i++];
                polarity ^= 1;
                b = other[j++];
                polarity ^= 2;
            }
            break;
        }
    }
 loop_end:
    buffer[k++] = UNICODESET_HIGH;    // terminate
    len = k;
    // Make the freshly built buffer the current list.
    swapBuffers();
    // The contents may have changed, so drop any cached pattern.
    releasePattern();
}
1976
1977 /**
1978  * Append the <code>toPattern()</code> representation of a
1979  * string to the given <code>StringBuffer</code>.
1980  */
1981 void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
1982 escapeUnprintable) {
1983     UChar32 cp;
1984     for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1985         _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
1986     }
1987 }
1988
1989 /**
1990  * Append the <code>toPattern()</code> representation of a
1991  * character to the given <code>StringBuffer</code>.
1992  */
1993 void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
1994 escapeUnprintable) {
1995     if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1996         // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1997         // unprintable
1998         if (ICU_Utility::escapeUnprintable(buf, c)) {
1999             return;
2000         }
2001     }
2002     // Okay to let ':' pass through
2003     switch (c) {
2004     case SET_OPEN:
2005     case SET_CLOSE:
2006     case HYPHEN:
2007     case COMPLEMENT:
2008     case INTERSECTION:
2009     case BACKSLASH:
2010     case OPEN_BRACE:
2011     case CLOSE_BRACE:
2012     case COLON:
2013     case SymbolTable::SYMBOL_REF:
2014         buf.append(BACKSLASH);
2015         break;
2016     default:
2017         // Escape whitespace
2018         if (PatternProps::isWhiteSpace(c)) {
2019             buf.append(BACKSLASH);
2020         }
2021         break;
2022     }
2023     buf.append(c);
2024 }
2025
2026 /**
2027  * Append a string representation of this set to result.  This will be
2028  * a cleaned version of the string passed to applyPattern(), if there
2029  * is one.  Otherwise it will be generated.
2030  */
2031 UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
2032                                       UBool escapeUnprintable) const
2033 {
2034     if (pat != NULL) {
2035         int32_t i;
2036         int32_t backslashCount = 0;
2037         for (i=0; i<patLen; ) {
2038             UChar32 c;
2039             U16_NEXT(pat, i, patLen, c);
2040             if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
2041                 // If the unprintable character is preceded by an odd
2042                 // number of backslashes, then it has been escaped.
2043                 // Before unescaping it, we delete the final
2044                 // backslash.
2045                 if ((backslashCount % 2) == 1) {
2046                     result.truncate(result.length() - 1);
2047                 }
2048                 ICU_Utility::escapeUnprintable(result, c);
2049                 backslashCount = 0;
2050             } else {
2051                 result.append(c);
2052                 if (c == BACKSLASH) {
2053                     ++backslashCount;
2054                 } else {
2055                     backslashCount = 0;
2056                 }
2057             }
2058         }
2059         return result;
2060     }
2061
2062     return _generatePattern(result, escapeUnprintable);
2063 }
2064
2065 /**
2066  * Returns a string representation of this set.  If the result of
2067  * calling this function is passed to a UnicodeSet constructor, it
2068  * will produce another set that is equal to this one.
2069  */
2070 UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
2071                                      UBool escapeUnprintable) const
2072 {
2073     result.truncate(0);
2074     return _toPattern(result, escapeUnprintable);
2075 }
2076
2077 /**
2078  * Generate and append a string representation of this set to result.
2079  * This does not use this.pat, the cleaned up copy of the string
2080  * passed to applyPattern().
2081  */
2082 UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
2083                                             UBool escapeUnprintable) const
2084 {
2085     result.append(SET_OPEN);
2086
2087 //  // Check against the predefined categories.  We implicitly build
2088 //  // up ALL category sets the first time toPattern() is called.
2089 //  for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
2090 //      if (*this == getCategorySet(cat)) {
2091 //          result.append(COLON);
2092 //          result.append(CATEGORY_NAMES, cat*2, 2);
2093 //          return result.append(CATEGORY_CLOSE);
2094 //      }
2095 //  }
2096
2097     int32_t count = getRangeCount();
2098
2099     // If the set contains at least 2 intervals and includes both
2100     // MIN_VALUE and MAX_VALUE, then the inverse representation will
2101     // be more economical.
2102     if (count > 1 &&
2103         getRangeStart(0) == MIN_VALUE &&
2104         getRangeEnd(count-1) == MAX_VALUE) {
2105
2106         // Emit the inverse
2107         result.append(COMPLEMENT);
2108
2109         for (int32_t i = 1; i < count; ++i) {
2110             UChar32 start = getRangeEnd(i-1)+1;
2111             UChar32 end = getRangeStart(i)-1;
2112             _appendToPat(result, start, escapeUnprintable);
2113             if (start != end) {
2114                 if ((start+1) != end) {
2115                     result.append(HYPHEN);
2116                 }
2117                 _appendToPat(result, end, escapeUnprintable);
2118             }
2119         }
2120     }
2121
2122     // Default; emit the ranges as pairs
2123     else {
2124         for (int32_t i = 0; i < count; ++i) {
2125             UChar32 start = getRangeStart(i);
2126             UChar32 end = getRangeEnd(i);
2127             _appendToPat(result, start, escapeUnprintable);
2128             if (start != end) {
2129                 if ((start+1) != end) {
2130                     result.append(HYPHEN);
2131                 }
2132                 _appendToPat(result, end, escapeUnprintable);
2133             }
2134         }
2135     }
2136
2137     if (strings != nullptr) {
2138         for (int32_t i = 0; i<strings->size(); ++i) {
2139             result.append(OPEN_BRACE);
2140             _appendToPat(result,
2141                          *(const UnicodeString*) strings->elementAt(i),
2142                          escapeUnprintable);
2143             result.append(CLOSE_BRACE);
2144         }
2145     }
2146     return result.append(SET_CLOSE);
2147 }
2148
2149 /**
2150 * Release existing cached pattern
2151 */
2152 void UnicodeSet::releasePattern() {
2153     if (pat) {
2154         uprv_free(pat);
2155         pat = NULL;
2156         patLen = 0;
2157     }
2158 }
2159
2160 /**
2161 * Set the new pattern to cache.
2162 */
2163 void UnicodeSet::setPattern(const char16_t *newPat, int32_t newPatLen) {
2164     releasePattern();
2165     pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar));
2166     if (pat) {
2167         patLen = newPatLen;
2168         u_memcpy(pat, newPat, patLen);
2169         pat[patLen] = 0;
2170     }
2171     // else we don't care if malloc failed. This was just a nice cache.
2172     // We can regenerate an equivalent pattern later when requested.
2173 }
2174
2175 UnicodeSet *UnicodeSet::freeze() {
2176     if(!isFrozen() && !isBogus()) {
2177         compact();
2178
2179         // Optimize contains() and span() and similar functions.
2180         if (hasStrings()) {
2181             stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
2182             if (stringSpan == nullptr) {
2183                 setToBogus();
2184                 return this;
2185             } else if (!stringSpan->needsStringSpanUTF16()) {
2186                 // All strings are irrelevant for span() etc. because
2187                 // all of each string's code points are contained in this set.
2188                 // Do not check needsStringSpanUTF8() because UTF-8 has at most as
2189                 // many relevant strings as UTF-16.
2190                 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
2191                 delete stringSpan;
2192                 stringSpan = NULL;
2193             }
2194         }
2195         if (stringSpan == NULL) {
2196             // No span-relevant strings: Optimize for code point spans.
2197             bmpSet=new BMPSet(list, len);
2198             if (bmpSet == NULL) { // Check for memory allocation error.
2199                 setToBogus();
2200             }
2201         }
2202     }
2203     return this;
2204 }
2205
2206 int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2207     if(length>0 && bmpSet!=NULL) {
2208         return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
2209     }
2210     if(length<0) {
2211         length=u_strlen(s);
2212     }
2213     if(length==0) {
2214         return 0;
2215     }
2216     if(stringSpan!=NULL) {
2217         return stringSpan->span(s, length, spanCondition);
2218     } else if(hasStrings()) {
2219         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2220                             UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
2221                             UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
2222         UnicodeSetStringSpan strSpan(*this, *strings, which);
2223         if(strSpan.needsStringSpanUTF16()) {
2224             return strSpan.span(s, length, spanCondition);
2225         }
2226     }
2227
2228     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2229         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2230     }
2231
2232     UChar32 c;
2233     int32_t start=0, prev=0;
2234     do {
2235         U16_NEXT(s, start, length, c);
2236         if(spanCondition!=contains(c)) {
2237             break;
2238         }
2239     } while((prev=start)<length);
2240     return prev;
2241 }
2242
2243 int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2244     if(length>0 && bmpSet!=NULL) {
2245         return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
2246     }
2247     if(length<0) {
2248         length=u_strlen(s);
2249     }
2250     if(length==0) {
2251         return 0;
2252     }
2253     if(stringSpan!=NULL) {
2254         return stringSpan->spanBack(s, length, spanCondition);
2255     } else if(hasStrings()) {
2256         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2257                             UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
2258                             UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
2259         UnicodeSetStringSpan strSpan(*this, *strings, which);
2260         if(strSpan.needsStringSpanUTF16()) {
2261             return strSpan.spanBack(s, length, spanCondition);
2262         }
2263     }
2264
2265     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2266         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2267     }
2268
2269     UChar32 c;
2270     int32_t prev=length;
2271     do {
2272         U16_PREV(s, 0, length, c);
2273         if(spanCondition!=contains(c)) {
2274             break;
2275         }
2276     } while((prev=length)>0);
2277     return prev;
2278 }
2279
2280 int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2281     if(length>0 && bmpSet!=NULL) {
2282         const uint8_t *s0=(const uint8_t *)s;
2283         return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);
2284     }
2285     if(length<0) {
2286         length=(int32_t)uprv_strlen(s);
2287     }
2288     if(length==0) {
2289         return 0;
2290     }
2291     if(stringSpan!=NULL) {
2292         return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
2293     } else if(hasStrings()) {
2294         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2295                             UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
2296                             UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
2297         UnicodeSetStringSpan strSpan(*this, *strings, which);
2298         if(strSpan.needsStringSpanUTF8()) {
2299             return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
2300         }
2301     }
2302
2303     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2304         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2305     }
2306
2307     UChar32 c;
2308     int32_t start=0, prev=0;
2309     do {
2310         U8_NEXT_OR_FFFD(s, start, length, c);
2311         if(spanCondition!=contains(c)) {
2312             break;
2313         }
2314     } while((prev=start)<length);
2315     return prev;
2316 }
2317
2318 int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2319     if(length>0 && bmpSet!=NULL) {
2320         const uint8_t *s0=(const uint8_t *)s;
2321         return bmpSet->spanBackUTF8(s0, length, spanCondition);
2322     }
2323     if(length<0) {
2324         length=(int32_t)uprv_strlen(s);
2325     }
2326     if(length==0) {
2327         return 0;
2328     }
2329     if(stringSpan!=NULL) {
2330         return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
2331     } else if(hasStrings()) {
2332         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2333                             UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
2334                             UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
2335         UnicodeSetStringSpan strSpan(*this, *strings, which);
2336         if(strSpan.needsStringSpanUTF8()) {
2337             return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
2338         }
2339     }
2340
2341     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2342         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2343     }
2344
2345     UChar32 c;
2346     int32_t prev=length;
2347     do {
2348         U8_PREV_OR_FFFD(s, 0, length, c);
2349         if(spanCondition!=contains(c)) {
2350             break;
2351         }
2352     } while((prev=length)>0);
2353     return prev;
2354 }
2355
2356 U_NAMESPACE_END