icuSources/common/uniset.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 1999-2008, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   Date        Name        Description
   7 *   10/20/99    alan        Creation.
   8 **********************************************************************
   9 */
  10
  11 #include "unicode/utypes.h"
  12 #include "unicode/uniset.h"
  13 #include "unicode/parsepos.h"
  14 #include "unicode/symtable.h"
  15 #include "ruleiter.h"
  16 #include "cmemory.h"
  17 #include "cstring.h"
  18 #include "uhash.h"
  19 #include "util.h"
  20 #include "uvector.h"
  21 #include "charstr.h"
  22 #include "ustrfmt.h"
  23 #include "uassert.h"
  24 #include "hash.h"
  25 #include "bmpset.h"
  26 #include "unisetspan.h"
  27
  28 // Define UChar constants using hex for EBCDIC compatibility
  29 // Used #define to reduce private static exports and memory access time.
  30 #define SET_OPEN        ((UChar)0x005B) /*[*/
  31 #define SET_CLOSE       ((UChar)0x005D) /*]*/
  32 #define HYPHEN          ((UChar)0x002D) /*-*/
  33 #define COMPLEMENT      ((UChar)0x005E) /*^*/
  34 #define COLON           ((UChar)0x003A) /*:*/
  35 #define BACKSLASH       ((UChar)0x005C) /*\*/
  36 #define INTERSECTION    ((UChar)0x0026) /*&*/
  37 #define UPPER_U         ((UChar)0x0055) /*U*/
  38 #define LOWER_U         ((UChar)0x0075) /*u*/
  39 #define OPEN_BRACE      ((UChar)123)    /*{*/
  40 #define CLOSE_BRACE     ((UChar)125)    /*}*/
  41 #define UPPER_P         ((UChar)0x0050) /*P*/
  42 #define LOWER_P         ((UChar)0x0070) /*p*/
  43 #define UPPER_N         ((UChar)78)     /*N*/
  44 #define EQUALS          ((UChar)0x003D) /*=*/
  45
  46 // HIGH_VALUE > all valid values. 110000 for codepoints
  47 #define UNICODESET_HIGH 0x0110000
  48
  49 // LOW <= all valid values. ZERO for codepoints
  50 #define UNICODESET_LOW 0x000000
  51
  52 // initial storage. Must be >= 0
  53 #define START_EXTRA 16
  54
  55 // extra amount for growth. Must be >= 0
  56 #define GROW_EXTRA START_EXTRA
  57
  58 U_NAMESPACE_BEGIN
  59
  60 SymbolTable::~SymbolTable() {}
  61
  62 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
  63
  64 /**
  65  * Modify the given UChar32 variable so that it is in range, by
  66  * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
  67  * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
  68  * It modifies its argument in-place and also returns it.
  69  */
  70 static inline UChar32 pinCodePoint(UChar32& c) {
  71     if (c < UNICODESET_LOW) {
  72         c = UNICODESET_LOW;
  73     } else if (c > (UNICODESET_HIGH-1)) {
  74         c = (UNICODESET_HIGH-1);
  75     }
  76     return c;
  77 }
  78
  79 //----------------------------------------------------------------
  80 // Debugging
  81 //----------------------------------------------------------------
  82
  83 // DO NOT DELETE THIS CODE.  This code is used to debug memory leaks.
  84 // To enable the debugging, define the symbol DEBUG_MEM in the line
  85 // below.  This will result in text being sent to stdout that looks
  86 // like this:
  87 //   DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
  88 //   DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
  89 // Each line lists a construction (ct) or destruction (dt) event, the
  90 // object address, the number of outstanding objects after the event,
  91 // and the pattern of the object in question.
  92
  93 // #define DEBUG_MEM
  94
  95 #ifdef DEBUG_MEM
  96 #include <stdio.h>
  97 static int32_t _dbgCount = 0;
  98
  99 static inline void _dbgct(UnicodeSet* set) {
 100     UnicodeString str;
 101     set->toPattern(str, TRUE);
 102     char buf[40];
 103     str.extract(0, 39, buf, "");
 104     printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
 105 }
 106
 107 static inline void _dbgdt(UnicodeSet* set) {
 108     UnicodeString str;
 109     set->toPattern(str, TRUE);
 110     char buf[40];
 111     str.extract(0, 39, buf, "");
 112     printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
 113 }
 114
 115 #else
 116
 117 #define _dbgct(set)
 118 #define _dbgdt(set)
 119
 120 #endif
 121
 122 //----------------------------------------------------------------
 123 // UnicodeString in UVector support
 124 //----------------------------------------------------------------
 125
 126 static void U_CALLCONV cloneUnicodeString(UHashTok *dst, UHashTok *src) {
 127     dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
 128 }
 129
 130 static int8_t U_CALLCONV compareUnicodeString(UHashTok t1, UHashTok t2) {
 131     const UnicodeString &a = *(const UnicodeString*)t1.pointer;
 132     const UnicodeString &b = *(const UnicodeString*)t2.pointer;
 133     return a.compare(b);
 134 }
 135
 136 //----------------------------------------------------------------
 137 // Constructors &c
 138 //----------------------------------------------------------------
 139
 140 /**
 141  * Constructs an empty set.
 142  */
 143 UnicodeSet::UnicodeSet() :
 144     len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
 145     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
 146     fFlags(0)
 147 {
 148     UErrorCode status = U_ZERO_ERROR;
 149     allocateStrings(status);
 150     if (U_FAILURE(status)) {
 151         return;
 152     }
 153     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 154     if(list!=NULL){
 155         list[0] = UNICODESET_HIGH;
 156     } else { // If memory allocation failed, set to bogus state.
 157         setToBogus();
 158         return;
 159     }
 160     _dbgct(this);
 161 }
 162
 163 /**
 164  * Constructs a set containing the given range. If <code>end >
 165  * start</code> then an empty set is created.
 166  *
 167  * @param start first character, inclusive, of range
 168  * @param end last character, inclusive, of range
 169  */
 170 UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
 171     len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
 172     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
 173     fFlags(0)
 174 {
 175     UErrorCode status = U_ZERO_ERROR;
 176     allocateStrings(status);
 177     if (U_FAILURE(status)) {
 178         return;
 179     }
 180     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 181     if(list!=NULL){
 182         list[0] = UNICODESET_HIGH;
 183         complement(start, end);
 184     } else { // If memory allocation failed, set to bogus state.
 185         setToBogus();
 186         return;
 187     }
 188     _dbgct(this);
 189 }
 190
 191 /**
 192  * Constructs a set that is identical to the given UnicodeSet.
 193  */
 194 UnicodeSet::UnicodeSet(const UnicodeSet& o) :
 195     UnicodeFilter(o),
 196     len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0),
 197     bmpSet(0),
 198     buffer(0), bufferCapacity(0),
 199     patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
 200     fFlags(0)
 201 {
 202     UErrorCode status = U_ZERO_ERROR;
 203     allocateStrings(status);
 204     if (U_FAILURE(status)) {
 205         return;
 206     }
 207     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 208     if(list!=NULL){
 209         *this = o;
 210     } else { // If memory allocation failed, set to bogus state.
 211         setToBogus();
 212         return;
 213     }
 214     _dbgct(this);
 215 }
 216
 217 // Copy-construct as thawed.
 218 UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) :
 219     UnicodeFilter(o),
 220     len(0), capacity(o.len + GROW_EXTRA), list(0),
 221     bmpSet(0),
 222     buffer(0), bufferCapacity(0),
 223     patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
 224     fFlags(0)
 225 {
 226     UErrorCode status = U_ZERO_ERROR;
 227     allocateStrings(status);
 228     if (U_FAILURE(status)) {
 229         return;
 230     }
 231     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 232     if(list!=NULL){
 233         // *this = o except for bmpSet and stringSpan
 234         len = o.len;
 235         uprv_memcpy(list, o.list, len*sizeof(UChar32));
 236         if (strings != NULL && o.strings != NULL) {
 237             strings->assign(*o.strings, cloneUnicodeString, status);
 238         } else { // Invalid strings.
 239             setToBogus();
 240             return;
 241         }
 242         if (o.pat) {
 243             setPattern(UnicodeString(o.pat, o.patLen));
 244         }
 245     } else { // If memory allocation failed, set to bogus state.
 246         setToBogus();
 247         return;
 248     }
 249     _dbgct(this);
 250 }
 251
 252 /**
 253  * Destructs the set.
 254  */
 255 UnicodeSet::~UnicodeSet() {
 256     _dbgdt(this); // first!
 257     uprv_free(list);
 258     delete bmpSet;
 259     if (buffer) {
 260         uprv_free(buffer);
 261     }
 262     delete strings;
 263     delete stringSpan;
 264     releasePattern();
 265 }
 266
 267 /**
 268  * Assigns this object to be a copy of another.
 269  */
 270 UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
 271     if (this == &o) {
 272         return *this;
 273     }
 274     if (isFrozen()) {
 275         return *this;
 276     }
 277     if (o.isBogus()) {
 278         setToBogus();
 279         return *this;
 280     }
 281     UErrorCode ec = U_ZERO_ERROR;
 282     ensureCapacity(o.len, ec);
 283     if (U_FAILURE(ec)) {
 284         return *this; // There is no way to report this error :-(
 285     }
 286     len = o.len;
 287     uprv_memcpy(list, o.list, len*sizeof(UChar32));
 288     if (o.bmpSet == NULL) {
 289         bmpSet = NULL;
 290     } else {
 291         bmpSet = new BMPSet(*o.bmpSet, list, len);
 292         if (bmpSet == NULL) { // Check for memory allocation error.
 293             setToBogus();
 294             return *this;
 295         }
 296     }
 297     if (strings != NULL && o.strings != NULL) {
 298         strings->assign(*o.strings, cloneUnicodeString, ec);
 299     } else { // Invalid strings.
 300         setToBogus();
 301         return *this;
 302     }
 303     if (o.stringSpan == NULL) {
 304         stringSpan = NULL;
 305     } else {
 306         stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
 307         if (stringSpan == NULL) { // Check for memory allocation error.
 308             setToBogus();
 309             return *this;
 310         }
 311     }
 312     releasePattern();
 313     if (o.pat) {
 314         setPattern(UnicodeString(o.pat, o.patLen));
 315     }
 316     return *this;
 317 }
 318
 319 /**
 320  * Returns a copy of this object.  All UnicodeMatcher objects have
 321  * to support cloning in order to allow classes using
 322  * UnicodeMatchers, such as Transliterator, to implement cloning.
 323  */
 324 UnicodeFunctor* UnicodeSet::clone() const {
 325     return new UnicodeSet(*this);
 326 }
 327
 328 UnicodeFunctor *UnicodeSet::cloneAsThawed() const {
 329     return new UnicodeSet(*this, TRUE);
 330 }
 331
 332 /**
 333  * Compares the specified object with this set for equality.  Returns
 334  * <tt>true</tt> if the two sets
 335  * have the same size, and every member of the specified set is
 336  * contained in this set (or equivalently, every member of this set is
 337  * contained in the specified set).
 338  *
 339  * @param o set to be compared for equality with this set.
 340  * @return <tt>true</tt> if the specified set is equal to this set.
 341  */
 342 UBool UnicodeSet::operator==(const UnicodeSet& o) const {
 343     if (len != o.len) return FALSE;
 344     for (int32_t i = 0; i < len; ++i) {
 345         if (list[i] != o.list[i]) return FALSE;
 346     }
 347     if (*strings != *o.strings) return FALSE;
 348     return TRUE;
 349 }
 350
 351 /**
 352  * Returns the hash code value for this set.
 353  *
 354  * @return the hash code value for this set.
 355  * @see Object#hashCode()
 356  */
 357 int32_t UnicodeSet::hashCode(void) const {
 358     int32_t result = len;
 359     for (int32_t i = 0; i < len; ++i) {
 360         result *= 1000003;
 361         result += list[i];
 362     }
 363     return result;
 364 }
 365
 366 //----------------------------------------------------------------
 367 // Public API
 368 //----------------------------------------------------------------
 369
 370 /**
 371  * Returns the number of elements in this set (its cardinality),
 372  * Note than the elements of a set may include both individual
 373  * codepoints and strings.
 374  *
 375  * @return the number of elements in this set (its cardinality).
 376  */
 377 int32_t UnicodeSet::size(void) const {
 378     int32_t n = 0;
 379     int32_t count = getRangeCount();
 380     for (int32_t i = 0; i < count; ++i) {
 381         n += getRangeEnd(i) - getRangeStart(i) + 1;
 382     }
 383     return n + strings->size();
 384 }
 385
 386 /**
 387  * Returns <tt>true</tt> if this set contains no elements.
 388  *
 389  * @return <tt>true</tt> if this set contains no elements.
 390  */
 391 UBool UnicodeSet::isEmpty(void) const {
 392     return len == 1 && strings->size() == 0;
 393 }
 394
 395 /**
 396  * Returns true if this set contains the given character.
 397  * @param c character to be checked for containment
 398  * @return true if the test condition is met
 399  */
 400 UBool UnicodeSet::contains(UChar32 c) const {
 401     // Set i to the index of the start item greater than ch
 402     // We know we will terminate without length test!
 403     // LATER: for large sets, add binary search
 404     //int32_t i = -1;
 405     //for (;;) {
 406     //    if (c < list[++i]) break;
 407     //}
 408     if (bmpSet != NULL) {
 409         return bmpSet->contains(c);
 410     }
 411     if (stringSpan != NULL) {
 412         return stringSpan->contains(c);
 413     }
 414     if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
 415         return FALSE;
 416     }
 417     int32_t i = findCodePoint(c);
 418     return (UBool)(i & 1); // return true if odd
 419 }
 420
 421 /**
 422  * Returns the smallest value i such that c < list[i].  Caller
 423  * must ensure that c is a legal value or this method will enter
 424  * an infinite loop.  This method performs a binary search.
 425  * @param c a character in the range MIN_VALUE..MAX_VALUE
 426  * inclusive
 427  * @return the smallest integer i in the range 0..len-1,
 428  * inclusive, such that c < list[i]
 429  */
 430 int32_t UnicodeSet::findCodePoint(UChar32 c) const {
 431     /* Examples:
 432                                        findCodePoint(c)
 433        set              list[]         c=0 1 3 4 7 8
 434        ===              ==============   ===========
 435        []               [110000]         0 0 0 0 0 0
 436        [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
 437        [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
 438        [:Any:]          [0, 110000]      1 1 1 1 1 1
 439      */
 440
 441     // Return the smallest i such that c < list[i].  Assume
 442     // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
 443     if (c < list[0])
 444         return 0;
 445     // High runner test.  c is often after the last range, so an
 446     // initial check for this condition pays off.
 447     int32_t lo = 0;
 448     int32_t hi = len - 1;
 449     if (lo >= hi || c >= list[hi-1])
 450         return hi;
 451     // invariant: c >= list[lo]
 452     // invariant: c < list[hi]
 453     for (;;) {
 454         int32_t i = (lo + hi) >> 1;
 455         if (i == lo) {
 456             break; // Found!
 457         } else if (c < list[i]) {
 458             hi = i;
 459         } else {
 460             lo = i;
 461         }
 462     }
 463     return hi;
 464 }
 465
 466 /**
 467  * Returns true if this set contains every character
 468  * of the given range.
 469  * @param start first character, inclusive, of the range
 470  * @param end last character, inclusive, of the range
 471  * @return true if the test condition is met
 472  */
 473 UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
 474     //int32_t i = -1;
 475     //for (;;) {
 476     //    if (start < list[++i]) break;
 477     //}
 478     int32_t i = findCodePoint(start);
 479     return ((i & 1) != 0 && end < list[i]);
 480 }
 481
 482 /**
 483  * Returns <tt>true</tt> if this set contains the given
 484  * multicharacter string.
 485  * @param s string to be checked for containment
 486  * @return <tt>true</tt> if this set contains the specified string
 487  */
 488 UBool UnicodeSet::contains(const UnicodeString& s) const {
 489     if (s.length() == 0) return FALSE;
 490     int32_t cp = getSingleCP(s);
 491     if (cp < 0) {
 492         return strings->contains((void*) &s);
 493     } else {
 494         return contains((UChar32) cp);
 495     }
 496 }
 497
 498 /**
 499  * Returns true if this set contains all the characters and strings
 500  * of the given set.
 501  * @param c set to be checked for containment
 502  * @return true if the test condition is met
 503  */
 504 UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
 505     // The specified set is a subset if all of its pairs are contained in
 506     // this set.  It's possible to code this more efficiently in terms of
 507     // direct manipulation of the inversion lists if the need arises.
 508     int32_t n = c.getRangeCount();
 509     for (int i=0; i<n; ++i) {
 510         if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
 511             return FALSE;
 512         }
 513     }
 514     if (!strings->containsAll(*c.strings)) return FALSE;
 515     return TRUE;
 516 }
 517
 518 /**
 519  * Returns true if this set contains all the characters
 520  * of the given string.
 521  * @param s string containing characters to be checked for containment
 522  * @return true if the test condition is met
 523  */
 524 UBool UnicodeSet::containsAll(const UnicodeString& s) const {
 525     return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
 526                    s.length());
 527 }
 528
 529 /**
 530  * Returns true if this set contains none of the characters
 531  * of the given range.
 532  * @param start first character, inclusive, of the range
 533  * @param end last character, inclusive, of the range
 534  * @return true if the test condition is met
 535  */
 536 UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
 537     //int32_t i = -1;
 538     //for (;;) {
 539     //    if (start < list[++i]) break;
 540     //}
 541     int32_t i = findCodePoint(start);
 542     return ((i & 1) == 0 && end < list[i]);
 543 }
 544
 545 /**
 546  * Returns true if this set contains none of the characters and strings
 547  * of the given set.
 548  * @param c set to be checked for containment
 549  * @return true if the test condition is met
 550  */
 551 UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
 552     // The specified set is a subset if all of its pairs are contained in
 553     // this set.  It's possible to code this more efficiently in terms of
 554     // direct manipulation of the inversion lists if the need arises.
 555     int32_t n = c.getRangeCount();
 556     for (int32_t i=0; i<n; ++i) {
 557         if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
 558             return FALSE;
 559         }
 560     }
 561     if (!strings->containsNone(*c.strings)) return FALSE;
 562     return TRUE;
 563 }
 564
 565 /**
 566  * Returns true if this set contains none of the characters
 567  * of the given string.
 568  * @param s string containing characters to be checked for containment
 569  * @return true if the test condition is met
 570  */
 571 UBool UnicodeSet::containsNone(const UnicodeString& s) const {
 572     return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
 573                    s.length());
 574 }
 575
 576 /**
 577  * Returns <tt>true</tt> if this set contains any character whose low byte
 578  * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
 579  * indexing.
 580  */
 581 UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
 582     /* The index value v, in the range [0,255], is contained in this set if
 583      * it is contained in any pair of this set.  Pairs either have the high
 584      * bytes equal, or unequal.  If the high bytes are equal, then we have
 585      * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
 586      * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
 587      * Then v is contained if xx <= v || v <= yy.  (This is identical to the
 588      * time zone month containment logic.)
 589      */
 590     int32_t i;
 591     int32_t rangeCount=getRangeCount();
 592     for (i=0; i<rangeCount; ++i) {
 593         UChar32 low = getRangeStart(i);
 594         UChar32 high = getRangeEnd(i);
 595         if ((low & ~0xFF) == (high & ~0xFF)) {
 596             if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
 597                 return TRUE;
 598             }
 599         } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
 600             return TRUE;
 601         }
 602     }
 603     if (strings->size() != 0) {
 604         for (i=0; i<strings->size(); ++i) {
 605             const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
 606             //if (s.length() == 0) {
 607             //    // Empty strings match everything
 608             //    return TRUE;
 609             //}
 610             // assert(s.length() != 0); // We enforce this elsewhere
 611             UChar32 c = s.char32At(0);
 612             if ((c & 0xFF) == v) {
 613                 return TRUE;
 614             }
 615         }
 616     }
 617     return FALSE;
 618 }
 619
 620 /**
 621  * Implementation of UnicodeMatcher::matches().  Always matches the
 622  * longest possible multichar string.
 623  */
 624 UMatchDegree UnicodeSet::matches(const Replaceable& text,
 625                                  int32_t& offset,
 626                                  int32_t limit,
 627                                  UBool incremental) {
 628     if (offset == limit) {
 629         // Strings, if any, have length != 0, so we don't worry
 630         // about them here.  If we ever allow zero-length strings
 631         // we much check for them here.
 632         if (contains(U_ETHER)) {
 633             return incremental ? U_PARTIAL_MATCH : U_MATCH;
 634         } else {
 635             return U_MISMATCH;
 636         }
 637     } else {
 638         if (strings->size() != 0) { // try strings first
 639
 640             // might separate forward and backward loops later
 641             // for now they are combined
 642
 643             // TODO Improve efficiency of this, at least in the forward
 644             // direction, if not in both.  In the forward direction we
 645             // can assume the strings are sorted.
 646
 647             int32_t i;
 648             UBool forward = offset < limit;
 649
 650             // firstChar is the leftmost char to match in the
 651             // forward direction or the rightmost char to match in
 652             // the reverse direction.
 653             UChar firstChar = text.charAt(offset);
 654
 655             // If there are multiple strings that can match we
 656             // return the longest match.
 657             int32_t highWaterLength = 0;
 658
 659             for (i=0; i<strings->size(); ++i) {
 660                 const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
 661
 662                 //if (trial.length() == 0) {
 663                 //    return U_MATCH; // null-string always matches
 664                 //}
 665                 // assert(trial.length() != 0); // We ensure this elsewhere
 666
 667                 UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
 668
 669                 // Strings are sorted, so we can optimize in the
 670                 // forward direction.
 671                 if (forward && c > firstChar) break;
 672                 if (c != firstChar) continue;
 673
 674                 int32_t matchLen = matchRest(text, offset, limit, trial);
 675
 676                 if (incremental) {
 677                     int32_t maxLen = forward ? limit-offset : offset-limit;
 678                     if (matchLen == maxLen) {
 679                         // We have successfully matched but only up to limit.
 680                         return U_PARTIAL_MATCH;
 681                     }
 682                 }
 683
 684                 if (matchLen == trial.length()) {
 685                     // We have successfully matched the whole string.
 686                     if (matchLen > highWaterLength) {
 687                         highWaterLength = matchLen;
 688                     }
 689                     // In the forward direction we know strings
 690                     // are sorted so we can bail early.
 691                     if (forward && matchLen < highWaterLength) {
 692                         break;
 693                     }
 694                     continue;
 695                 }
 696             }
 697
 698             // We've checked all strings without a partial match.
 699             // If we have full matches, return the longest one.
 700             if (highWaterLength != 0) {
 701                 offset += forward ? highWaterLength : -highWaterLength;
 702                 return U_MATCH;
 703             }
 704         }
 705         return UnicodeFilter::matches(text, offset, limit, incremental);
 706     }
 707 }
 708
 709 /**
 710  * Returns the longest match for s in text at the given position.
 711  * If limit > start then match forward from start+1 to limit
 712  * matching all characters except s.charAt(0).  If limit < start,
 713  * go backward starting from start-1 matching all characters
 714  * except s.charAt(s.length()-1).  This method assumes that the
 715  * first character, text.charAt(start), matches s, so it does not
 716  * check it.
 717  * @param text the text to match
 718  * @param start the first character to match.  In the forward
 719  * direction, text.charAt(start) is matched against s.charAt(0).
 720  * In the reverse direction, it is matched against
 721  * s.charAt(s.length()-1).
 722  * @param limit the limit offset for matching, either last+1 in
 723  * the forward direction, or last-1 in the reverse direction,
 724  * where last is the index of the last character to match.
 725  * @return If part of s matches up to the limit, return |limit -
 726  * start|.  If all of s matches before reaching the limit, return
 727  * s.length().  If there is a mismatch between s and text, return
 728  * 0
 729  */
 730 int32_t UnicodeSet::matchRest(const Replaceable& text,
 731                               int32_t start, int32_t limit,
 732                               const UnicodeString& s) {
 733     int32_t i;
 734     int32_t maxLen;
 735     int32_t slen = s.length();
 736     if (start < limit) {
 737         maxLen = limit - start;
 738         if (maxLen > slen) maxLen = slen;
 739         for (i = 1; i < maxLen; ++i) {
 740             if (text.charAt(start + i) != s.charAt(i)) return 0;
 741         }
 742     } else {
 743         maxLen = start - limit;
 744         if (maxLen > slen) maxLen = slen;
 745         --slen; // <=> slen = s.length() - 1;
 746         for (i = 1; i < maxLen; ++i) {
 747             if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
 748         }
 749     }
 750     return maxLen;
 751 }
 752
 753 /**
 754  * Implement of UnicodeMatcher
 755  */
 756 void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
 757     toUnionTo.addAll(*this);
 758 }
 759
 760 /**
 761  * Returns the index of the given character within this set, where
 762  * the set is ordered by ascending code point.  If the character
 763  * is not in this set, return -1.  The inverse of this method is
 764  * <code>charAt()</code>.
 765  * @return an index from 0..size()-1, or -1
 766  */
 767 int32_t UnicodeSet::indexOf(UChar32 c) const {
 768     if (c < MIN_VALUE || c > MAX_VALUE) {
 769         return -1;
 770     }
 771     int32_t i = 0;
 772     int32_t n = 0;
 773     for (;;) {
 774         UChar32 start = list[i++];
 775         if (c < start) {
 776             return -1;
 777         }
 778         UChar32 limit = list[i++];
 779         if (c < limit) {
 780             return n + c - start;
 781         }
 782         n += limit - start;
 783     }
 784 }
 785
 786 /**
 787  * Returns the character at the given index within this set, where
 788  * the set is ordered by ascending code point.  If the index is
 789  * out of range, return (UChar32)-1.  The inverse of this method is
 790  * <code>indexOf()</code>.
 791  * @param index an index from 0..size()-1
 792  * @return the character at the given index, or (UChar32)-1.
 793  */
 794 UChar32 UnicodeSet::charAt(int32_t index) const {
 795     if (index >= 0) {
 796         // len2 is the largest even integer <= len, that is, it is len
 797         // for even values and len-1 for odd values.  With odd values
 798         // the last entry is UNICODESET_HIGH.
 799         int32_t len2 = len & ~1;
 800         for (int32_t i=0; i < len2;) {
 801             UChar32 start = list[i++];
 802             int32_t count = list[i++] - start;
 803             if (index < count) {
 804                 return (UChar32)(start + index);
 805             }
 806             index -= count;
 807         }
 808     }
 809     return (UChar32)-1;
 810 }
 811
 812 /**
 813  * Make this object represent the range <code>start - end</code>.
 814  * If <code>end > start</code> then this object is set to an
 815  * an empty range.
 816  *
 817  * @param start first character in the set, inclusive
 818  * @rparam end last character in the set, inclusive
 819  */
 820 UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
 821     clear();
 822     complement(start, end);
 823     return *this;
 824 }
 825
 826 /**
 827  * Adds the specified range to this set if it is not already
 828  * present.  If this set already contains the specified range,
 829  * the call leaves this set unchanged.  If <code>end > start</code>
 830  * then an empty range is added, leaving the set unchanged.
 831  *
 832  * @param start first character, inclusive, of range to be added
 833  * to this set.
 834  * @param end last character, inclusive, of range to be added
 835  * to this set.
 836  */
 837 UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
 838     if (pinCodePoint(start) < pinCodePoint(end)) {
 839         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
 840         add(range, 2, 0);
 841     } else if (start == end) {
 842         add(start);
 843     }
 844     return *this;
 845 }
 846
 847 // #define DEBUG_US_ADD
 848
 849 #ifdef DEBUG_US_ADD
 850 #include <stdio.h>
 851 void dump(UChar32 c) {
 852     if (c <= 0xFF) {
 853         printf("%c", (char)c);
 854     } else {
 855         printf("U+%04X", c);
 856     }
 857 }
 858 void dump(const UChar32* list, int32_t len) {
 859     printf("[");
 860     for (int32_t i=0; i<len; ++i) {
 861         if (i != 0) printf(", ");
 862         dump(list[i]);
 863     }
 864     printf("]");
 865 }
 866 #endif
 867
 868 /**
 869  * Adds the specified character to this set if it is not already
 870  * present.  If this set already contains the specified character,
 871  * the call leaves this set unchanged.
 872  */
 873 UnicodeSet& UnicodeSet::add(UChar32 c) {
 874     // find smallest i such that c < list[i]
 875     // if odd, then it is IN the set
 876     // if even, then it is OUT of the set
 877     int32_t i = findCodePoint(pinCodePoint(c));
 878
 879     // already in set?
 880     if ((i & 1) != 0  || isFrozen() || isBogus()) return *this;
 881
 882     // HIGH is 0x110000
 883     // assert(list[len-1] == HIGH);
 884
 885     // empty = [HIGH]
 886     // [start_0, limit_0, start_1, limit_1, HIGH]
 887
 888     // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
 889     //                             ^
 890     //                             list[i]
 891
 892     // i == 0 means c is before the first range
 893
 894 #ifdef DEBUG_US_ADD
 895     printf("Add of ");
 896     dump(c);
 897     printf(" found at %d", i);
 898     printf(": ");
 899     dump(list, len);
 900     printf(" => ");
 901 #endif
 902
 903     if (c == list[i]-1) {
 904         // c is before start of next range
 905         list[i] = c;
 906         // if we touched the HIGH mark, then add a new one
 907         if (c == (UNICODESET_HIGH - 1)) {
 908             UErrorCode status = U_ZERO_ERROR;
 909             ensureCapacity(len+1, status);
 910             if (U_FAILURE(status)) {
 911                 return *this; // There is no way to report this error :-(
 912             }
 913             list[len++] = UNICODESET_HIGH;
 914         }
 915         if (i > 0 && c == list[i-1]) {
 916             // collapse adjacent ranges
 917
 918             // [..., start_k-1, c, c, limit_k, ..., HIGH]
 919             //                     ^
 920             //                     list[i]
 921
 922             //for (int32_t k=i-1; k<len-2; ++k) {
 923             //    list[k] = list[k+2];
 924             //}
 925             UChar32* dst = list + i - 1;
 926             UChar32* src = dst + 2;
 927             UChar32* srclimit = list + len;
 928             while (src < srclimit) *(dst++) = *(src++);
 929
 930             len -= 2;
 931         }
 932     }
 933
 934     else if (i > 0 && c == list[i-1]) {
 935         // c is after end of prior range
 936         list[i-1]++;
 937         // no need to check for collapse here
 938     }
 939
 940     else {
 941         // At this point we know the new char is not adjacent to
 942         // any existing ranges, and it is not 10FFFF.
 943
 944
 945         // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
 946         //                             ^
 947         //                             list[i]
 948
 949         // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
 950         //                             ^
 951         //                             list[i]
 952
 953         UErrorCode status = U_ZERO_ERROR;
 954         ensureCapacity(len+2, status);
 955         if (U_FAILURE(status)) {
 956             return *this; // There is no way to report this error :-(
 957         }
 958
 959         //for (int32_t k=len-1; k>=i; --k) {
 960         //    list[k+2] = list[k];
 961         //}
 962         UChar32* src = list + len;
 963         UChar32* dst = src + 2;
 964         UChar32* srclimit = list + i;
 965         while (src > srclimit) *(--dst) = *(--src);
 966
 967         list[i] = c;
 968         list[i+1] = c+1;
 969         len += 2;
 970     }
 971
 972 #ifdef DEBUG_US_ADD
 973     dump(list, len);
 974     printf("\n");
 975
 976     for (i=1; i<len; ++i) {
 977         if (list[i] <= list[i-1]) {
 978             // Corrupt array!
 979             printf("ERROR: list has been corrupted\n");
 980             exit(1);
 981         }
 982     }
 983 #endif
 984
 985     releasePattern();
 986     return *this;
 987 }
 988
 989 /**
 990  * Adds the specified multicharacter to this set if it is not already
 991  * present.  If this set already contains the multicharacter,
 992  * the call leaves this set unchanged.
 993  * Thus "ch" => {"ch"}
 994  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
 995  * @param s the source string
 996  * @return the modified set, for chaining
 997  */
 998 UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
 999     if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1000     int32_t cp = getSingleCP(s);
1001     if (cp < 0) {
1002         if (!strings->contains((void*) &s)) {
1003             _add(s);
1004             releasePattern();
1005         }
1006     } else {
1007         add((UChar32)cp);
1008     }
1009     return *this;
1010 }
1011
1012 /**
1013  * Adds the given string, in order, to 'strings'.  The given string
1014  * must have been checked by the caller to not be empty and to not
1015  * already be in 'strings'.
1016  */
1017 void UnicodeSet::_add(const UnicodeString& s) {
1018     if (isFrozen() || isBogus()) {
1019         return;
1020     }
1021     UnicodeString* t = new UnicodeString(s);
1022     if (t == NULL) { // Check for memory allocation error.
1023         setToBogus();
1024         return;
1025     }
1026     UErrorCode ec = U_ZERO_ERROR;
1027     strings->sortedInsert(t, compareUnicodeString, ec);
1028     if (U_FAILURE(ec)) {
1029         setToBogus();
1030         delete t;
1031     }
1032 }
1033
1034 /**
1035  * @return a code point IF the string consists of a single one.
1036  * otherwise returns -1.
1037  * @param string to test
1038  */
1039 int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
1040     //if (s.length() < 1) {
1041     //    throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
1042     //}
1043     if (s.length() > 2) return -1;
1044     if (s.length() == 1) return s.charAt(0);
1045
1046     // at this point, len = 2
1047     UChar32 cp = s.char32At(0);
1048     if (cp > 0xFFFF) { // is surrogate pair
1049         return cp;
1050     }
1051     return -1;
1052 }
1053
1054 /**
1055  * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1056  * If this set already any particular character, it has no effect on that character.
1057  * @param the source string
1058  * @return the modified set, for chaining
1059  */
1060 UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
1061     UChar32 cp;
1062     for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
1063         cp = s.char32At(i);
1064         add(cp);
1065     }
1066     return *this;
1067 }
1068
1069 /**
1070  * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1071  * If this set already any particular character, it has no effect on that character.
1072  * @param the source string
1073  * @return the modified set, for chaining
1074  */
1075 UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
1076     UnicodeSet set;
1077     set.addAll(s);
1078     retainAll(set);
1079     return *this;
1080 }
1081
1082 /**
1083  * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1084  * If this set already any particular character, it has no effect on that character.
1085  * @param the source string
1086  * @return the modified set, for chaining
1087  */
1088 UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
1089     UnicodeSet set;
1090     set.addAll(s);
1091     complementAll(set);
1092     return *this;
1093 }
1094
1095 /**
1096  * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1097  * If this set already any particular character, it has no effect on that character.
1098  * @param the source string
1099  * @return the modified set, for chaining
1100  */
1101 UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
1102     UnicodeSet set;
1103     set.addAll(s);
1104     removeAll(set);
1105     return *this;
1106 }
1107
1108 UnicodeSet& UnicodeSet::removeAllStrings() {
1109     strings->removeAllElements();
1110     return *this;
1111 }
1112
1113
1114 /**
1115  * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1116  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1117  * @param the source string
1118  * @return a newly created set containing the given string
1119  */
1120 UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
1121     UnicodeSet *set = new UnicodeSet();
1122     if (set != NULL) { // Check for memory allocation error.
1123         set->add(s);
1124     }
1125     return set;
1126 }
1127
1128
1129 /**
1130  * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1131  * @param the source string
1132  * @return a newly created set containing the given characters
1133  */
1134 UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
1135     UnicodeSet *set = new UnicodeSet();
1136     if (set != NULL) { // Check for memory allocation error.
1137         set->addAll(s);
1138     }
1139     return set;
1140 }
1141
1142 /**
1143  * Retain only the elements in this set that are contained in the
1144  * specified range.  If <code>end > start</code> then an empty range is
1145  * retained, leaving the set empty.
1146  *
1147  * @param start first character, inclusive, of range to be retained
1148  * to this set.
1149  * @param end last character, inclusive, of range to be retained
1150  * to this set.
1151  */
1152 UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
1153     if (pinCodePoint(start) <= pinCodePoint(end)) {
1154         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1155         retain(range, 2, 0);
1156     } else {
1157         clear();
1158     }
1159     return *this;
1160 }
1161
1162 UnicodeSet& UnicodeSet::retain(UChar32 c) {
1163     return retain(c, c);
1164 }
1165
1166 /**
1167  * Removes the specified range from this set if it is present.
1168  * The set will not contain the specified range once the call
1169  * returns.  If <code>end > start</code> then an empty range is
1170  * removed, leaving the set unchanged.
1171  *
1172  * @param start first character, inclusive, of range to be removed
1173  * from this set.
1174  * @param end last character, inclusive, of range to be removed
1175  * from this set.
1176  */
1177 UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
1178     if (pinCodePoint(start) <= pinCodePoint(end)) {
1179         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1180         retain(range, 2, 2);
1181     }
1182     return *this;
1183 }
1184
1185 /**
1186  * Removes the specified character from this set if it is present.
1187  * The set will not contain the specified range once the call
1188  * returns.
1189  */
1190 UnicodeSet& UnicodeSet::remove(UChar32 c) {
1191     return remove(c, c);
1192 }
1193
1194 /**
1195  * Removes the specified string from this set if it is present.
1196  * The set will not contain the specified character once the call
1197  * returns.
1198  * @param the source string
1199  * @return the modified set, for chaining
1200  */
1201 UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
1202     if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1203     int32_t cp = getSingleCP(s);
1204     if (cp < 0) {
1205         strings->removeElement((void*) &s);
1206         releasePattern();
1207     } else {
1208         remove((UChar32)cp, (UChar32)cp);
1209     }
1210     return *this;
1211 }
1212
1213 /**
1214  * Complements the specified range in this set.  Any character in
1215  * the range will be removed if it is in this set, or will be
1216  * added if it is not in this set.  If <code>end > start</code>
1217  * then an empty range is xor'ed, leaving the set unchanged.
1218  *
1219  * @param start first character, inclusive, of range to be removed
1220  * from this set.
1221  * @param end last character, inclusive, of range to be removed
1222  * from this set.
1223  */
1224 UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
1225     if (isFrozen() || isBogus()) {
1226         return *this;
1227     }
1228     if (pinCodePoint(start) <= pinCodePoint(end)) {
1229         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1230         exclusiveOr(range, 2, 0);
1231     }
1232     releasePattern();
1233     return *this;
1234 }
1235
1236 UnicodeSet& UnicodeSet::complement(UChar32 c) {
1237     return complement(c, c);
1238 }
1239
1240 /**
1241  * This is equivalent to
1242  * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1243  */
1244 UnicodeSet& UnicodeSet::complement(void) {
1245     if (isFrozen() || isBogus()) {
1246         return *this;
1247     }
1248     UErrorCode status = U_ZERO_ERROR;
1249     if (list[0] == UNICODESET_LOW) {
1250         ensureBufferCapacity(len-1, status);
1251         if (U_FAILURE(status)) {
1252             return *this;
1253         }
1254         uprv_memcpy(buffer, list + 1, (len-1)*sizeof(UChar32));
1255         --len;
1256     } else {
1257         ensureBufferCapacity(len+1, status);
1258         if (U_FAILURE(status)) {
1259             return *this;
1260         }
1261         uprv_memcpy(buffer + 1, list, len*sizeof(UChar32));
1262         buffer[0] = UNICODESET_LOW;
1263         ++len;
1264     }
1265     swapBuffers();
1266     releasePattern();
1267     return *this;
1268 }
1269
1270 /**
1271  * Complement the specified string in this set.
1272  * The set will not contain the specified string once the call
1273  * returns.
1274  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1275  * @param s the string to complement
1276  * @return this object, for chaining
1277  */
1278 UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
1279     if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1280     int32_t cp = getSingleCP(s);
1281     if (cp < 0) {
1282         if (strings->contains((void*) &s)) {
1283             strings->removeElement((void*) &s);
1284         } else {
1285             _add(s);
1286         }
1287         releasePattern();
1288     } else {
1289         complement((UChar32)cp, (UChar32)cp);
1290     }
1291     return *this;
1292 }
1293
1294 /**
1295  * Adds all of the elements in the specified set to this set if
1296  * they're not already present.  This operation effectively
1297  * modifies this set so that its value is the <i>union</i> of the two
1298  * sets.  The behavior of this operation is unspecified if the specified
1299  * collection is modified while the operation is in progress.
1300  *
1301  * @param c set whose elements are to be added to this set.
1302  * @see #add(char, char)
1303  */
1304 UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
1305     if ( c.len>0 && c.list!=NULL ) {
1306         add(c.list, c.len, 0);
1307     }
1308
1309     // Add strings in order
1310     if ( c.strings!=NULL ) {
1311         for (int32_t i=0; i<c.strings->size(); ++i) {
1312             const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
1313             if (!strings->contains((void*) s)) {
1314                 _add(*s);
1315             }
1316         }
1317     }
1318     return *this;
1319 }
1320
1321 /**
1322  * Retains only the elements in this set that are contained in the
1323  * specified set.  In other words, removes from this set all of
1324  * its elements that are not contained in the specified set.  This
1325  * operation effectively modifies this set so that its value is
1326  * the <i>intersection</i> of the two sets.
1327  *
1328  * @param c set that defines which elements this set will retain.
1329  */
1330 UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
1331     if (isFrozen() || isBogus()) {
1332         return *this;
1333     }
1334     retain(c.list, c.len, 0);
1335     strings->retainAll(*c.strings);
1336     return *this;
1337 }
1338
1339 /**
1340  * Removes from this set all of its elements that are contained in the
1341  * specified set.  This operation effectively modifies this
1342  * set so that its value is the <i>asymmetric set difference</i> of
1343  * the two sets.
1344  *
1345  * @param c set that defines which elements will be removed from
1346  *          this set.
1347  */
1348 UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
1349     if (isFrozen() || isBogus()) {
1350         return *this;
1351     }
1352     retain(c.list, c.len, 2);
1353     strings->removeAll(*c.strings);
1354     return *this;
1355 }
1356
1357 /**
1358  * Complements in this set all elements contained in the specified
1359  * set.  Any character in the other set will be removed if it is
1360  * in this set, or will be added if it is not in this set.
1361  *
1362  * @param c set that defines which elements will be xor'ed from
1363  *          this set.
1364  */
1365 UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
1366     if (isFrozen() || isBogus()) {
1367         return *this;
1368     }
1369     exclusiveOr(c.list, c.len, 0);
1370
1371     for (int32_t i=0; i<c.strings->size(); ++i) {
1372         void* e = c.strings->elementAt(i);
1373         if (!strings->removeElement(e)) {
1374             _add(*(const UnicodeString*)e);
1375         }
1376     }
1377     return *this;
1378 }
1379
1380 /**
1381  * Removes all of the elements from this set.  This set will be
1382  * empty after this call returns.
1383  */
1384 UnicodeSet& UnicodeSet::clear(void) {
1385     if (isFrozen()) {
1386         return *this;
1387     }
1388     if (list != NULL) {
1389         list[0] = UNICODESET_HIGH;
1390     }
1391     len = 1;
1392     releasePattern();
1393     if (strings != NULL) {
1394         strings->removeAllElements();
1395     }
1396     if (list != NULL && strings != NULL) {
1397         // Remove bogus
1398         fFlags = 0;
1399     }
1400     return *this;
1401 }
1402
1403 /**
1404  * Iteration method that returns the number of ranges contained in
1405  * this set.
1406  * @see #getRangeStart
1407  * @see #getRangeEnd
1408  */
1409 int32_t UnicodeSet::getRangeCount() const {
1410     return len/2;
1411 }
1412
1413 /**
1414  * Iteration method that returns the first character in the
1415  * specified range of this set.
1416  * @see #getRangeCount
1417  * @see #getRangeEnd
1418  */
1419 UChar32 UnicodeSet::getRangeStart(int32_t index) const {
1420     return list[index*2];
1421 }
1422
1423 /**
1424  * Iteration method that returns the last character in the
1425  * specified range of this set.
1426  * @see #getRangeStart
1427  * @see #getRangeEnd
1428  */
1429 UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
1430     return list[index*2 + 1] - 1;
1431 }
1432
1433 int32_t UnicodeSet::getStringCount() const {
1434     return strings->size();
1435 }
1436
1437 const UnicodeString* UnicodeSet::getString(int32_t index) const {
1438     return (const UnicodeString*) strings->elementAt(index);
1439 }
1440
1441 /**
1442  * Reallocate this objects internal structures to take up the least
1443  * possible space, without changing this object's value.
1444  */
1445 UnicodeSet& UnicodeSet::compact() {
1446     if (isFrozen() || isBogus()) {
1447         return *this;
1448     }
1449     // Delete buffer first to defragment memory less.
1450     if (buffer != NULL) {
1451         uprv_free(buffer);
1452         buffer = NULL;
1453     }
1454     if (len < capacity) {
1455         // Make the capacity equal to len or 1.
1456         // We don't want to realloc of 0 size.
1457         int32_t newCapacity = len + (len == 0);
1458         UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity);
1459         if (temp) {
1460             list = temp;
1461             capacity = newCapacity;
1462         }
1463         // else what the heck happened?! We allocated less memory!
1464         // Oh well. We'll keep our original array.
1465     }
1466     return *this;
1467 }
1468
1469 int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
1470     int32_t bmpLength, length, destLength;
1471
1472     if (U_FAILURE(ec)) {
1473         return 0;
1474     }
1475
1476     if (destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1477         ec=U_ILLEGAL_ARGUMENT_ERROR;
1478         return 0;
1479     }
1480
1481     /* count necessary 16-bit units */
1482     length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
1483     // assert(length>=0);
1484     if (length==0) {
1485         /* empty set */
1486         if (destCapacity>0) {
1487             *dest=0;
1488         } else {
1489             ec=U_BUFFER_OVERFLOW_ERROR;
1490         }
1491         return 1;
1492     }
1493     /* now length>0 */
1494
1495     if (this->list[length-1]<=0xffff) {
1496         /* all BMP */
1497         bmpLength=length;
1498     } else if (this->list[0]>=0x10000) {
1499         /* all supplementary */
1500         bmpLength=0;
1501         length*=2;
1502     } else {
1503         /* some BMP, some supplementary */
1504         for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
1505         length=bmpLength+2*(length-bmpLength);
1506     }
1507
1508     /* length: number of 16-bit array units */
1509     if (length>0x7fff) {
1510         /* there are only 15 bits for the length in the first serialized word */
1511         ec=U_INDEX_OUTOFBOUNDS_ERROR;
1512         return 0;
1513     }
1514
1515     /*
1516      * total serialized length:
1517      * number of 16-bit array units (length) +
1518      * 1 length unit (always) +
1519      * 1 bmpLength unit (if there are supplementary values)
1520      */
1521     destLength=length+((length>bmpLength)?2:1);
1522     if (destLength<=destCapacity) {
1523         const UChar32 *p;
1524         int32_t i;
1525
1526         *dest=(uint16_t)length;
1527         if (length>bmpLength) {
1528             *dest|=0x8000;
1529             *++dest=(uint16_t)bmpLength;
1530         }
1531         ++dest;
1532
1533         /* write the BMP part of the array */
1534         p=this->list;
1535         for (i=0; i<bmpLength; ++i) {
1536             *dest++=(uint16_t)*p++;
1537         }
1538
1539         /* write the supplementary part of the array */
1540         for (; i<length; i+=2) {
1541             *dest++=(uint16_t)(*p>>16);
1542             *dest++=(uint16_t)*p++;
1543         }
1544     } else {
1545         ec=U_BUFFER_OVERFLOW_ERROR;
1546     }
1547     return destLength;
1548 }
1549
1550 //----------------------------------------------------------------
1551 // Implementation: Utility methods
1552 //----------------------------------------------------------------
1553
1554 /**
1555  * Allocate our strings vector and return TRUE if successful.
1556  */
1557 UBool UnicodeSet::allocateStrings(UErrorCode &status) {
1558     if (U_FAILURE(status)) {
1559         return FALSE;
1560     }
1561     strings = new UVector(uhash_deleteUnicodeString,
1562                           uhash_compareUnicodeString, 1, status);
1563     if (strings == NULL) { // Check for memory allocation error.
1564         status = U_MEMORY_ALLOCATION_ERROR;
1565         return FALSE;
1566     }
1567     if (U_FAILURE(status)) {
1568         delete strings;
1569         strings = NULL;
1570         return FALSE;
1571     }
1572     return TRUE;
1573 }
1574
1575 void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) {
1576     if (newLen <= capacity)
1577         return;
1578     UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA));
1579     if (temp == NULL) {
1580         ec = U_MEMORY_ALLOCATION_ERROR;
1581         setToBogus();
1582         return;
1583     }
1584     list = temp;
1585     capacity = newLen + GROW_EXTRA;
1586     // else we keep the original contents on the memory failure.
1587 }
1588
1589 void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) {
1590     if (buffer != NULL && newLen <= bufferCapacity)
1591         return;
1592     UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA));
1593     if (temp == NULL) {
1594         ec = U_MEMORY_ALLOCATION_ERROR;
1595         setToBogus();
1596         return;
1597     }
1598     buffer = temp;
1599     bufferCapacity = newLen + GROW_EXTRA;
1600     // else we keep the original contents on the memory failure.
1601 }
1602
1603 /**
1604  * Swap list and buffer.
1605  */
1606 void UnicodeSet::swapBuffers(void) {
1607     // swap list and buffer
1608     UChar32* temp = list;
1609     list = buffer;
1610     buffer = temp;
1611
1612     int32_t c = capacity;
1613     capacity = bufferCapacity;
1614     bufferCapacity = c;
1615 }
1616
1617 void UnicodeSet::setToBogus() {
1618     clear(); // Remove everything in the set.
1619     fFlags = kIsBogus;
1620 }
1621
1622 //----------------------------------------------------------------
1623 // Implementation: Fundamental operators
1624 //----------------------------------------------------------------
1625
1626 static inline UChar32 max(UChar32 a, UChar32 b) {
1627     return (a > b) ? a : b;
1628 }
1629
1630 // polarity = 0, 3 is normal: x xor y
1631 // polarity = 1, 2: x xor ~y == x === y
1632
1633 void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
1634     if (isFrozen() || isBogus()) {
1635         return;
1636     }
1637     UErrorCode status = U_ZERO_ERROR;
1638     ensureBufferCapacity(len + otherLen, status);
1639     if (U_FAILURE(status)) {
1640         return;
1641     }
1642
1643     int32_t i = 0, j = 0, k = 0;
1644     UChar32 a = list[i++];
1645     UChar32 b;
1646     if (polarity == 1 || polarity == 2) {
1647         b = UNICODESET_LOW;
1648         if (other[j] == UNICODESET_LOW) { // skip base if already LOW
1649             ++j;
1650             b = other[j];
1651         }
1652     } else {
1653         b = other[j++];
1654     }
1655     // simplest of all the routines
1656     // sort the values, discarding identicals!
1657     for (;;) {
1658         if (a < b) {
1659             buffer[k++] = a;
1660             a = list[i++];
1661         } else if (b < a) {
1662             buffer[k++] = b;
1663             b = other[j++];
1664         } else if (a != UNICODESET_HIGH) { // at this point, a == b
1665             // discard both values!
1666             a = list[i++];
1667             b = other[j++];
1668         } else { // DONE!
1669             buffer[k++] = UNICODESET_HIGH;
1670             len = k;
1671             break;
1672         }
1673     }
1674     swapBuffers();
1675     releasePattern();
1676 }
1677
1678 // polarity = 0 is normal: x union y
1679 // polarity = 2: x union ~y
1680 // polarity = 1: ~x union y
1681 // polarity = 3: ~x union ~y
1682
1683 void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
1684     if (isFrozen() || isBogus() || other==NULL) {
1685         return;
1686     }
1687     UErrorCode status = U_ZERO_ERROR;
1688     ensureBufferCapacity(len + otherLen, status);
1689     if (U_FAILURE(status)) {
1690         return;
1691     }
1692
1693     int32_t i = 0, j = 0, k = 0;
1694     UChar32 a = list[i++];
1695     UChar32 b = other[j++];
1696     // change from xor is that we have to check overlapping pairs
1697     // polarity bit 1 means a is second, bit 2 means b is.
1698     for (;;) {
1699         switch (polarity) {
1700           case 0: // both first; take lower if unequal
1701             if (a < b) { // take a
1702                 // Back up over overlapping ranges in buffer[]
1703                 if (k > 0 && a <= buffer[k-1]) {
1704                     // Pick latter end value in buffer[] vs. list[]
1705                     a = max(list[i], buffer[--k]);
1706                 } else {
1707                     // No overlap
1708                     buffer[k++] = a;
1709                     a = list[i];
1710                 }
1711                 i++; // Common if/else code factored out
1712                 polarity ^= 1;
1713             } else if (b < a) { // take b
1714                 if (k > 0 && b <= buffer[k-1]) {
1715                     b = max(other[j], buffer[--k]);
1716                 } else {
1717                     buffer[k++] = b;
1718                     b = other[j];
1719                 }
1720                 j++;
1721                 polarity ^= 2;
1722             } else { // a == b, take a, drop b
1723                 if (a == UNICODESET_HIGH) goto loop_end;
1724                 // This is symmetrical; it doesn't matter if
1725                 // we backtrack with a or b. - liu
1726                 if (k > 0 && a <= buffer[k-1]) {
1727                     a = max(list[i], buffer[--k]);
1728                 } else {
1729                     // No overlap
1730                     buffer[k++] = a;
1731                     a = list[i];
1732                 }
1733                 i++;
1734                 polarity ^= 1;
1735                 b = other[j++];
1736                 polarity ^= 2;
1737             }
1738             break;
1739           case 3: // both second; take higher if unequal, and drop other
1740             if (b <= a) { // take a
1741                 if (a == UNICODESET_HIGH) goto loop_end;
1742                 buffer[k++] = a;
1743             } else { // take b
1744                 if (b == UNICODESET_HIGH) goto loop_end;
1745                 buffer[k++] = b;
1746             }
1747             a = list[i++];
1748             polarity ^= 1;   // factored common code
1749             b = other[j++];
1750             polarity ^= 2;
1751             break;
1752           case 1: // a second, b first; if b < a, overlap
1753             if (a < b) { // no overlap, take a
1754                 buffer[k++] = a; a = list[i++]; polarity ^= 1;
1755             } else if (b < a) { // OVERLAP, drop b
1756                 b = other[j++];
1757                 polarity ^= 2;
1758             } else { // a == b, drop both!
1759                 if (a == UNICODESET_HIGH) goto loop_end;
1760                 a = list[i++];
1761                 polarity ^= 1;
1762                 b = other[j++];
1763                 polarity ^= 2;
1764             }
1765             break;
1766           case 2: // a first, b second; if a < b, overlap
1767             if (b < a) { // no overlap, take b
1768                 buffer[k++] = b;
1769                 b = other[j++];
1770                 polarity ^= 2;
1771             } else  if (a < b) { // OVERLAP, drop a
1772                 a = list[i++];
1773                 polarity ^= 1;
1774             } else { // a == b, drop both!
1775                 if (a == UNICODESET_HIGH) goto loop_end;
1776                 a = list[i++];
1777                 polarity ^= 1;
1778                 b = other[j++];
1779                 polarity ^= 2;
1780             }
1781             break;
1782         }
1783     }
1784  loop_end:
1785     buffer[k++] = UNICODESET_HIGH;    // terminate
1786     len = k;
1787     swapBuffers();
1788     releasePattern();
1789 }
1790
1791 // polarity = 0 is normal: x intersect y
1792 // polarity = 2: x intersect ~y == set-minus
1793 // polarity = 1: ~x intersect y
1794 // polarity = 3: ~x intersect ~y
1795
1796 void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
1797     if (isFrozen() || isBogus()) {
1798         return;
1799     }
1800     UErrorCode status = U_ZERO_ERROR;
1801     ensureBufferCapacity(len + otherLen, status);
1802     if (U_FAILURE(status)) {
1803         return;
1804     }
1805
1806     int32_t i = 0, j = 0, k = 0;
1807     UChar32 a = list[i++];
1808     UChar32 b = other[j++];
1809     // change from xor is that we have to check overlapping pairs
1810     // polarity bit 1 means a is second, bit 2 means b is.
1811     for (;;) {
1812         switch (polarity) {
1813           case 0: // both first; drop the smaller
1814             if (a < b) { // drop a
1815                 a = list[i++];
1816                 polarity ^= 1;
1817             } else if (b < a) { // drop b
1818                 b = other[j++];
1819                 polarity ^= 2;
1820             } else { // a == b, take one, drop other
1821                 if (a == UNICODESET_HIGH) goto loop_end;
1822                 buffer[k++] = a;
1823                 a = list[i++];
1824                 polarity ^= 1;
1825                 b = other[j++];
1826                 polarity ^= 2;
1827             }
1828             break;
1829           case 3: // both second; take lower if unequal
1830             if (a < b) { // take a
1831                 buffer[k++] = a;
1832                 a = list[i++];
1833                 polarity ^= 1;
1834             } else if (b < a) { // take b
1835                 buffer[k++] = b;
1836                 b = other[j++];
1837                 polarity ^= 2;
1838             } else { // a == b, take one, drop other
1839                 if (a == UNICODESET_HIGH) goto loop_end;
1840                 buffer[k++] = a;
1841                 a = list[i++];
1842                 polarity ^= 1;
1843                 b = other[j++];
1844                 polarity ^= 2;
1845             }
1846             break;
1847           case 1: // a second, b first;
1848             if (a < b) { // NO OVERLAP, drop a
1849                 a = list[i++];
1850                 polarity ^= 1;
1851             } else if (b < a) { // OVERLAP, take b
1852                 buffer[k++] = b;
1853                 b = other[j++];
1854                 polarity ^= 2;
1855             } else { // a == b, drop both!
1856                 if (a == UNICODESET_HIGH) goto loop_end;
1857                 a = list[i++];
1858                 polarity ^= 1;
1859                 b = other[j++];
1860                 polarity ^= 2;
1861             }
1862             break;
1863           case 2: // a first, b second; if a < b, overlap
1864             if (b < a) { // no overlap, drop b
1865                 b = other[j++];
1866                 polarity ^= 2;
1867             } else  if (a < b) { // OVERLAP, take a
1868                 buffer[k++] = a;
1869                 a = list[i++];
1870                 polarity ^= 1;
1871             } else { // a == b, drop both!
1872                 if (a == UNICODESET_HIGH) goto loop_end;
1873                 a = list[i++];
1874                 polarity ^= 1;
1875                 b = other[j++];
1876                 polarity ^= 2;
1877             }
1878             break;
1879         }
1880     }
1881  loop_end:
1882     buffer[k++] = UNICODESET_HIGH;    // terminate
1883     len = k;
1884     swapBuffers();
1885     releasePattern();
1886 }
1887
1888 /**
1889  * Append the <code>toPattern()</code> representation of a
1890  * string to the given <code>StringBuffer</code>.
1891  */
1892 void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
1893 escapeUnprintable) {
1894     UChar32 cp;
1895     for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
1896         _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
1897     }
1898 }
1899
1900 /**
1901  * Append the <code>toPattern()</code> representation of a
1902  * character to the given <code>StringBuffer</code>.
1903  */
1904 void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
1905 escapeUnprintable) {
1906     if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1907         // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1908         // unprintable
1909         if (ICU_Utility::escapeUnprintable(buf, c)) {
1910             return;
1911         }
1912     }
1913     // Okay to let ':' pass through
1914     switch (c) {
1915     case SET_OPEN:
1916     case SET_CLOSE:
1917     case HYPHEN:
1918     case COMPLEMENT:
1919     case INTERSECTION:
1920     case BACKSLASH:
1921     case OPEN_BRACE:
1922     case CLOSE_BRACE:
1923     case COLON:
1924     case SymbolTable::SYMBOL_REF:
1925         buf.append(BACKSLASH);
1926         break;
1927     default:
1928         // Escape whitespace
1929         if (uprv_isRuleWhiteSpace(c)) {
1930             buf.append(BACKSLASH);
1931         }
1932         break;
1933     }
1934     buf.append(c);
1935 }
1936
1937 /**
1938  * Append a string representation of this set to result.  This will be
1939  * a cleaned version of the string passed to applyPattern(), if there
1940  * is one.  Otherwise it will be generated.
1941  */
1942 UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
1943                                       UBool escapeUnprintable) const
1944 {
1945     if (pat != NULL) {
1946         int32_t i;
1947         int32_t backslashCount = 0;
1948         for (i=0; i<patLen; ) {
1949             UChar32 c;
1950             U16_NEXT(pat, i, patLen, c);
1951             if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1952                 // If the unprintable character is preceded by an odd
1953                 // number of backslashes, then it has been escaped.
1954                 // Before unescaping it, we delete the final
1955                 // backslash.
1956                 if ((backslashCount % 2) == 1) {
1957                     result.truncate(result.length() - 1);
1958                 }
1959                 ICU_Utility::escapeUnprintable(result, c);
1960                 backslashCount = 0;
1961             } else {
1962                 result.append(c);
1963                 if (c == BACKSLASH) {
1964                     ++backslashCount;
1965                 } else {
1966                     backslashCount = 0;
1967                 }
1968             }
1969         }
1970         return result;
1971     }
1972
1973     return _generatePattern(result, escapeUnprintable);
1974 }
1975
1976 /**
1977  * Returns a string representation of this set.  If the result of
1978  * calling this function is passed to a UnicodeSet constructor, it
1979  * will produce another set that is equal to this one.
1980  */
1981 UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
1982                                      UBool escapeUnprintable) const
1983 {
1984     result.truncate(0);
1985     return _toPattern(result, escapeUnprintable);
1986 }
1987
1988 /**
1989  * Generate and append a string representation of this set to result.
1990  * This does not use this.pat, the cleaned up copy of the string
1991  * passed to applyPattern().
1992  */
1993 UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
1994                                             UBool escapeUnprintable) const
1995 {
1996     result.append(SET_OPEN);
1997
1998 //  // Check against the predefined categories.  We implicitly build
1999 //  // up ALL category sets the first time toPattern() is called.
2000 //  for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
2001 //      if (*this == getCategorySet(cat)) {
2002 //          result.append(COLON);
2003 //          result.append(CATEGORY_NAMES, cat*2, 2);
2004 //          return result.append(CATEGORY_CLOSE);
2005 //      }
2006 //  }
2007
2008     int32_t count = getRangeCount();
2009
2010     // If the set contains at least 2 intervals and includes both
2011     // MIN_VALUE and MAX_VALUE, then the inverse representation will
2012     // be more economical.
2013     if (count > 1 &&
2014         getRangeStart(0) == MIN_VALUE &&
2015         getRangeEnd(count-1) == MAX_VALUE) {
2016
2017         // Emit the inverse
2018         result.append(COMPLEMENT);
2019
2020         for (int32_t i = 1; i < count; ++i) {
2021             UChar32 start = getRangeEnd(i-1)+1;
2022             UChar32 end = getRangeStart(i)-1;
2023             _appendToPat(result, start, escapeUnprintable);
2024             if (start != end) {
2025                 if ((start+1) != end) {
2026                     result.append(HYPHEN);
2027                 }
2028                 _appendToPat(result, end, escapeUnprintable);
2029             }
2030         }
2031     }
2032
2033     // Default; emit the ranges as pairs
2034     else {
2035         for (int32_t i = 0; i < count; ++i) {
2036             UChar32 start = getRangeStart(i);
2037             UChar32 end = getRangeEnd(i);
2038             _appendToPat(result, start, escapeUnprintable);
2039             if (start != end) {
2040                 if ((start+1) != end) {
2041                     result.append(HYPHEN);
2042                 }
2043                 _appendToPat(result, end, escapeUnprintable);
2044             }
2045         }
2046     }
2047
2048     for (int32_t i = 0; i<strings->size(); ++i) {
2049         result.append(OPEN_BRACE);
2050         _appendToPat(result,
2051                      *(const UnicodeString*) strings->elementAt(i),
2052                      escapeUnprintable);
2053         result.append(CLOSE_BRACE);
2054     }
2055     return result.append(SET_CLOSE);
2056 }
2057
2058 /**
2059 * Release existing cached pattern
2060 */
2061 void UnicodeSet::releasePattern() {
2062     if (pat) {
2063         uprv_free(pat);
2064         pat = NULL;
2065         patLen = 0;
2066     }
2067 }
2068
2069 /**
2070 * Set the new pattern to cache.
2071 */
2072 void UnicodeSet::setPattern(const UnicodeString& newPat) {
2073     releasePattern();
2074     int32_t newPatLen = newPat.length();
2075     pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar));
2076     if (pat) {
2077         patLen = newPatLen;
2078         newPat.extractBetween(0, patLen, pat);
2079         pat[patLen] = 0;
2080     }
2081     // else we don't care if malloc failed. This was just a nice cache.
2082     // We can regenerate an equivalent pattern later when requested.
2083 }
2084
2085 UnicodeFunctor *UnicodeSet::freeze() {
2086     if(!isFrozen() && !isBogus()) {
2087         // Do most of what compact() does before freezing because
2088         // compact() will not work when the set is frozen.
2089         // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
2090
2091         // Delete buffer first to defragment memory less.
2092         if (buffer != NULL) {
2093             uprv_free(buffer);
2094             buffer = NULL;
2095         }
2096         if (capacity > (len + GROW_EXTRA)) {
2097             // Make the capacity equal to len or 1.
2098             // We don't want to realloc of 0 size.
2099             capacity = len + (len == 0);
2100             list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity);
2101             if (list == NULL) { // Check for memory allocation error.
2102                 setToBogus();
2103                 return this;
2104             }
2105         }
2106
2107         // Optimize contains() and span() and similar functions.
2108         if (!strings->isEmpty()) {
2109             stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
2110             if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) {
2111                 // All strings are irrelevant for span() etc. because
2112                 // all of each string's code points are contained in this set.
2113                 // Do not check needsStringSpanUTF8() because UTF-8 has at most as
2114                 // many relevant strings as UTF-16.
2115                 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
2116                 delete stringSpan;
2117                 stringSpan = NULL;
2118             }
2119         }
2120         if (stringSpan == NULL) {
2121             // No span-relevant strings: Optimize for code point spans.
2122             bmpSet=new BMPSet(list, len);
2123             if (bmpSet == NULL) { // Check for memory allocation error.
2124                 setToBogus();
2125             }
2126         }
2127     }
2128     return this;
2129 }
2130
2131 int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2132     if(length>0 && bmpSet!=NULL) {
2133         return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
2134     }
2135     if(length<0) {
2136         length=u_strlen(s);
2137     }
2138     if(length==0) {
2139         return 0;
2140     }
2141     if(stringSpan!=NULL) {
2142         return stringSpan->span(s, length, spanCondition);
2143     } else if(!strings->isEmpty()) {
2144         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2145                             UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
2146                             UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
2147         UnicodeSetStringSpan strSpan(*this, *strings, which);
2148         if(strSpan.needsStringSpanUTF16()) {
2149             return strSpan.span(s, length, spanCondition);
2150         }
2151     }
2152
2153     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2154         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2155     }
2156
2157     UChar32 c;
2158     int32_t start=0, prev=0;
2159     do {
2160         U16_NEXT(s, start, length, c);
2161         if(spanCondition!=contains(c)) {
2162             break;
2163         }
2164     } while((prev=start)<length);
2165     return prev;
2166 }
2167
2168 int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2169     if(length>0 && bmpSet!=NULL) {
2170         return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
2171     }
2172     if(length<0) {
2173         length=u_strlen(s);
2174     }
2175     if(length==0) {
2176         return 0;
2177     }
2178     if(stringSpan!=NULL) {
2179         return stringSpan->spanBack(s, length, spanCondition);
2180     } else if(!strings->isEmpty()) {
2181         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2182                             UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
2183                             UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
2184         UnicodeSetStringSpan strSpan(*this, *strings, which);
2185         if(strSpan.needsStringSpanUTF16()) {
2186             return strSpan.spanBack(s, length, spanCondition);
2187         }
2188     }
2189
2190     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2191         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2192     }
2193
2194     UChar32 c;
2195     int32_t prev=length;
2196     do {
2197         U16_PREV(s, 0, length, c);
2198         if(spanCondition!=contains(c)) {
2199             break;
2200         }
2201     } while((prev=length)>0);
2202     return prev;
2203 }
2204
2205 int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2206     if(length>0 && bmpSet!=NULL) {
2207         const uint8_t *s0=(const uint8_t *)s;
2208         return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);
2209     }
2210     if(length<0) {
2211         length=uprv_strlen(s);
2212     }
2213     if(length==0) {
2214         return 0;
2215     }
2216     if(stringSpan!=NULL) {
2217         return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
2218     } else if(!strings->isEmpty()) {
2219         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2220                             UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
2221                             UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
2222         UnicodeSetStringSpan strSpan(*this, *strings, which);
2223         if(strSpan.needsStringSpanUTF8()) {
2224             return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
2225         }
2226     }
2227
2228     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2229         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2230     }
2231
2232     UChar32 c;
2233     int32_t start=0, prev=0;
2234     do {
2235         U8_NEXT(s, start, length, c);
2236         if(c<0) {
2237             c=0xfffd;
2238         }
2239         if(spanCondition!=contains(c)) {
2240             break;
2241         }
2242     } while((prev=start)<length);
2243     return prev;
2244 }
2245
2246 int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2247     if(length>0 && bmpSet!=NULL) {
2248         const uint8_t *s0=(const uint8_t *)s;
2249         return bmpSet->spanBackUTF8(s0, length, spanCondition);
2250     }
2251     if(length<0) {
2252         length=uprv_strlen(s);
2253     }
2254     if(length==0) {
2255         return 0;
2256     }
2257     if(stringSpan!=NULL) {
2258         return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
2259     } else if(!strings->isEmpty()) {
2260         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2261                             UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
2262                             UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
2263         UnicodeSetStringSpan strSpan(*this, *strings, which);
2264         if(strSpan.needsStringSpanUTF8()) {
2265             return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
2266         }
2267     }
2268
2269     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2270         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2271     }
2272
2273     UChar32 c;
2274     int32_t prev=length;
2275     do {
2276         U8_PREV(s, 0, length, c);
2277         if(c<0) {
2278             c=0xfffd;
2279         }
2280         if(spanCondition!=contains(c)) {
2281             break;
2282         }
2283     } while((prev=length)>0);
2284     return prev;
2285 }
2286
2287 U_NAMESPACE_END