icuSources/common/uniset.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 **********************************************************************
   5 *   Copyright (C) 1999-2015, International Business Machines
   6 *   Corporation and others.  All Rights Reserved.
   7 **********************************************************************
   8 *   Date        Name        Description
   9 *   10/20/99    alan        Creation.
  10 **********************************************************************
  11 */
  12
  13 #include "unicode/utypes.h"
  14 #include "unicode/parsepos.h"
  15 #include "unicode/symtable.h"
  16 #include "unicode/uniset.h"
  17 #include "unicode/utf8.h"
  18 #include "unicode/utf16.h"
  19 #include "ruleiter.h"
  20 #include "cmemory.h"
  21 #include "cstring.h"
  22 #include "patternprops.h"
  23 #include "uelement.h"
  24 #include "util.h"
  25 #include "uvector.h"
  26 #include "charstr.h"
  27 #include "ustrfmt.h"
  28 #include "uassert.h"
  29 #include "bmpset.h"
  30 #include "unisetspan.h"
  31
  32 // Define UChar constants using hex for EBCDIC compatibility
  33 // Used #define to reduce private static exports and memory access time.
  34 #define SET_OPEN        ((UChar)0x005B) /*[*/
  35 #define SET_CLOSE       ((UChar)0x005D) /*]*/
  36 #define HYPHEN          ((UChar)0x002D) /*-*/
  37 #define COMPLEMENT      ((UChar)0x005E) /*^*/
  38 #define COLON           ((UChar)0x003A) /*:*/
  39 #define BACKSLASH       ((UChar)0x005C) /*\*/
  40 #define INTERSECTION    ((UChar)0x0026) /*&*/
  41 #define UPPER_U         ((UChar)0x0055) /*U*/
  42 #define LOWER_U         ((UChar)0x0075) /*u*/
  43 #define OPEN_BRACE      ((UChar)123)    /*{*/
  44 #define CLOSE_BRACE     ((UChar)125)    /*}*/
  45 #define UPPER_P         ((UChar)0x0050) /*P*/
  46 #define LOWER_P         ((UChar)0x0070) /*p*/
  47 #define UPPER_N         ((UChar)78)     /*N*/
  48 #define EQUALS          ((UChar)0x003D) /*=*/
  49
  50 // HIGH_VALUE > all valid values. 110000 for codepoints
  51 #define UNICODESET_HIGH 0x0110000
  52
  53 // LOW <= all valid values. ZERO for codepoints
  54 #define UNICODESET_LOW 0x000000
  55
  56 // initial storage. Must be >= 0
  57 #define START_EXTRA 16
  58
  59 // extra amount for growth. Must be >= 0
  60 #define GROW_EXTRA START_EXTRA
  61
  62 U_NAMESPACE_BEGIN
  63
  64 SymbolTable::~SymbolTable() {}
  65
  66 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
  67
  68 /**
  69  * Modify the given UChar32 variable so that it is in range, by
  70  * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
  71  * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
  72  * It modifies its argument in-place and also returns it.
  73  */
  74 static inline UChar32 pinCodePoint(UChar32& c) {
  75     if (c < UNICODESET_LOW) {
  76         c = UNICODESET_LOW;
  77     } else if (c > (UNICODESET_HIGH-1)) {
  78         c = (UNICODESET_HIGH-1);
  79     }
  80     return c;
  81 }
  82
  83 //----------------------------------------------------------------
  84 // Debugging
  85 //----------------------------------------------------------------
  86
  87 // DO NOT DELETE THIS CODE.  This code is used to debug memory leaks.
  88 // To enable the debugging, define the symbol DEBUG_MEM in the line
  89 // below.  This will result in text being sent to stdout that looks
  90 // like this:
  91 //   DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
  92 //   DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
  93 // Each line lists a construction (ct) or destruction (dt) event, the
  94 // object address, the number of outstanding objects after the event,
  95 // and the pattern of the object in question.
  96
  97 // #define DEBUG_MEM
  98
  99 #ifdef DEBUG_MEM
 100 #include <stdio.h>
 101 static int32_t _dbgCount = 0;
 102
 103 static inline void _dbgct(UnicodeSet* set) {
 104     UnicodeString str;
 105     set->toPattern(str, TRUE);
 106     char buf[40];
 107     str.extract(0, 39, buf, "");
 108     printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
 109 }
 110
 111 static inline void _dbgdt(UnicodeSet* set) {
 112     UnicodeString str;
 113     set->toPattern(str, TRUE);
 114     char buf[40];
 115     str.extract(0, 39, buf, "");
 116     printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
 117 }
 118
 119 #else
 120
 121 #define _dbgct(set)
 122 #define _dbgdt(set)
 123
 124 #endif
 125
 126 //----------------------------------------------------------------
 127 // UnicodeString in UVector support
 128 //----------------------------------------------------------------
 129
 130 static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) {
 131     dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
 132 }
 133
 134 static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
 135     const UnicodeString &a = *(const UnicodeString*)t1.pointer;
 136     const UnicodeString &b = *(const UnicodeString*)t2.pointer;
 137     return a.compare(b);
 138 }
 139
 140 //----------------------------------------------------------------
 141 // Constructors &c
 142 //----------------------------------------------------------------
 143
 144 /**
 145  * Constructs an empty set.
 146  */
 147 UnicodeSet::UnicodeSet() :
 148     len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
 149     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
 150     fFlags(0)
 151 {
 152     UErrorCode status = U_ZERO_ERROR;
 153     allocateStrings(status);
 154     if (U_FAILURE(status)) {
 155         return;
 156     }
 157     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 158     if(list!=NULL){
 159         list[0] = UNICODESET_HIGH;
 160     } else { // If memory allocation failed, set to bogus state.
 161         setToBogus();
 162         return;
 163     }
 164     _dbgct(this);
 165 }
 166
 167 /**
 168  * Constructs a set containing the given range. If <code>end >
 169  * start</code> then an empty set is created.
 170  *
 171  * @param start first character, inclusive, of range
 172  * @param end last character, inclusive, of range
 173  */
 174 UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
 175     len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
 176     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
 177     fFlags(0)
 178 {
 179     UErrorCode status = U_ZERO_ERROR;
 180     allocateStrings(status);
 181     if (U_FAILURE(status)) {
 182         return;
 183     }
 184     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 185     if(list!=NULL){
 186         list[0] = UNICODESET_HIGH;
 187         complement(start, end);
 188     } else { // If memory allocation failed, set to bogus state.
 189         setToBogus();
 190         return;
 191     }
 192     _dbgct(this);
 193 }
 194
 195 /**
 196  * Constructs a set that is identical to the given UnicodeSet.
 197  */
 198 UnicodeSet::UnicodeSet(const UnicodeSet& o) :
 199     UnicodeFilter(o),
 200     len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0),
 201     bmpSet(0),
 202     buffer(0), bufferCapacity(0),
 203     patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
 204     fFlags(0)
 205 {
 206     UErrorCode status = U_ZERO_ERROR;
 207     allocateStrings(status);
 208     if (U_FAILURE(status)) {
 209         return;
 210     }
 211     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 212     if(list!=NULL){
 213         *this = o;
 214     } else { // If memory allocation failed, set to bogus state.
 215         setToBogus();
 216         return;
 217     }
 218     _dbgct(this);
 219 }
 220
 221 // Copy-construct as thawed.
 222 UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) :
 223     UnicodeFilter(o),
 224     len(0), capacity(o.len + GROW_EXTRA), list(0),
 225     bmpSet(0),
 226     buffer(0), bufferCapacity(0),
 227     patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
 228     fFlags(0)
 229 {
 230     UErrorCode status = U_ZERO_ERROR;
 231     allocateStrings(status);
 232     if (U_FAILURE(status)) {
 233         return;
 234     }
 235     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 236     if(list!=NULL){
 237         // *this = o except for bmpSet and stringSpan
 238         len = o.len;
 239         uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
 240         if (strings != NULL && o.strings != NULL) {
 241             strings->assign(*o.strings, cloneUnicodeString, status);
 242         } else { // Invalid strings.
 243             setToBogus();
 244             return;
 245         }
 246         if (o.pat) {
 247             setPattern(UnicodeString(o.pat, o.patLen));
 248         }
 249     } else { // If memory allocation failed, set to bogus state.
 250         setToBogus();
 251         return;
 252     }
 253     _dbgct(this);
 254 }
 255
 256 /**
 257  * Destructs the set.
 258  */
 259 UnicodeSet::~UnicodeSet() {
 260     _dbgdt(this); // first!
 261     uprv_free(list);
 262     delete bmpSet;
 263     if (buffer) {
 264         uprv_free(buffer);
 265     }
 266     delete strings;
 267     delete stringSpan;
 268     releasePattern();
 269 }
 270
 271 /**
 272  * Assigns this object to be a copy of another.
 273  */
 274 UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
 275     if (this == &o) {
 276         return *this;
 277     }
 278     if (isFrozen()) {
 279         return *this;
 280     }
 281     if (o.isBogus()) {
 282         setToBogus();
 283         return *this;
 284     }
 285     UErrorCode ec = U_ZERO_ERROR;
 286     ensureCapacity(o.len, ec);
 287     if (U_FAILURE(ec)) {
 288         return *this; // There is no way to report this error :-(
 289     }
 290     len = o.len;
 291     uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
 292     if (o.bmpSet == NULL) {
 293         bmpSet = NULL;
 294     } else {
 295         bmpSet = new BMPSet(*o.bmpSet, list, len);
 296         if (bmpSet == NULL) { // Check for memory allocation error.
 297             setToBogus();
 298             return *this;
 299         }
 300     }
 301     if (strings != NULL && o.strings != NULL) {
 302         strings->assign(*o.strings, cloneUnicodeString, ec);
 303     } else { // Invalid strings.
 304         setToBogus();
 305         return *this;
 306     }
 307     if (o.stringSpan == NULL) {
 308         stringSpan = NULL;
 309     } else {
 310         stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
 311         if (stringSpan == NULL) { // Check for memory allocation error.
 312             setToBogus();
 313             return *this;
 314         }
 315     }
 316     releasePattern();
 317     if (o.pat) {
 318         setPattern(UnicodeString(o.pat, o.patLen));
 319     }
 320     return *this;
 321 }
 322
 323 /**
 324  * Returns a copy of this object.  All UnicodeMatcher objects have
 325  * to support cloning in order to allow classes using
 326  * UnicodeMatchers, such as Transliterator, to implement cloning.
 327  */
 328 UnicodeFunctor* UnicodeSet::clone() const {
 329     return new UnicodeSet(*this);
 330 }
 331
 332 UnicodeFunctor *UnicodeSet::cloneAsThawed() const {
 333     return new UnicodeSet(*this, TRUE);
 334 }
 335
 336 /**
 337  * Compares the specified object with this set for equality.  Returns
 338  * <tt>true</tt> if the two sets
 339  * have the same size, and every member of the specified set is
 340  * contained in this set (or equivalently, every member of this set is
 341  * contained in the specified set).
 342  *
 343  * @param o set to be compared for equality with this set.
 344  * @return <tt>true</tt> if the specified set is equal to this set.
 345  */
 346 UBool UnicodeSet::operator==(const UnicodeSet& o) const {
 347     if (len != o.len) return FALSE;
 348     for (int32_t i = 0; i < len; ++i) {
 349         if (list[i] != o.list[i]) return FALSE;
 350     }
 351     if (*strings != *o.strings) return FALSE;
 352     return TRUE;
 353 }
 354
 355 /**
 356  * Returns the hash code value for this set.
 357  *
 358  * @return the hash code value for this set.
 359  * @see Object#hashCode()
 360  */
 361 int32_t UnicodeSet::hashCode(void) const {
 362     int32_t result = len;
 363     for (int32_t i = 0; i < len; ++i) {
 364         result *= 1000003;
 365         result += list[i];
 366     }
 367     return result;
 368 }
 369
 370 //----------------------------------------------------------------
 371 // Public API
 372 //----------------------------------------------------------------
 373
 374 /**
 375  * Returns the number of elements in this set (its cardinality),
 376  * Note than the elements of a set may include both individual
 377  * codepoints and strings.
 378  *
 379  * @return the number of elements in this set (its cardinality).
 380  */
 381 int32_t UnicodeSet::size(void) const {
 382     int32_t n = 0;
 383     int32_t count = getRangeCount();
 384     for (int32_t i = 0; i < count; ++i) {
 385         n += getRangeEnd(i) - getRangeStart(i) + 1;
 386     }
 387     return n + strings->size();
 388 }
 389
 390 /**
 391  * Returns <tt>true</tt> if this set contains no elements.
 392  *
 393  * @return <tt>true</tt> if this set contains no elements.
 394  */
 395 UBool UnicodeSet::isEmpty(void) const {
 396     return len == 1 && strings->size() == 0;
 397 }
 398
 399 /**
 400  * Returns true if this set contains the given character.
 401  * @param c character to be checked for containment
 402  * @return true if the test condition is met
 403  */
 404 UBool UnicodeSet::contains(UChar32 c) const {
 405     // Set i to the index of the start item greater than ch
 406     // We know we will terminate without length test!
 407     // LATER: for large sets, add binary search
 408     //int32_t i = -1;
 409     //for (;;) {
 410     //    if (c < list[++i]) break;
 411     //}
 412     if (bmpSet != NULL) {
 413         return bmpSet->contains(c);
 414     }
 415     if (stringSpan != NULL) {
 416         return stringSpan->contains(c);
 417     }
 418     if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
 419         return FALSE;
 420     }
 421     int32_t i = findCodePoint(c);
 422     return (UBool)(i & 1); // return true if odd
 423 }
 424
 425 /**
 426  * Returns the smallest value i such that c < list[i].  Caller
 427  * must ensure that c is a legal value or this method will enter
 428  * an infinite loop.  This method performs a binary search.
 429  * @param c a character in the range MIN_VALUE..MAX_VALUE
 430  * inclusive
 431  * @return the smallest integer i in the range 0..len-1,
 432  * inclusive, such that c < list[i]
 433  */
 434 int32_t UnicodeSet::findCodePoint(UChar32 c) const {
 435     /* Examples:
 436                                        findCodePoint(c)
 437        set              list[]         c=0 1 3 4 7 8
 438        ===              ==============   ===========
 439        []               [110000]         0 0 0 0 0 0
 440        [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
 441        [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
 442        [:Any:]          [0, 110000]      1 1 1 1 1 1
 443      */
 444
 445     // Return the smallest i such that c < list[i].  Assume
 446     // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
 447     if (c < list[0])
 448         return 0;
 449     // High runner test.  c is often after the last range, so an
 450     // initial check for this condition pays off.
 451     int32_t lo = 0;
 452     int32_t hi = len - 1;
 453     if (lo >= hi || c >= list[hi-1])
 454         return hi;
 455     // invariant: c >= list[lo]
 456     // invariant: c < list[hi]
 457     for (;;) {
 458         int32_t i = (lo + hi) >> 1;
 459         if (i == lo) {
 460             break; // Found!
 461         } else if (c < list[i]) {
 462             hi = i;
 463         } else {
 464             lo = i;
 465         }
 466     }
 467     return hi;
 468 }
 469
 470 /**
 471  * Returns true if this set contains every character
 472  * of the given range.
 473  * @param start first character, inclusive, of the range
 474  * @param end last character, inclusive, of the range
 475  * @return true if the test condition is met
 476  */
 477 UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
 478     //int32_t i = -1;
 479     //for (;;) {
 480     //    if (start < list[++i]) break;
 481     //}
 482     int32_t i = findCodePoint(start);
 483     return ((i & 1) != 0 && end < list[i]);
 484 }
 485
 486 /**
 487  * Returns <tt>true</tt> if this set contains the given
 488  * multicharacter string.
 489  * @param s string to be checked for containment
 490  * @return <tt>true</tt> if this set contains the specified string
 491  */
 492 UBool UnicodeSet::contains(const UnicodeString& s) const {
 493     if (s.length() == 0) return FALSE;
 494     int32_t cp = getSingleCP(s);
 495     if (cp < 0) {
 496         return strings->contains((void*) &s);
 497     } else {
 498         return contains((UChar32) cp);
 499     }
 500 }
 501
 502 /**
 503  * Returns true if this set contains all the characters and strings
 504  * of the given set.
 505  * @param c set to be checked for containment
 506  * @return true if the test condition is met
 507  */
 508 UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
 509     // The specified set is a subset if all of its pairs are contained in
 510     // this set.  It's possible to code this more efficiently in terms of
 511     // direct manipulation of the inversion lists if the need arises.
 512     int32_t n = c.getRangeCount();
 513     for (int i=0; i<n; ++i) {
 514         if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
 515             return FALSE;
 516         }
 517     }
 518     if (!strings->containsAll(*c.strings)) return FALSE;
 519     return TRUE;
 520 }
 521
 522 /**
 523  * Returns true if this set contains all the characters
 524  * of the given string.
 525  * @param s string containing characters to be checked for containment
 526  * @return true if the test condition is met
 527  */
 528 UBool UnicodeSet::containsAll(const UnicodeString& s) const {
 529     return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
 530                    s.length());
 531 }
 532
 533 /**
 534  * Returns true if this set contains none of the characters
 535  * of the given range.
 536  * @param start first character, inclusive, of the range
 537  * @param end last character, inclusive, of the range
 538  * @return true if the test condition is met
 539  */
 540 UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
 541     //int32_t i = -1;
 542     //for (;;) {
 543     //    if (start < list[++i]) break;
 544     //}
 545     int32_t i = findCodePoint(start);
 546     return ((i & 1) == 0 && end < list[i]);
 547 }
 548
 549 /**
 550  * Returns true if this set contains none of the characters and strings
 551  * of the given set.
 552  * @param c set to be checked for containment
 553  * @return true if the test condition is met
 554  */
 555 UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
 556     // The specified set is a subset if all of its pairs are contained in
 557     // this set.  It's possible to code this more efficiently in terms of
 558     // direct manipulation of the inversion lists if the need arises.
 559     int32_t n = c.getRangeCount();
 560     for (int32_t i=0; i<n; ++i) {
 561         if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
 562             return FALSE;
 563         }
 564     }
 565     if (!strings->containsNone(*c.strings)) return FALSE;
 566     return TRUE;
 567 }
 568
 569 /**
 570  * Returns true if this set contains none of the characters
 571  * of the given string.
 572  * @param s string containing characters to be checked for containment
 573  * @return true if the test condition is met
 574  */
 575 UBool UnicodeSet::containsNone(const UnicodeString& s) const {
 576     return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
 577                    s.length());
 578 }
 579
 580 /**
 581  * Returns <tt>true</tt> if this set contains any character whose low byte
 582  * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
 583  * indexing.
 584  */
 585 UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
 586     /* The index value v, in the range [0,255], is contained in this set if
 587      * it is contained in any pair of this set.  Pairs either have the high
 588      * bytes equal, or unequal.  If the high bytes are equal, then we have
 589      * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
 590      * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
 591      * Then v is contained if xx <= v || v <= yy.  (This is identical to the
 592      * time zone month containment logic.)
 593      */
 594     int32_t i;
 595     int32_t rangeCount=getRangeCount();
 596     for (i=0; i<rangeCount; ++i) {
 597         UChar32 low = getRangeStart(i);
 598         UChar32 high = getRangeEnd(i);
 599         if ((low & ~0xFF) == (high & ~0xFF)) {
 600             if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
 601                 return TRUE;
 602             }
 603         } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
 604             return TRUE;
 605         }
 606     }
 607     if (strings->size() != 0) {
 608         for (i=0; i<strings->size(); ++i) {
 609             const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
 610             //if (s.length() == 0) {
 611             //    // Empty strings match everything
 612             //    return TRUE;
 613             //}
 614             // assert(s.length() != 0); // We enforce this elsewhere
 615             UChar32 c = s.char32At(0);
 616             if ((c & 0xFF) == v) {
 617                 return TRUE;
 618             }
 619         }
 620     }
 621     return FALSE;
 622 }
 623
 624 /**
 625  * Implementation of UnicodeMatcher::matches().  Always matches the
 626  * longest possible multichar string.
 627  */
 628 UMatchDegree UnicodeSet::matches(const Replaceable& text,
 629                                  int32_t& offset,
 630                                  int32_t limit,
 631                                  UBool incremental) {
 632     if (offset == limit) {
 633         // Strings, if any, have length != 0, so we don't worry
 634         // about them here.  If we ever allow zero-length strings
 635         // we much check for them here.
 636         if (contains(U_ETHER)) {
 637             return incremental ? U_PARTIAL_MATCH : U_MATCH;
 638         } else {
 639             return U_MISMATCH;
 640         }
 641     } else {
 642         if (strings->size() != 0) { // try strings first
 643
 644             // might separate forward and backward loops later
 645             // for now they are combined
 646
 647             // TODO Improve efficiency of this, at least in the forward
 648             // direction, if not in both.  In the forward direction we
 649             // can assume the strings are sorted.
 650
 651             int32_t i;
 652             UBool forward = offset < limit;
 653
 654             // firstChar is the leftmost char to match in the
 655             // forward direction or the rightmost char to match in
 656             // the reverse direction.
 657             UChar firstChar = text.charAt(offset);
 658
 659             // If there are multiple strings that can match we
 660             // return the longest match.
 661             int32_t highWaterLength = 0;
 662
 663             for (i=0; i<strings->size(); ++i) {
 664                 const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
 665
 666                 //if (trial.length() == 0) {
 667                 //    return U_MATCH; // null-string always matches
 668                 //}
 669                 // assert(trial.length() != 0); // We ensure this elsewhere
 670
 671                 UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
 672
 673                 // Strings are sorted, so we can optimize in the
 674                 // forward direction.
 675                 if (forward && c > firstChar) break;
 676                 if (c != firstChar) continue;
 677
 678                 int32_t matchLen = matchRest(text, offset, limit, trial);
 679
 680                 if (incremental) {
 681                     int32_t maxLen = forward ? limit-offset : offset-limit;
 682                     if (matchLen == maxLen) {
 683                         // We have successfully matched but only up to limit.
 684                         return U_PARTIAL_MATCH;
 685                     }
 686                 }
 687
 688                 if (matchLen == trial.length()) {
 689                     // We have successfully matched the whole string.
 690                     if (matchLen > highWaterLength) {
 691                         highWaterLength = matchLen;
 692                     }
 693                     // In the forward direction we know strings
 694                     // are sorted so we can bail early.
 695                     if (forward && matchLen < highWaterLength) {
 696                         break;
 697                     }
 698                     continue;
 699                 }
 700             }
 701
 702             // We've checked all strings without a partial match.
 703             // If we have full matches, return the longest one.
 704             if (highWaterLength != 0) {
 705                 offset += forward ? highWaterLength : -highWaterLength;
 706                 return U_MATCH;
 707             }
 708         }
 709         return UnicodeFilter::matches(text, offset, limit, incremental);
 710     }
 711 }
 712
 713 /**
 714  * Returns the longest match for s in text at the given position.
 715  * If limit > start then match forward from start+1 to limit
 716  * matching all characters except s.charAt(0).  If limit < start,
 717  * go backward starting from start-1 matching all characters
 718  * except s.charAt(s.length()-1).  This method assumes that the
 719  * first character, text.charAt(start), matches s, so it does not
 720  * check it.
 721  * @param text the text to match
 722  * @param start the first character to match.  In the forward
 723  * direction, text.charAt(start) is matched against s.charAt(0).
 724  * In the reverse direction, it is matched against
 725  * s.charAt(s.length()-1).
 726  * @param limit the limit offset for matching, either last+1 in
 727  * the forward direction, or last-1 in the reverse direction,
 728  * where last is the index of the last character to match.
 729  * @return If part of s matches up to the limit, return |limit -
 730  * start|.  If all of s matches before reaching the limit, return
 731  * s.length().  If there is a mismatch between s and text, return
 732  * 0
 733  */
 734 int32_t UnicodeSet::matchRest(const Replaceable& text,
 735                               int32_t start, int32_t limit,
 736                               const UnicodeString& s) {
 737     int32_t i;
 738     int32_t maxLen;
 739     int32_t slen = s.length();
 740     if (start < limit) {
 741         maxLen = limit - start;
 742         if (maxLen > slen) maxLen = slen;
 743         for (i = 1; i < maxLen; ++i) {
 744             if (text.charAt(start + i) != s.charAt(i)) return 0;
 745         }
 746     } else {
 747         maxLen = start - limit;
 748         if (maxLen > slen) maxLen = slen;
 749         --slen; // <=> slen = s.length() - 1;
 750         for (i = 1; i < maxLen; ++i) {
 751             if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
 752         }
 753     }
 754     return maxLen;
 755 }
 756
 757 /**
 758  * Implement of UnicodeMatcher
 759  */
 760 void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
 761     toUnionTo.addAll(*this);
 762 }
 763
 764 /**
 765  * Returns the index of the given character within this set, where
 766  * the set is ordered by ascending code point.  If the character
 767  * is not in this set, return -1.  The inverse of this method is
 768  * <code>charAt()</code>.
 769  * @return an index from 0..size()-1, or -1
 770  */
 771 int32_t UnicodeSet::indexOf(UChar32 c) const {
 772     if (c < MIN_VALUE || c > MAX_VALUE) {
 773         return -1;
 774     }
 775     int32_t i = 0;
 776     int32_t n = 0;
 777     for (;;) {
 778         UChar32 start = list[i++];
 779         if (c < start) {
 780             return -1;
 781         }
 782         UChar32 limit = list[i++];
 783         if (c < limit) {
 784             return n + c - start;
 785         }
 786         n += limit - start;
 787     }
 788 }
 789
 790 /**
 791  * Returns the character at the given index within this set, where
 792  * the set is ordered by ascending code point.  If the index is
 793  * out of range, return (UChar32)-1.  The inverse of this method is
 794  * <code>indexOf()</code>.
 795  * @param index an index from 0..size()-1
 796  * @return the character at the given index, or (UChar32)-1.
 797  */
 798 UChar32 UnicodeSet::charAt(int32_t index) const {
 799     if (index >= 0) {
 800         // len2 is the largest even integer <= len, that is, it is len
 801         // for even values and len-1 for odd values.  With odd values
 802         // the last entry is UNICODESET_HIGH.
 803         int32_t len2 = len & ~1;
 804         for (int32_t i=0; i < len2;) {
 805             UChar32 start = list[i++];
 806             int32_t count = list[i++] - start;
 807             if (index < count) {
 808                 return (UChar32)(start + index);
 809             }
 810             index -= count;
 811         }
 812     }
 813     return (UChar32)-1;
 814 }
 815
 816 /**
 817  * Make this object represent the range <code>start - end</code>.
 818  * If <code>end > start</code> then this object is set to an
 819  * an empty range.
 820  *
 821  * @param start first character in the set, inclusive
 822  * @rparam end last character in the set, inclusive
 823  */
 824 UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
 825     clear();
 826     complement(start, end);
 827     return *this;
 828 }
 829
 830 /**
 831  * Adds the specified range to this set if it is not already
 832  * present.  If this set already contains the specified range,
 833  * the call leaves this set unchanged.  If <code>end > start</code>
 834  * then an empty range is added, leaving the set unchanged.
 835  *
 836  * @param start first character, inclusive, of range to be added
 837  * to this set.
 838  * @param end last character, inclusive, of range to be added
 839  * to this set.
 840  */
 841 UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
 842     if (pinCodePoint(start) < pinCodePoint(end)) {
 843         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
 844         add(range, 2, 0);
 845     } else if (start == end) {
 846         add(start);
 847     }
 848     return *this;
 849 }
 850
 851 // #define DEBUG_US_ADD
 852
 853 #ifdef DEBUG_US_ADD
 854 #include <stdio.h>
 855 void dump(UChar32 c) {
 856     if (c <= 0xFF) {
 857         printf("%c", (char)c);
 858     } else {
 859         printf("U+%04X", c);
 860     }
 861 }
 862 void dump(const UChar32* list, int32_t len) {
 863     printf("[");
 864     for (int32_t i=0; i<len; ++i) {
 865         if (i != 0) printf(", ");
 866         dump(list[i]);
 867     }
 868     printf("]");
 869 }
 870 #endif
 871
 872 /**
 873  * Adds the specified character to this set if it is not already
 874  * present.  If this set already contains the specified character,
 875  * the call leaves this set unchanged.
 876  */
 877 UnicodeSet& UnicodeSet::add(UChar32 c) {
 878     // find smallest i such that c < list[i]
 879     // if odd, then it is IN the set
 880     // if even, then it is OUT of the set
 881     int32_t i = findCodePoint(pinCodePoint(c));
 882
 883     // already in set?
 884     if ((i & 1) != 0  || isFrozen() || isBogus()) return *this;
 885
 886     // HIGH is 0x110000
 887     // assert(list[len-1] == HIGH);
 888
 889     // empty = [HIGH]
 890     // [start_0, limit_0, start_1, limit_1, HIGH]
 891
 892     // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
 893     //                             ^
 894     //                             list[i]
 895
 896     // i == 0 means c is before the first range
 897
 898 #ifdef DEBUG_US_ADD
 899     printf("Add of ");
 900     dump(c);
 901     printf(" found at %d", i);
 902     printf(": ");
 903     dump(list, len);
 904     printf(" => ");
 905 #endif
 906
 907     if (c == list[i]-1) {
 908         // c is before start of next range
 909         list[i] = c;
 910         // if we touched the HIGH mark, then add a new one
 911         if (c == (UNICODESET_HIGH - 1)) {
 912             UErrorCode status = U_ZERO_ERROR;
 913             ensureCapacity(len+1, status);
 914             if (U_FAILURE(status)) {
 915                 return *this; // There is no way to report this error :-(
 916             }
 917             list[len++] = UNICODESET_HIGH;
 918         }
 919         if (i > 0 && c == list[i-1]) {
 920             // collapse adjacent ranges
 921
 922             // [..., start_k-1, c, c, limit_k, ..., HIGH]
 923             //                     ^
 924             //                     list[i]
 925
 926             //for (int32_t k=i-1; k<len-2; ++k) {
 927             //    list[k] = list[k+2];
 928             //}
 929             UChar32* dst = list + i - 1;
 930             UChar32* src = dst + 2;
 931             UChar32* srclimit = list + len;
 932             while (src < srclimit) *(dst++) = *(src++);
 933
 934             len -= 2;
 935         }
 936     }
 937
 938     else if (i > 0 && c == list[i-1]) {
 939         // c is after end of prior range
 940         list[i-1]++;
 941         // no need to check for collapse here
 942     }
 943
 944     else {
 945         // At this point we know the new char is not adjacent to
 946         // any existing ranges, and it is not 10FFFF.
 947
 948
 949         // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
 950         //                             ^
 951         //                             list[i]
 952
 953         // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
 954         //                             ^
 955         //                             list[i]
 956
 957         UErrorCode status = U_ZERO_ERROR;
 958         ensureCapacity(len+2, status);
 959         if (U_FAILURE(status)) {
 960             return *this; // There is no way to report this error :-(
 961         }
 962
 963         //for (int32_t k=len-1; k>=i; --k) {
 964         //    list[k+2] = list[k];
 965         //}
 966         UChar32* src = list + len;
 967         UChar32* dst = src + 2;
 968         UChar32* srclimit = list + i;
 969         while (src > srclimit) *(--dst) = *(--src);
 970
 971         list[i] = c;
 972         list[i+1] = c+1;
 973         len += 2;
 974     }
 975
 976 #ifdef DEBUG_US_ADD
 977     dump(list, len);
 978     printf("\n");
 979
 980     for (i=1; i<len; ++i) {
 981         if (list[i] <= list[i-1]) {
 982             // Corrupt array!
 983             printf("ERROR: list has been corrupted\n");
 984             exit(1);
 985         }
 986     }
 987 #endif
 988
 989     releasePattern();
 990     return *this;
 991 }
 992
 993 /**
 994  * Adds the specified multicharacter to this set if it is not already
 995  * present.  If this set already contains the multicharacter,
 996  * the call leaves this set unchanged.
 997  * Thus "ch" => {"ch"}
 998  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
 999  * @param s the source string
1000  * @return the modified set, for chaining
1001  */
1002 UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
1003     if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1004     int32_t cp = getSingleCP(s);
1005     if (cp < 0) {
1006         if (!strings->contains((void*) &s)) {
1007             _add(s);
1008             releasePattern();
1009         }
1010     } else {
1011         add((UChar32)cp);
1012     }
1013     return *this;
1014 }
1015
1016 /**
1017  * Adds the given string, in order, to 'strings'.  The given string
1018  * must have been checked by the caller to not be empty and to not
1019  * already be in 'strings'.
1020  */
1021 void UnicodeSet::_add(const UnicodeString& s) {
1022     if (isFrozen() || isBogus()) {
1023         return;
1024     }
1025     UnicodeString* t = new UnicodeString(s);
1026     if (t == NULL) { // Check for memory allocation error.
1027         setToBogus();
1028         return;
1029     }
1030     UErrorCode ec = U_ZERO_ERROR;
1031     strings->sortedInsert(t, compareUnicodeString, ec);
1032     if (U_FAILURE(ec)) {
1033         setToBogus();
1034         delete t;
1035     }
1036 }
1037
1038 /**
1039  * @return a code point IF the string consists of a single one.
1040  * otherwise returns -1.
1041  * @param string to test
1042  */
1043 int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
1044     //if (s.length() < 1) {
1045     //    throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
1046     //}
1047     if (s.length() > 2) return -1;
1048     if (s.length() == 1) return s.charAt(0);
1049
1050     // at this point, len = 2
1051     UChar32 cp = s.char32At(0);
1052     if (cp > 0xFFFF) { // is surrogate pair
1053         return cp;
1054     }
1055     return -1;
1056 }
1057
1058 /**
1059  * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1060  * If this set already any particular character, it has no effect on that character.
1061  * @param the source string
1062  * @return the modified set, for chaining
1063  */
1064 UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
1065     UChar32 cp;
1066     for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1067         cp = s.char32At(i);
1068         add(cp);
1069     }
1070     return *this;
1071 }
1072
1073 /**
1074  * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1075  * If this set already any particular character, it has no effect on that character.
1076  * @param the source string
1077  * @return the modified set, for chaining
1078  */
1079 UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
1080     UnicodeSet set;
1081     set.addAll(s);
1082     retainAll(set);
1083     return *this;
1084 }
1085
1086 /**
1087  * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1088  * If this set already any particular character, it has no effect on that character.
1089  * @param the source string
1090  * @return the modified set, for chaining
1091  */
1092 UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
1093     UnicodeSet set;
1094     set.addAll(s);
1095     complementAll(set);
1096     return *this;
1097 }
1098
1099 /**
1100  * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1101  * If this set already any particular character, it has no effect on that character.
1102  * @param the source string
1103  * @return the modified set, for chaining
1104  */
1105 UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
1106     UnicodeSet set;
1107     set.addAll(s);
1108     removeAll(set);
1109     return *this;
1110 }
1111
1112 UnicodeSet& UnicodeSet::removeAllStrings() {
1113     strings->removeAllElements();
1114     return *this;
1115 }
1116
1117
1118 /**
1119  * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1120  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1121  * @param the source string
1122  * @return a newly created set containing the given string
1123  */
1124 UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
1125     UnicodeSet *set = new UnicodeSet();
1126     if (set != NULL) { // Check for memory allocation error.
1127         set->add(s);
1128     }
1129     return set;
1130 }
1131
1132
1133 /**
1134  * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1135  * @param the source string
1136  * @return a newly created set containing the given characters
1137  */
1138 UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
1139     UnicodeSet *set = new UnicodeSet();
1140     if (set != NULL) { // Check for memory allocation error.
1141         set->addAll(s);
1142     }
1143     return set;
1144 }
1145
1146 /**
1147  * Retain only the elements in this set that are contained in the
1148  * specified range.  If <code>end > start</code> then an empty range is
1149  * retained, leaving the set empty.
1150  *
1151  * @param start first character, inclusive, of range to be retained
1152  * to this set.
1153  * @param end last character, inclusive, of range to be retained
1154  * to this set.
1155  */
1156 UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
1157     if (pinCodePoint(start) <= pinCodePoint(end)) {
1158         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1159         retain(range, 2, 0);
1160     } else {
1161         clear();
1162     }
1163     return *this;
1164 }
1165
1166 UnicodeSet& UnicodeSet::retain(UChar32 c) {
1167     return retain(c, c);
1168 }
1169
1170 /**
1171  * Removes the specified range from this set if it is present.
1172  * The set will not contain the specified range once the call
1173  * returns.  If <code>end > start</code> then an empty range is
1174  * removed, leaving the set unchanged.
1175  *
1176  * @param start first character, inclusive, of range to be removed
1177  * from this set.
1178  * @param end last character, inclusive, of range to be removed
1179  * from this set.
1180  */
1181 UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
1182     if (pinCodePoint(start) <= pinCodePoint(end)) {
1183         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1184         retain(range, 2, 2);
1185     }
1186     return *this;
1187 }
1188
1189 /**
1190  * Removes the specified character from this set if it is present.
1191  * The set will not contain the specified range once the call
1192  * returns.
1193  */
1194 UnicodeSet& UnicodeSet::remove(UChar32 c) {
1195     return remove(c, c);
1196 }
1197
1198 /**
1199  * Removes the specified string from this set if it is present.
1200  * The set will not contain the specified character once the call
1201  * returns.
1202  * @param the source string
1203  * @return the modified set, for chaining
1204  */
1205 UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
1206     if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1207     int32_t cp = getSingleCP(s);
1208     if (cp < 0) {
1209         strings->removeElement((void*) &s);
1210         releasePattern();
1211     } else {
1212         remove((UChar32)cp, (UChar32)cp);
1213     }
1214     return *this;
1215 }
1216
1217 /**
1218  * Complements the specified range in this set.  Any character in
1219  * the range will be removed if it is in this set, or will be
1220  * added if it is not in this set.  If <code>end > start</code>
1221  * then an empty range is xor'ed, leaving the set unchanged.
1222  *
1223  * @param start first character, inclusive, of range to be removed
1224  * from this set.
1225  * @param end last character, inclusive, of range to be removed
1226  * from this set.
1227  */
1228 UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
1229     if (isFrozen() || isBogus()) {
1230         return *this;
1231     }
1232     if (pinCodePoint(start) <= pinCodePoint(end)) {
1233         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1234         exclusiveOr(range, 2, 0);
1235     }
1236     releasePattern();
1237     return *this;
1238 }
1239
1240 UnicodeSet& UnicodeSet::complement(UChar32 c) {
1241     return complement(c, c);
1242 }
1243
1244 /**
1245  * This is equivalent to
1246  * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1247  */
1248 UnicodeSet& UnicodeSet::complement(void) {
1249     if (isFrozen() || isBogus()) {
1250         return *this;
1251     }
1252     UErrorCode status = U_ZERO_ERROR;
1253     if (list[0] == UNICODESET_LOW) {
1254         ensureBufferCapacity(len-1, status);
1255         if (U_FAILURE(status)) {
1256             return *this;
1257         }
1258         uprv_memcpy(buffer, list + 1, (size_t)(len-1)*sizeof(UChar32));
1259         --len;
1260     } else {
1261         ensureBufferCapacity(len+1, status);
1262         if (U_FAILURE(status)) {
1263             return *this;
1264         }
1265         uprv_memcpy(buffer + 1, list, (size_t)len*sizeof(UChar32));
1266         buffer[0] = UNICODESET_LOW;
1267         ++len;
1268     }
1269     swapBuffers();
1270     releasePattern();
1271     return *this;
1272 }
1273
1274 /**
1275  * Complement the specified string in this set.
1276  * The set will not contain the specified string once the call
1277  * returns.
1278  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1279  * @param s the string to complement
1280  * @return this object, for chaining
1281  */
1282 UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
1283     if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1284     int32_t cp = getSingleCP(s);
1285     if (cp < 0) {
1286         if (strings->contains((void*) &s)) {
1287             strings->removeElement((void*) &s);
1288         } else {
1289             _add(s);
1290         }
1291         releasePattern();
1292     } else {
1293         complement((UChar32)cp, (UChar32)cp);
1294     }
1295     return *this;
1296 }
1297
1298 /**
1299  * Adds all of the elements in the specified set to this set if
1300  * they're not already present.  This operation effectively
1301  * modifies this set so that its value is the <i>union</i> of the two
1302  * sets.  The behavior of this operation is unspecified if the specified
1303  * collection is modified while the operation is in progress.
1304  *
1305  * @param c set whose elements are to be added to this set.
1306  * @see #add(char, char)
1307  */
1308 UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
1309     if ( c.len>0 && c.list!=NULL ) {
1310         add(c.list, c.len, 0);
1311     }
1312
1313     // Add strings in order
1314     if ( c.strings!=NULL ) {
1315         for (int32_t i=0; i<c.strings->size(); ++i) {
1316             const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
1317             if (!strings->contains((void*) s)) {
1318                 _add(*s);
1319             }
1320         }
1321     }
1322     return *this;
1323 }
1324
1325 /**
1326  * Retains only the elements in this set that are contained in the
1327  * specified set.  In other words, removes from this set all of
1328  * its elements that are not contained in the specified set.  This
1329  * operation effectively modifies this set so that its value is
1330  * the <i>intersection</i> of the two sets.
1331  *
1332  * @param c set that defines which elements this set will retain.
1333  */
1334 UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
1335     if (isFrozen() || isBogus()) {
1336         return *this;
1337     }
1338     retain(c.list, c.len, 0);
1339     strings->retainAll(*c.strings);
1340     return *this;
1341 }
1342
1343 /**
1344  * Removes from this set all of its elements that are contained in the
1345  * specified set.  This operation effectively modifies this
1346  * set so that its value is the <i>asymmetric set difference</i> of
1347  * the two sets.
1348  *
1349  * @param c set that defines which elements will be removed from
1350  *          this set.
1351  */
1352 UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
1353     if (isFrozen() || isBogus()) {
1354         return *this;
1355     }
1356     retain(c.list, c.len, 2);
1357     strings->removeAll(*c.strings);
1358     return *this;
1359 }
1360
1361 /**
1362  * Complements in this set all elements contained in the specified
1363  * set.  Any character in the other set will be removed if it is
1364  * in this set, or will be added if it is not in this set.
1365  *
1366  * @param c set that defines which elements will be xor'ed from
1367  *          this set.
1368  */
1369 UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
1370     if (isFrozen() || isBogus()) {
1371         return *this;
1372     }
1373     exclusiveOr(c.list, c.len, 0);
1374
1375     for (int32_t i=0; i<c.strings->size(); ++i) {
1376         void* e = c.strings->elementAt(i);
1377         if (!strings->removeElement(e)) {
1378             _add(*(const UnicodeString*)e);
1379         }
1380     }
1381     return *this;
1382 }
1383
1384 /**
1385  * Removes all of the elements from this set.  This set will be
1386  * empty after this call returns.
1387  */
1388 UnicodeSet& UnicodeSet::clear(void) {
1389     if (isFrozen()) {
1390         return *this;
1391     }
1392     if (list != NULL) {
1393         list[0] = UNICODESET_HIGH;
1394     }
1395     len = 1;
1396     releasePattern();
1397     if (strings != NULL) {
1398         strings->removeAllElements();
1399     }
1400     if (list != NULL && strings != NULL) {
1401         // Remove bogus
1402         fFlags = 0;
1403     }
1404     return *this;
1405 }
1406
1407 /**
1408  * Iteration method that returns the number of ranges contained in
1409  * this set.
1410  * @see #getRangeStart
1411  * @see #getRangeEnd
1412  */
1413 int32_t UnicodeSet::getRangeCount() const {
1414     return len/2;
1415 }
1416
1417 /**
1418  * Iteration method that returns the first character in the
1419  * specified range of this set.
1420  * @see #getRangeCount
1421  * @see #getRangeEnd
1422  */
1423 UChar32 UnicodeSet::getRangeStart(int32_t index) const {
1424     return list[index*2];
1425 }
1426
1427 /**
1428  * Iteration method that returns the last character in the
1429  * specified range of this set.
1430  * @see #getRangeStart
1431  * @see #getRangeEnd
1432  */
1433 UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
1434     return list[index*2 + 1] - 1;
1435 }
1436
1437 int32_t UnicodeSet::getStringCount() const {
1438     return strings->size();
1439 }
1440
1441 const UnicodeString* UnicodeSet::getString(int32_t index) const {
1442     return (const UnicodeString*) strings->elementAt(index);
1443 }
1444
1445 /**
1446  * Reallocate this objects internal structures to take up the least
1447  * possible space, without changing this object's value.
1448  */
1449 UnicodeSet& UnicodeSet::compact() {
1450     if (isFrozen() || isBogus()) {
1451         return *this;
1452     }
1453     // Delete buffer first to defragment memory less.
1454     if (buffer != NULL) {
1455         uprv_free(buffer);
1456         buffer = NULL;
1457     }
1458     if (len < capacity) {
1459         // Make the capacity equal to len or 1.
1460         // We don't want to realloc of 0 size.
1461         int32_t newCapacity = len + (len == 0);
1462         UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity);
1463         if (temp) {
1464             list = temp;
1465             capacity = newCapacity;
1466         }
1467         // else what the heck happened?! We allocated less memory!
1468         // Oh well. We'll keep our original array.
1469     }
1470     return *this;
1471 }
1472
1473 #ifdef DEBUG_SERIALIZE
1474 #include <stdio.h>
1475 #endif
1476
1477 /**
1478  * Deserialize constructor.
1479  */
1480 UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, UErrorCode &ec)
1481   : len(1), capacity(1+START_EXTRA), list(0), bmpSet(0), buffer(0),
1482     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
1483     fFlags(0) {
1484
1485   if(U_FAILURE(ec)) {
1486     setToBogus();
1487     return;
1488   }
1489
1490   if( (serialization != kSerialized)
1491       || (data==NULL)
1492       || (dataLen < 1)) {
1493     ec = U_ILLEGAL_ARGUMENT_ERROR;
1494     setToBogus();
1495     return;
1496   }
1497
1498   allocateStrings(ec);
1499   if (U_FAILURE(ec)) {
1500     setToBogus();
1501     return;
1502   }
1503
1504   // bmp?
1505   int32_t headerSize = ((data[0]&0x8000)) ?2:1;
1506   int32_t bmpLength = (headerSize==1)?data[0]:data[1];
1507
1508   len = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength;
1509 #ifdef DEBUG_SERIALIZE
1510   printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,len, data[0],data[1],data[2],data[3]);
1511 #endif
1512   capacity = len+1;
1513   list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
1514   if(!list || U_FAILURE(ec)) {
1515     setToBogus();
1516     return;
1517   }
1518   // copy bmp
1519   int32_t i;
1520   for(i = 0; i< bmpLength;i++) {
1521     list[i] = data[i+headerSize];
1522 #ifdef DEBUG_SERIALIZE
1523     printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]);
1524 #endif
1525   }
1526   // copy smp
1527   for(i=bmpLength;i<len;i++) {
1528     list[i] = ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+0] << 16) +
1529               ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+1]);
1530 #ifdef DEBUG_SERIALIZE
1531     printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]);
1532 #endif
1533   }
1534   // terminator
1535   list[len++]=UNICODESET_HIGH;
1536 }
1537
1538
1539 int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
1540     int32_t bmpLength, length, destLength;
1541
1542     if (U_FAILURE(ec)) {
1543         return 0;
1544     }
1545
1546     if (destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1547         ec=U_ILLEGAL_ARGUMENT_ERROR;
1548         return 0;
1549     }
1550
1551     /* count necessary 16-bit units */
1552     length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
1553     // assert(length>=0);
1554     if (length==0) {
1555         /* empty set */
1556         if (destCapacity>0) {
1557             *dest=0;
1558         } else {
1559             ec=U_BUFFER_OVERFLOW_ERROR;
1560         }
1561         return 1;
1562     }
1563     /* now length>0 */
1564
1565     if (this->list[length-1]<=0xffff) {
1566         /* all BMP */
1567         bmpLength=length;
1568     } else if (this->list[0]>=0x10000) {
1569         /* all supplementary */
1570         bmpLength=0;
1571         length*=2;
1572     } else {
1573         /* some BMP, some supplementary */
1574         for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
1575         length=bmpLength+2*(length-bmpLength);
1576     }
1577 #ifdef DEBUG_SERIALIZE
1578     printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len);
1579 #endif
1580     /* length: number of 16-bit array units */
1581     if (length>0x7fff) {
1582         /* there are only 15 bits for the length in the first serialized word */
1583         ec=U_INDEX_OUTOFBOUNDS_ERROR;
1584         return 0;
1585     }
1586
1587     /*
1588      * total serialized length:
1589      * number of 16-bit array units (length) +
1590      * 1 length unit (always) +
1591      * 1 bmpLength unit (if there are supplementary values)
1592      */
1593     destLength=length+((length>bmpLength)?2:1);
1594     if (destLength<=destCapacity) {
1595         const UChar32 *p;
1596         int32_t i;
1597
1598 #ifdef DEBUG_SERIALIZE
1599         printf("writeHdr\n");
1600 #endif
1601         *dest=(uint16_t)length;
1602         if (length>bmpLength) {
1603             *dest|=0x8000;
1604             *++dest=(uint16_t)bmpLength;
1605         }
1606         ++dest;
1607
1608         /* write the BMP part of the array */
1609         p=this->list;
1610         for (i=0; i<bmpLength; ++i) {
1611 #ifdef DEBUG_SERIALIZE
1612           printf("writebmp: %x\n", (int)*p);
1613 #endif
1614             *dest++=(uint16_t)*p++;
1615         }
1616
1617         /* write the supplementary part of the array */
1618         for (; i<length; i+=2) {
1619 #ifdef DEBUG_SERIALIZE
1620           printf("write32: %x\n", (int)*p);
1621 #endif
1622             *dest++=(uint16_t)(*p>>16);
1623             *dest++=(uint16_t)*p++;
1624         }
1625     } else {
1626         ec=U_BUFFER_OVERFLOW_ERROR;
1627     }
1628     return destLength;
1629 }
1630
1631 //----------------------------------------------------------------
1632 // Implementation: Utility methods
1633 //----------------------------------------------------------------
1634
1635 /**
1636  * Allocate our strings vector and return TRUE if successful.
1637  */
1638 UBool UnicodeSet::allocateStrings(UErrorCode &status) {
1639     if (U_FAILURE(status)) {
1640         return FALSE;
1641     }
1642     strings = new UVector(uprv_deleteUObject,
1643                           uhash_compareUnicodeString, 1, status);
1644     if (strings == NULL) { // Check for memory allocation error.
1645         status = U_MEMORY_ALLOCATION_ERROR;
1646         return FALSE;
1647     }
1648     if (U_FAILURE(status)) {
1649         delete strings;
1650         strings = NULL;
1651         return FALSE;
1652     }
1653     return TRUE;
1654 }
1655
1656 void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) {
1657     if (newLen <= capacity)
1658         return;
1659     UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA));
1660     if (temp == NULL) {
1661         ec = U_MEMORY_ALLOCATION_ERROR;
1662         setToBogus();
1663         return;
1664     }
1665     list = temp;
1666     capacity = newLen + GROW_EXTRA;
1667     // else we keep the original contents on the memory failure.
1668 }
1669
1670 void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) {
1671     if (buffer != NULL && newLen <= bufferCapacity)
1672         return;
1673     UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA));
1674     if (temp == NULL) {
1675         ec = U_MEMORY_ALLOCATION_ERROR;
1676         setToBogus();
1677         return;
1678     }
1679     buffer = temp;
1680     bufferCapacity = newLen + GROW_EXTRA;
1681     // else we keep the original contents on the memory failure.
1682 }
1683
1684 /**
1685  * Swap list and buffer.
1686  */
1687 void UnicodeSet::swapBuffers(void) {
1688     // swap list and buffer
1689     UChar32* temp = list;
1690     list = buffer;
1691     buffer = temp;
1692
1693     int32_t c = capacity;
1694     capacity = bufferCapacity;
1695     bufferCapacity = c;
1696 }
1697
1698 void UnicodeSet::setToBogus() {
1699     clear(); // Remove everything in the set.
1700     fFlags = kIsBogus;
1701 }
1702
1703 //----------------------------------------------------------------
1704 // Implementation: Fundamental operators
1705 //----------------------------------------------------------------
1706
1707 static inline UChar32 max(UChar32 a, UChar32 b) {
1708     return (a > b) ? a : b;
1709 }
1710
1711 // polarity = 0, 3 is normal: x xor y
1712 // polarity = 1, 2: x xor ~y == x === y
1713
1714 void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
1715     if (isFrozen() || isBogus()) {
1716         return;
1717     }
1718     UErrorCode status = U_ZERO_ERROR;
1719     ensureBufferCapacity(len + otherLen, status);
1720     if (U_FAILURE(status)) {
1721         return;
1722     }
1723
1724     int32_t i = 0, j = 0, k = 0;
1725     UChar32 a = list[i++];
1726     UChar32 b;
1727     if (polarity == 1 || polarity == 2) {
1728         b = UNICODESET_LOW;
1729         if (other[j] == UNICODESET_LOW) { // skip base if already LOW
1730             ++j;
1731             b = other[j];
1732         }
1733     } else {
1734         b = other[j++];
1735     }
1736     // simplest of all the routines
1737     // sort the values, discarding identicals!
1738     for (;;) {
1739         if (a < b) {
1740             buffer[k++] = a;
1741             a = list[i++];
1742         } else if (b < a) {
1743             buffer[k++] = b;
1744             b = other[j++];
1745         } else if (a != UNICODESET_HIGH) { // at this point, a == b
1746             // discard both values!
1747             a = list[i++];
1748             b = other[j++];
1749         } else { // DONE!
1750             buffer[k++] = UNICODESET_HIGH;
1751             len = k;
1752             break;
1753         }
1754     }
1755     swapBuffers();
1756     releasePattern();
1757 }
1758
1759 // polarity = 0 is normal: x union y
1760 // polarity = 2: x union ~y
1761 // polarity = 1: ~x union y
1762 // polarity = 3: ~x union ~y
1763
1764 void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
1765     if (isFrozen() || isBogus() || other==NULL) {
1766         return;
1767     }
1768     UErrorCode status = U_ZERO_ERROR;
1769     ensureBufferCapacity(len + otherLen, status);
1770     if (U_FAILURE(status)) {
1771         return;
1772     }
1773
1774     int32_t i = 0, j = 0, k = 0;
1775     UChar32 a = list[i++];
1776     UChar32 b = other[j++];
1777     // change from xor is that we have to check overlapping pairs
1778     // polarity bit 1 means a is second, bit 2 means b is.
1779     for (;;) {
1780         switch (polarity) {
1781           case 0: // both first; take lower if unequal
1782             if (a < b) { // take a
1783                 // Back up over overlapping ranges in buffer[]
1784                 if (k > 0 && a <= buffer[k-1]) {
1785                     // Pick latter end value in buffer[] vs. list[]
1786                     a = max(list[i], buffer[--k]);
1787                 } else {
1788                     // No overlap
1789                     buffer[k++] = a;
1790                     a = list[i];
1791                 }
1792                 i++; // Common if/else code factored out
1793                 polarity ^= 1;
1794             } else if (b < a) { // take b
1795                 if (k > 0 && b <= buffer[k-1]) {
1796                     b = max(other[j], buffer[--k]);
1797                 } else {
1798                     buffer[k++] = b;
1799                     b = other[j];
1800                 }
1801                 j++;
1802                 polarity ^= 2;
1803             } else { // a == b, take a, drop b
1804                 if (a == UNICODESET_HIGH) goto loop_end;
1805                 // This is symmetrical; it doesn't matter if
1806                 // we backtrack with a or b. - liu
1807                 if (k > 0 && a <= buffer[k-1]) {
1808                     a = max(list[i], buffer[--k]);
1809                 } else {
1810                     // No overlap
1811                     buffer[k++] = a;
1812                     a = list[i];
1813                 }
1814                 i++;
1815                 polarity ^= 1;
1816                 b = other[j++];
1817                 polarity ^= 2;
1818             }
1819             break;
1820           case 3: // both second; take higher if unequal, and drop other
1821             if (b <= a) { // take a
1822                 if (a == UNICODESET_HIGH) goto loop_end;
1823                 buffer[k++] = a;
1824             } else { // take b
1825                 if (b == UNICODESET_HIGH) goto loop_end;
1826                 buffer[k++] = b;
1827             }
1828             a = list[i++];
1829             polarity ^= 1;   // factored common code
1830             b = other[j++];
1831             polarity ^= 2;
1832             break;
1833           case 1: // a second, b first; if b < a, overlap
1834             if (a < b) { // no overlap, take a
1835                 buffer[k++] = a; a = list[i++]; polarity ^= 1;
1836             } else if (b < a) { // OVERLAP, drop b
1837                 b = other[j++];
1838                 polarity ^= 2;
1839             } else { // a == b, drop both!
1840                 if (a == UNICODESET_HIGH) goto loop_end;
1841                 a = list[i++];
1842                 polarity ^= 1;
1843                 b = other[j++];
1844                 polarity ^= 2;
1845             }
1846             break;
1847           case 2: // a first, b second; if a < b, overlap
1848             if (b < a) { // no overlap, take b
1849                 buffer[k++] = b;
1850                 b = other[j++];
1851                 polarity ^= 2;
1852             } else  if (a < b) { // OVERLAP, drop a
1853                 a = list[i++];
1854                 polarity ^= 1;
1855             } else { // a == b, drop both!
1856                 if (a == UNICODESET_HIGH) goto loop_end;
1857                 a = list[i++];
1858                 polarity ^= 1;
1859                 b = other[j++];
1860                 polarity ^= 2;
1861             }
1862             break;
1863         }
1864     }
1865  loop_end:
1866     buffer[k++] = UNICODESET_HIGH;    // terminate
1867     len = k;
1868     swapBuffers();
1869     releasePattern();
1870 }
1871
1872 // polarity = 0 is normal: x intersect y
1873 // polarity = 2: x intersect ~y == set-minus
1874 // polarity = 1: ~x intersect y
1875 // polarity = 3: ~x intersect ~y
1876
1877 void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
1878     if (isFrozen() || isBogus()) {
1879         return;
1880     }
1881     UErrorCode status = U_ZERO_ERROR;
1882     ensureBufferCapacity(len + otherLen, status);
1883     if (U_FAILURE(status)) {
1884         return;
1885     }
1886
1887     int32_t i = 0, j = 0, k = 0;
1888     UChar32 a = list[i++];
1889     UChar32 b = other[j++];
1890     // change from xor is that we have to check overlapping pairs
1891     // polarity bit 1 means a is second, bit 2 means b is.
1892     for (;;) {
1893         switch (polarity) {
1894           case 0: // both first; drop the smaller
1895             if (a < b) { // drop a
1896                 a = list[i++];
1897                 polarity ^= 1;
1898             } else if (b < a) { // drop b
1899                 b = other[j++];
1900                 polarity ^= 2;
1901             } else { // a == b, take one, drop other
1902                 if (a == UNICODESET_HIGH) goto loop_end;
1903                 buffer[k++] = a;
1904                 a = list[i++];
1905                 polarity ^= 1;
1906                 b = other[j++];
1907                 polarity ^= 2;
1908             }
1909             break;
1910           case 3: // both second; take lower if unequal
1911             if (a < b) { // take a
1912                 buffer[k++] = a;
1913                 a = list[i++];
1914                 polarity ^= 1;
1915             } else if (b < a) { // take b
1916                 buffer[k++] = b;
1917                 b = other[j++];
1918                 polarity ^= 2;
1919             } else { // a == b, take one, drop other
1920                 if (a == UNICODESET_HIGH) goto loop_end;
1921                 buffer[k++] = a;
1922                 a = list[i++];
1923                 polarity ^= 1;
1924                 b = other[j++];
1925                 polarity ^= 2;
1926             }
1927             break;
1928           case 1: // a second, b first;
1929             if (a < b) { // NO OVERLAP, drop a
1930                 a = list[i++];
1931                 polarity ^= 1;
1932             } else if (b < a) { // OVERLAP, take b
1933                 buffer[k++] = b;
1934                 b = other[j++];
1935                 polarity ^= 2;
1936             } else { // a == b, drop both!
1937                 if (a == UNICODESET_HIGH) goto loop_end;
1938                 a = list[i++];
1939                 polarity ^= 1;
1940                 b = other[j++];
1941                 polarity ^= 2;
1942             }
1943             break;
1944           case 2: // a first, b second; if a < b, overlap
1945             if (b < a) { // no overlap, drop b
1946                 b = other[j++];
1947                 polarity ^= 2;
1948             } else  if (a < b) { // OVERLAP, take a
1949                 buffer[k++] = a;
1950                 a = list[i++];
1951                 polarity ^= 1;
1952             } else { // a == b, drop both!
1953                 if (a == UNICODESET_HIGH) goto loop_end;
1954                 a = list[i++];
1955                 polarity ^= 1;
1956                 b = other[j++];
1957                 polarity ^= 2;
1958             }
1959             break;
1960         }
1961     }
1962  loop_end:
1963     buffer[k++] = UNICODESET_HIGH;    // terminate
1964     len = k;
1965     swapBuffers();
1966     releasePattern();
1967 }
1968
1969 /**
1970  * Append the <code>toPattern()</code> representation of a
1971  * string to the given <code>StringBuffer</code>.
1972  */
1973 void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
1974 escapeUnprintable) {
1975     UChar32 cp;
1976     for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1977         _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
1978     }
1979 }
1980
1981 /**
1982  * Append the <code>toPattern()</code> representation of a
1983  * character to the given <code>StringBuffer</code>.
1984  */
1985 void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
1986 escapeUnprintable) {
1987     if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1988         // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1989         // unprintable
1990         if (ICU_Utility::escapeUnprintable(buf, c)) {
1991             return;
1992         }
1993     }
1994     // Okay to let ':' pass through
1995     switch (c) {
1996     case SET_OPEN:
1997     case SET_CLOSE:
1998     case HYPHEN:
1999     case COMPLEMENT:
2000     case INTERSECTION:
2001     case BACKSLASH:
2002     case OPEN_BRACE:
2003     case CLOSE_BRACE:
2004     case COLON:
2005     case SymbolTable::SYMBOL_REF:
2006         buf.append(BACKSLASH);
2007         break;
2008     default:
2009         // Escape whitespace
2010         if (PatternProps::isWhiteSpace(c)) {
2011             buf.append(BACKSLASH);
2012         }
2013         break;
2014     }
2015     buf.append(c);
2016 }
2017
2018 /**
2019  * Append a string representation of this set to result.  This will be
2020  * a cleaned version of the string passed to applyPattern(), if there
2021  * is one.  Otherwise it will be generated.
2022  */
2023 UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
2024                                       UBool escapeUnprintable) const
2025 {
2026     if (pat != NULL) {
2027         int32_t i;
2028         int32_t backslashCount = 0;
2029         for (i=0; i<patLen; ) {
2030             UChar32 c;
2031             U16_NEXT(pat, i, patLen, c);
2032             if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
2033                 // If the unprintable character is preceded by an odd
2034                 // number of backslashes, then it has been escaped.
2035                 // Before unescaping it, we delete the final
2036                 // backslash.
2037                 if ((backslashCount % 2) == 1) {
2038                     result.truncate(result.length() - 1);
2039                 }
2040                 ICU_Utility::escapeUnprintable(result, c);
2041                 backslashCount = 0;
2042             } else {
2043                 result.append(c);
2044                 if (c == BACKSLASH) {
2045                     ++backslashCount;
2046                 } else {
2047                     backslashCount = 0;
2048                 }
2049             }
2050         }
2051         return result;
2052     }
2053
2054     return _generatePattern(result, escapeUnprintable);
2055 }
2056
2057 /**
2058  * Returns a string representation of this set.  If the result of
2059  * calling this function is passed to a UnicodeSet constructor, it
2060  * will produce another set that is equal to this one.
2061  */
2062 UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
2063                                      UBool escapeUnprintable) const
2064 {
2065     result.truncate(0);
2066     return _toPattern(result, escapeUnprintable);
2067 }
2068
2069 /**
2070  * Generate and append a string representation of this set to result.
2071  * This does not use this.pat, the cleaned up copy of the string
2072  * passed to applyPattern().
2073  */
2074 UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
2075                                             UBool escapeUnprintable) const
2076 {
2077     result.append(SET_OPEN);
2078
2079 //  // Check against the predefined categories.  We implicitly build
2080 //  // up ALL category sets the first time toPattern() is called.
2081 //  for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
2082 //      if (*this == getCategorySet(cat)) {
2083 //          result.append(COLON);
2084 //          result.append(CATEGORY_NAMES, cat*2, 2);
2085 //          return result.append(CATEGORY_CLOSE);
2086 //      }
2087 //  }
2088
2089     int32_t count = getRangeCount();
2090
2091     // If the set contains at least 2 intervals and includes both
2092     // MIN_VALUE and MAX_VALUE, then the inverse representation will
2093     // be more economical.
2094     if (count > 1 &&
2095         getRangeStart(0) == MIN_VALUE &&
2096         getRangeEnd(count-1) == MAX_VALUE) {
2097
2098         // Emit the inverse
2099         result.append(COMPLEMENT);
2100
2101         for (int32_t i = 1; i < count; ++i) {
2102             UChar32 start = getRangeEnd(i-1)+1;
2103             UChar32 end = getRangeStart(i)-1;
2104             _appendToPat(result, start, escapeUnprintable);
2105             if (start != end) {
2106                 if ((start+1) != end) {
2107                     result.append(HYPHEN);
2108                 }
2109                 _appendToPat(result, end, escapeUnprintable);
2110             }
2111         }
2112     }
2113
2114     // Default; emit the ranges as pairs
2115     else {
2116         for (int32_t i = 0; i < count; ++i) {
2117             UChar32 start = getRangeStart(i);
2118             UChar32 end = getRangeEnd(i);
2119             _appendToPat(result, start, escapeUnprintable);
2120             if (start != end) {
2121                 if ((start+1) != end) {
2122                     result.append(HYPHEN);
2123                 }
2124                 _appendToPat(result, end, escapeUnprintable);
2125             }
2126         }
2127     }
2128
2129     for (int32_t i = 0; i<strings->size(); ++i) {
2130         result.append(OPEN_BRACE);
2131         _appendToPat(result,
2132                      *(const UnicodeString*) strings->elementAt(i),
2133                      escapeUnprintable);
2134         result.append(CLOSE_BRACE);
2135     }
2136     return result.append(SET_CLOSE);
2137 }
2138
2139 /**
2140 * Release existing cached pattern
2141 */
2142 void UnicodeSet::releasePattern() {
2143     if (pat) {
2144         uprv_free(pat);
2145         pat = NULL;
2146         patLen = 0;
2147     }
2148 }
2149
2150 /**
2151 * Set the new pattern to cache.
2152 */
2153 void UnicodeSet::setPattern(const UnicodeString& newPat) {
2154     releasePattern();
2155     int32_t newPatLen = newPat.length();
2156     pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar));
2157     if (pat) {
2158         patLen = newPatLen;
2159         newPat.extractBetween(0, patLen, pat);
2160         pat[patLen] = 0;
2161     }
2162     // else we don't care if malloc failed. This was just a nice cache.
2163     // We can regenerate an equivalent pattern later when requested.
2164 }
2165
2166 UnicodeFunctor *UnicodeSet::freeze() {
2167     if(!isFrozen() && !isBogus()) {
2168         // Do most of what compact() does before freezing because
2169         // compact() will not work when the set is frozen.
2170         // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
2171
2172         // Delete buffer first to defragment memory less.
2173         if (buffer != NULL) {
2174             uprv_free(buffer);
2175             buffer = NULL;
2176         }
2177         if (capacity > (len + GROW_EXTRA)) {
2178             // Make the capacity equal to len or 1.
2179             // We don't want to realloc of 0 size.
2180             capacity = len + (len == 0);
2181             list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity);
2182             if (list == NULL) { // Check for memory allocation error.
2183                 setToBogus();
2184                 return this;
2185             }
2186         }
2187
2188         // Optimize contains() and span() and similar functions.
2189         if (!strings->isEmpty()) {
2190             stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
2191             if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) {
2192                 // All strings are irrelevant for span() etc. because
2193                 // all of each string's code points are contained in this set.
2194                 // Do not check needsStringSpanUTF8() because UTF-8 has at most as
2195                 // many relevant strings as UTF-16.
2196                 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
2197                 delete stringSpan;
2198                 stringSpan = NULL;
2199             }
2200         }
2201         if (stringSpan == NULL) {
2202             // No span-relevant strings: Optimize for code point spans.
2203             bmpSet=new BMPSet(list, len);
2204             if (bmpSet == NULL) { // Check for memory allocation error.
2205                 setToBogus();
2206             }
2207         }
2208     }
2209     return this;
2210 }
2211
2212 int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2213     if(length>0 && bmpSet!=NULL) {
2214         return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
2215     }
2216     if(length<0) {
2217         length=u_strlen(s);
2218     }
2219     if(length==0) {
2220         return 0;
2221     }
2222     if(stringSpan!=NULL) {
2223         return stringSpan->span(s, length, spanCondition);
2224     } else if(!strings->isEmpty()) {
2225         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2226                             UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
2227                             UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
2228         UnicodeSetStringSpan strSpan(*this, *strings, which);
2229         if(strSpan.needsStringSpanUTF16()) {
2230             return strSpan.span(s, length, spanCondition);
2231         }
2232     }
2233
2234     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2235         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2236     }
2237
2238     UChar32 c;
2239     int32_t start=0, prev=0;
2240     do {
2241         U16_NEXT(s, start, length, c);
2242         if(spanCondition!=contains(c)) {
2243             break;
2244         }
2245     } while((prev=start)<length);
2246     return prev;
2247 }
2248
2249 int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2250     if(length>0 && bmpSet!=NULL) {
2251         return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
2252     }
2253     if(length<0) {
2254         length=u_strlen(s);
2255     }
2256     if(length==0) {
2257         return 0;
2258     }
2259     if(stringSpan!=NULL) {
2260         return stringSpan->spanBack(s, length, spanCondition);
2261     } else if(!strings->isEmpty()) {
2262         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2263                             UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
2264                             UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
2265         UnicodeSetStringSpan strSpan(*this, *strings, which);
2266         if(strSpan.needsStringSpanUTF16()) {
2267             return strSpan.spanBack(s, length, spanCondition);
2268         }
2269     }
2270
2271     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2272         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2273     }
2274
2275     UChar32 c;
2276     int32_t prev=length;
2277     do {
2278         U16_PREV(s, 0, length, c);
2279         if(spanCondition!=contains(c)) {
2280             break;
2281         }
2282     } while((prev=length)>0);
2283     return prev;
2284 }
2285
2286 int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2287     if(length>0 && bmpSet!=NULL) {
2288         const uint8_t *s0=(const uint8_t *)s;
2289         return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);
2290     }
2291     if(length<0) {
2292         length=(int32_t)uprv_strlen(s);
2293     }
2294     if(length==0) {
2295         return 0;
2296     }
2297     if(stringSpan!=NULL) {
2298         return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
2299     } else if(!strings->isEmpty()) {
2300         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2301                             UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
2302                             UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
2303         UnicodeSetStringSpan strSpan(*this, *strings, which);
2304         if(strSpan.needsStringSpanUTF8()) {
2305             return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
2306         }
2307     }
2308
2309     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2310         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2311     }
2312
2313     UChar32 c;
2314     int32_t start=0, prev=0;
2315     do {
2316         U8_NEXT_OR_FFFD(s, start, length, c);
2317         if(spanCondition!=contains(c)) {
2318             break;
2319         }
2320     } while((prev=start)<length);
2321     return prev;
2322 }
2323
2324 int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2325     if(length>0 && bmpSet!=NULL) {
2326         const uint8_t *s0=(const uint8_t *)s;
2327         return bmpSet->spanBackUTF8(s0, length, spanCondition);
2328     }
2329     if(length<0) {
2330         length=(int32_t)uprv_strlen(s);
2331     }
2332     if(length==0) {
2333         return 0;
2334     }
2335     if(stringSpan!=NULL) {
2336         return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
2337     } else if(!strings->isEmpty()) {
2338         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2339                             UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
2340                             UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
2341         UnicodeSetStringSpan strSpan(*this, *strings, which);
2342         if(strSpan.needsStringSpanUTF8()) {
2343             return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
2344         }
2345     }
2346
2347     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2348         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2349     }
2350
2351     UChar32 c;
2352     int32_t prev=length;
2353     do {
2354         U8_PREV_OR_FFFD(s, 0, length, c);
2355         if(spanCondition!=contains(c)) {
2356             break;
2357         }
2358     } while((prev=length)>0);
2359     return prev;
2360 }
2361
2362 U_NAMESPACE_END