icuSources/common/uniset.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 1999-2004, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   Date        Name        Description
   7 *   10/20/99    alan        Creation.
   8 **********************************************************************
   9 */
  10
  11 #include "unicode/utypes.h"
  12 #include "unicode/uniset.h"
  13 #include "unicode/parsepos.h"
  14 #include "unicode/symtable.h"
  15 #include "ruleiter.h"
  16 #include "cmemory.h"
  17 #include "uhash.h"
  18 #include "util.h"
  19 #include "uvector.h"
  20 #include "charstr.h"
  21 #include "ustrfmt.h"
  22 #include "mutex.h"
  23 #include "uassert.h"
  24 #include "hash.h"
  25
  26 // Define UChar constants using hex for EBCDIC compatibility
  27 // Used #define to reduce private static exports and memory access time.
  28 #define SET_OPEN        ((UChar)0x005B) /*[*/
  29 #define SET_CLOSE       ((UChar)0x005D) /*]*/
  30 #define HYPHEN          ((UChar)0x002D) /*-*/
  31 #define COMPLEMENT      ((UChar)0x005E) /*^*/
  32 #define COLON           ((UChar)0x003A) /*:*/
  33 #define BACKSLASH       ((UChar)0x005C) /*\*/
  34 #define INTERSECTION    ((UChar)0x0026) /*&*/
  35 #define UPPER_U         ((UChar)0x0055) /*U*/
  36 #define LOWER_U         ((UChar)0x0075) /*u*/
  37 #define OPEN_BRACE      ((UChar)123)    /*{*/
  38 #define CLOSE_BRACE     ((UChar)125)    /*}*/
  39 #define UPPER_P         ((UChar)0x0050) /*P*/
  40 #define LOWER_P         ((UChar)0x0070) /*p*/
  41 #define UPPER_N         ((UChar)78)     /*N*/
  42 #define EQUALS          ((UChar)0x003D) /*=*/
  43
  44 // HIGH_VALUE > all valid values. 110000 for codepoints
  45 #define UNICODESET_HIGH 0x0110000
  46
  47 // LOW <= all valid values. ZERO for codepoints
  48 #define UNICODESET_LOW 0x000000
  49
  50 // initial storage. Must be >= 0
  51 #define START_EXTRA 16
  52
  53 // extra amount for growth. Must be >= 0
  54 #define GROW_EXTRA START_EXTRA
  55
  56 U_NAMESPACE_BEGIN
  57
  58 SymbolTable::~SymbolTable() {}
  59
  60 /**
  61  * Minimum value that can be stored in a UnicodeSet.
  62  */
  63 const UChar32 UnicodeSet::MIN_VALUE = UNICODESET_LOW;
  64
  65 /**
  66  * Maximum value that can be stored in a UnicodeSet.
  67  */
  68 const UChar32 UnicodeSet::MAX_VALUE = UNICODESET_HIGH - 1;
  69
  70 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
  71
  72 /**
  73  * Modify the given UChar32 variable so that it is in range, by
  74  * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
  75  * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
  76  * It modifies its argument in-place and also returns it.
  77  */
  78 static inline UChar32 pinCodePoint(UChar32& c) {
  79     if (c < UNICODESET_LOW) {
  80         c = UNICODESET_LOW;
  81     } else if (c > (UNICODESET_HIGH-1)) {
  82         c = (UNICODESET_HIGH-1);
  83     }
  84     return c;
  85 }
  86
  87 //----------------------------------------------------------------
  88 // Debugging
  89 //----------------------------------------------------------------
  90
  91 // DO NOT DELETE THIS CODE.  This code is used to debug memory leaks.
  92 // To enable the debugging, define the symbol DEBUG_MEM in the line
  93 // below.  This will result in text being sent to stdout that looks
  94 // like this:
  95 //   DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
  96 //   DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
  97 // Each line lists a construction (ct) or destruction (dt) event, the
  98 // object address, the number of outstanding objects after the event,
  99 // and the pattern of the object in question.
 100
 101 // #define DEBUG_MEM
 102
 103 #ifdef DEBUG_MEM
 104 #include <stdio.h>
 105 static int32_t _dbgCount = 0;
 106
 107 static inline void _dbgct(UnicodeSet* set) {
 108     UnicodeString str;
 109     set->toPattern(str, TRUE);
 110     char buf[40];
 111     str.extract(0, 39, buf, "");
 112     printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
 113 }
 114
 115 static inline void _dbgdt(UnicodeSet* set) {
 116     UnicodeString str;
 117     set->toPattern(str, TRUE);
 118     char buf[40];
 119     str.extract(0, 39, buf, "");
 120     printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
 121 }
 122
 123 #else
 124
 125 #define _dbgct(set)
 126 #define _dbgdt(set)
 127
 128 #endif
 129
 130 //----------------------------------------------------------------
 131 // UnicodeString in UVector support
 132 //----------------------------------------------------------------
 133
 134 static void U_CALLCONV cloneUnicodeString(UHashTok *dst, UHashTok *src) {
 135     dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
 136 }
 137
 138 static int8_t U_CALLCONV compareUnicodeString(UHashTok t1, UHashTok t2) {
 139     const UnicodeString &a = *(const UnicodeString*)t1.pointer;
 140     const UnicodeString &b = *(const UnicodeString*)t2.pointer;
 141     return a.compare(b);
 142 }
 143
 144 //----------------------------------------------------------------
 145 // Constructors &c
 146 //----------------------------------------------------------------
 147
 148 /**
 149  * Constructs an empty set.
 150  */
 151 UnicodeSet::UnicodeSet() :
 152     len(1), capacity(1 + START_EXTRA), bufferCapacity(0),
 153     list(0), buffer(0), strings(0)
 154 {
 155     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 156     if(list!=NULL){
 157         list[0] = UNICODESET_HIGH;
 158     }
 159     allocateStrings();
 160     _dbgct(this);
 161 }
 162
 163 /**
 164  * Constructs a set containing the given range. If <code>end >
 165  * start</code> then an empty set is created.
 166  *
 167  * @param start first character, inclusive, of range
 168  * @param end last character, inclusive, of range
 169  */
 170 UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
 171     len(1), capacity(1 + START_EXTRA), bufferCapacity(0),
 172     list(0), buffer(0), strings(0)
 173 {
 174     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 175     if(list!=NULL){
 176         list[0] = UNICODESET_HIGH;
 177     }
 178     allocateStrings();
 179     complement(start, end);
 180     _dbgct(this);
 181 }
 182
 183 /**
 184  * Constructs a set that is identical to the given UnicodeSet.
 185  */
 186 UnicodeSet::UnicodeSet(const UnicodeSet& o) :
 187     UnicodeFilter(o),
 188     len(0), capacity(o.len + GROW_EXTRA), bufferCapacity(0),
 189     list(0), buffer(0), strings(0)
 190 {
 191     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 192     if(list!=NULL){
 193         allocateStrings();
 194         *this = o;
 195     }
 196     _dbgct(this);
 197 }
 198
 199 /**
 200  * Destructs the set.
 201  */
 202 UnicodeSet::~UnicodeSet() {
 203     _dbgdt(this); // first!
 204     uprv_free(list);
 205     if (buffer) {
 206         uprv_free(buffer);
 207     }
 208     delete strings;
 209 }
 210
 211 /**
 212  * Assigns this object to be a copy of another.
 213  */
 214 UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
 215     ensureCapacity(o.len);
 216     len = o.len;
 217     uprv_memcpy(list, o.list, len*sizeof(UChar32));
 218     UErrorCode ec = U_ZERO_ERROR;
 219     strings->assign(*o.strings, cloneUnicodeString, ec);
 220     pat = o.pat;
 221     return *this;
 222 }
 223
 224 /**
 225  * Compares the specified object with this set for equality.  Returns
 226  * <tt>true</tt> if the two sets
 227  * have the same size, and every member of the specified set is
 228  * contained in this set (or equivalently, every member of this set is
 229  * contained in the specified set).
 230  *
 231  * @param o set to be compared for equality with this set.
 232  * @return <tt>true</tt> if the specified set is equal to this set.
 233  */
 234 UBool UnicodeSet::operator==(const UnicodeSet& o) const {
 235     if (len != o.len) return FALSE;
 236     for (int32_t i = 0; i < len; ++i) {
 237         if (list[i] != o.list[i]) return FALSE;
 238     }
 239     if (*strings != *o.strings) return FALSE;
 240     return TRUE;
 241 }
 242
 243 /**
 244  * Returns a copy of this object.  All UnicodeMatcher objects have
 245  * to support cloning in order to allow classes using
 246  * UnicodeMatchers, such as Transliterator, to implement cloning.
 247  */
 248 UnicodeFunctor* UnicodeSet::clone() const {
 249     return new UnicodeSet(*this);
 250 }
 251
 252 /**
 253  * Returns the hash code value for this set.
 254  *
 255  * @return the hash code value for this set.
 256  * @see Object#hashCode()
 257  */
 258 int32_t UnicodeSet::hashCode(void) const {
 259     int32_t result = len;
 260     for (int32_t i = 0; i < len; ++i) {
 261         result *= 1000003;
 262         result += list[i];
 263     }
 264     return result;
 265 }
 266
 267 //----------------------------------------------------------------
 268 // Public API
 269 //----------------------------------------------------------------
 270
 271 /**
 272  * Make this object represent the range <code>start - end</code>.
 273  * If <code>end > start</code> then this object is set to an
 274  * an empty range.
 275  *
 276  * @param start first character in the set, inclusive
 277  * @rparam end last character in the set, inclusive
 278  */
 279 UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
 280     clear();
 281     complement(start, end);
 282     return *this;
 283 }
 284
 285 /**
 286  * Returns the number of elements in this set (its cardinality),
 287  * Note than the elements of a set may include both individual
 288  * codepoints and strings.
 289  *
 290  * @return the number of elements in this set (its cardinality).
 291  */
 292 int32_t UnicodeSet::size(void) const {
 293     int32_t n = 0;
 294     int32_t count = getRangeCount();
 295     for (int32_t i = 0; i < count; ++i) {
 296         n += getRangeEnd(i) - getRangeStart(i) + 1;
 297     }
 298     return n + strings->size();
 299 }
 300
 301 /**
 302  * Returns <tt>true</tt> if this set contains no elements.
 303  *
 304  * @return <tt>true</tt> if this set contains no elements.
 305  */
 306 UBool UnicodeSet::isEmpty(void) const {
 307     return len == 1 && strings->size() == 0;
 308 }
 309
 310 /**
 311  * Returns true if this set contains the given character.
 312  * @param c character to be checked for containment
 313  * @return true if the test condition is met
 314  */
 315 UBool UnicodeSet::contains(UChar32 c) const {
 316     // Set i to the index of the start item greater than ch
 317     // We know we will terminate without length test!
 318     // LATER: for large sets, add binary search
 319     //int32_t i = -1;
 320     //for (;;) {
 321     //    if (c < list[++i]) break;
 322     //}
 323     if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
 324         return FALSE;
 325     }
 326     int32_t i = findCodePoint(c);
 327     return ((i & 1) != 0); // return true if odd
 328 }
 329
 330 /**
 331  * Returns the smallest value i such that c < list[i].  Caller
 332  * must ensure that c is a legal value or this method will enter
 333  * an infinite loop.  This method performs a binary search.
 334  * @param c a character in the range MIN_VALUE..MAX_VALUE
 335  * inclusive
 336  * @return the smallest integer i in the range 0..len-1,
 337  * inclusive, such that c < list[i]
 338  */
 339 int32_t UnicodeSet::findCodePoint(UChar32 c) const {
 340     /* Examples:
 341                                        findCodePoint(c)
 342        set              list[]         c=0 1 3 4 7 8
 343        ===              ==============   ===========
 344        []               [110000]         0 0 0 0 0 0
 345        [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
 346        [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
 347        [:Any:]          [0, 110000]      1 1 1 1 1 1
 348      */
 349
 350     // Return the smallest i such that c < list[i].  Assume
 351     // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
 352     if (c < list[0]) return 0;
 353     // High runner test.  c is often after the last range, so an
 354     // initial check for this condition pays off.
 355     if (len >= 2 && c >= list[len-2]) return len-1;
 356     int32_t lo = 0;
 357     int32_t hi = len - 1;
 358     // invariant: c >= list[lo]
 359     // invariant: c < list[hi]
 360     for (;;) {
 361         int32_t i = (lo + hi) >> 1;
 362         if (i == lo) return hi;
 363         if (c < list[i]) {
 364             hi = i;
 365         } else {
 366             lo = i;
 367         }
 368     }
 369     return 0; // To make compiler happy; never reached
 370 }
 371
 372 /**
 373  * Returns true if this set contains every character
 374  * of the given range.
 375  * @param start first character, inclusive, of the range
 376  * @param end last character, inclusive, of the range
 377  * @return true if the test condition is met
 378  */
 379 UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
 380     //int32_t i = -1;
 381     //for (;;) {
 382     //    if (start < list[++i]) break;
 383     //}
 384     int32_t i = findCodePoint(start);
 385     return ((i & 1) != 0 && end < list[i]);
 386 }
 387
 388 /**
 389  * Returns <tt>true</tt> if this set contains the given
 390  * multicharacter string.
 391  * @param s string to be checked for containment
 392  * @return <tt>true</tt> if this set contains the specified string
 393  */
 394 UBool UnicodeSet::contains(const UnicodeString& s) const {
 395     if (s.length() == 0) return FALSE;
 396     int32_t cp = getSingleCP(s);
 397     if (cp < 0) {
 398         return strings->contains((void*) &s);
 399     } else {
 400         return contains((UChar32) cp);
 401     }
 402 }
 403
 404 /**
 405  * Returns true if this set contains all the characters and strings
 406  * of the given set.
 407  * @param c set to be checked for containment
 408  * @return true if the test condition is met
 409  */
 410 UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
 411     // The specified set is a subset if all of its pairs are contained in
 412     // this set.  It's possible to code this more efficiently in terms of
 413     // direct manipulation of the inversion lists if the need arises.
 414     int32_t n = c.getRangeCount();
 415     for (int i=0; i<n; ++i) {
 416         if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
 417             return FALSE;
 418         }
 419     }
 420     if (!strings->containsAll(*c.strings)) return FALSE;
 421     return TRUE;
 422 }
 423
 424 /**
 425  * Returns true if this set contains all the characters
 426  * of the given string.
 427  * @param s string containing characters to be checked for containment
 428  * @return true if the test condition is met
 429  */
 430 UBool UnicodeSet::containsAll(const UnicodeString& s) const {
 431     UChar32 cp;
 432     for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
 433         cp = s.char32At(i);
 434         if (!contains(cp)) return FALSE;
 435     }
 436     return TRUE;
 437 }
 438
 439 /**
 440  * Returns true if this set contains none of the characters
 441  * of the given range.
 442  * @param start first character, inclusive, of the range
 443  * @param end last character, inclusive, of the range
 444  * @return true if the test condition is met
 445  */
 446 UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
 447     //int32_t i = -1;
 448     //for (;;) {
 449     //    if (start < list[++i]) break;
 450     //}
 451     int32_t i = findCodePoint(start);
 452     return ((i & 1) == 0 && end < list[i]);
 453 }
 454
 455 /**
 456  * Returns true if this set contains none of the characters and strings
 457  * of the given set.
 458  * @param c set to be checked for containment
 459  * @return true if the test condition is met
 460  */
 461 UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
 462     // The specified set is a subset if all of its pairs are contained in
 463     // this set.  It's possible to code this more efficiently in terms of
 464     // direct manipulation of the inversion lists if the need arises.
 465     int32_t n = c.getRangeCount();
 466     for (int32_t i=0; i<n; ++i) {
 467         if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
 468             return FALSE;
 469         }
 470     }
 471     if (!strings->containsNone(*c.strings)) return FALSE;
 472     return TRUE;
 473 }
 474
 475 /**
 476  * Returns true if this set contains none of the characters
 477  * of the given string.
 478  * @param s string containing characters to be checked for containment
 479  * @return true if the test condition is met
 480  */
 481 UBool UnicodeSet::containsNone(const UnicodeString& s) const {
 482     UChar32 cp;
 483     for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
 484         cp = s.char32At(i);
 485         if (contains(cp)) return FALSE;
 486     }
 487     return TRUE;
 488 }
 489
 490 /**
 491  * Returns <tt>true</tt> if this set contains any character whose low byte
 492  * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
 493  * indexing.
 494  */
 495 UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
 496     /* The index value v, in the range [0,255], is contained in this set if
 497      * it is contained in any pair of this set.  Pairs either have the high
 498      * bytes equal, or unequal.  If the high bytes are equal, then we have
 499      * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
 500      * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
 501      * Then v is contained if xx <= v || v <= yy.  (This is identical to the
 502      * time zone month containment logic.)
 503      */
 504     int32_t i;
 505     for (i=0; i<getRangeCount(); ++i) {
 506         UChar32 low = getRangeStart(i);
 507         UChar32 high = getRangeEnd(i);
 508         if ((low & ~0xFF) == (high & ~0xFF)) {
 509             if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
 510                 return TRUE;
 511             }
 512         } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
 513             return TRUE;
 514         }
 515     }
 516     if (strings->size() != 0) {
 517         for (i=0; i<strings->size(); ++i) {
 518             const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
 519             //if (s.length() == 0) {
 520             //    // Empty strings match everything
 521             //    return TRUE;
 522             //}
 523             // assert(s.length() != 0); // We enforce this elsewhere
 524             UChar32 c = s.char32At(0);
 525             if ((c & 0xFF) == v) {
 526                 return TRUE;
 527             }
 528         }
 529     }
 530     return FALSE;
 531 }
 532
 533 /**
 534  * Implementation of UnicodeMatcher::matches().  Always matches the
 535  * longest possible multichar string.
 536  */
 537 UMatchDegree UnicodeSet::matches(const Replaceable& text,
 538                                  int32_t& offset,
 539                                  int32_t limit,
 540                                  UBool incremental) {
 541     if (offset == limit) {
 542         // Strings, if any, have length != 0, so we don't worry
 543         // about them here.  If we ever allow zero-length strings
 544         // we much check for them here.
 545         if (contains(U_ETHER)) {
 546             return incremental ? U_PARTIAL_MATCH : U_MATCH;
 547         } else {
 548             return U_MISMATCH;
 549         }
 550     } else {
 551         if (strings->size() != 0) { // try strings first
 552
 553             // might separate forward and backward loops later
 554             // for now they are combined
 555
 556             // TODO Improve efficiency of this, at least in the forward
 557             // direction, if not in both.  In the forward direction we
 558             // can assume the strings are sorted.
 559
 560             int32_t i;
 561             UBool forward = offset < limit;
 562
 563             // firstChar is the leftmost char to match in the
 564             // forward direction or the rightmost char to match in
 565             // the reverse direction.
 566             UChar firstChar = text.charAt(offset);
 567
 568             // If there are multiple strings that can match we
 569             // return the longest match.
 570             int32_t highWaterLength = 0;
 571
 572             for (i=0; i<strings->size(); ++i) {
 573                 const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
 574
 575                 //if (trial.length() == 0) {
 576                 //    return U_MATCH; // null-string always matches
 577                 //}
 578                 // assert(trial.length() != 0); // We ensure this elsewhere
 579
 580                 UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
 581
 582                 // Strings are sorted, so we can optimize in the
 583                 // forward direction.
 584                 if (forward && c > firstChar) break;
 585                 if (c != firstChar) continue;
 586
 587                 int32_t matchLen = matchRest(text, offset, limit, trial);
 588
 589                 if (incremental) {
 590                     int32_t maxLen = forward ? limit-offset : offset-limit;
 591                     if (matchLen == maxLen) {
 592                         // We have successfully matched but only up to limit.
 593                         return U_PARTIAL_MATCH;
 594                     }
 595                 }
 596
 597                 if (matchLen == trial.length()) {
 598                     // We have successfully matched the whole string.
 599                     if (matchLen > highWaterLength) {
 600                         highWaterLength = matchLen;
 601                     }
 602                     // In the forward direction we know strings
 603                     // are sorted so we can bail early.
 604                     if (forward && matchLen < highWaterLength) {
 605                         break;
 606                     }
 607                     continue;
 608                 }
 609             }
 610
 611             // We've checked all strings without a partial match.
 612             // If we have full matches, return the longest one.
 613             if (highWaterLength != 0) {
 614                 offset += forward ? highWaterLength : -highWaterLength;
 615                 return U_MATCH;
 616             }
 617         }
 618         return UnicodeFilter::matches(text, offset, limit, incremental);
 619     }
 620 }
 621
 622 /**
 623  * Returns the longest match for s in text at the given position.
 624  * If limit > start then match forward from start+1 to limit
 625  * matching all characters except s.charAt(0).  If limit < start,
 626  * go backward starting from start-1 matching all characters
 627  * except s.charAt(s.length()-1).  This method assumes that the
 628  * first character, text.charAt(start), matches s, so it does not
 629  * check it.
 630  * @param text the text to match
 631  * @param start the first character to match.  In the forward
 632  * direction, text.charAt(start) is matched against s.charAt(0).
 633  * In the reverse direction, it is matched against
 634  * s.charAt(s.length()-1).
 635  * @param limit the limit offset for matching, either last+1 in
 636  * the forward direction, or last-1 in the reverse direction,
 637  * where last is the index of the last character to match.
 638  * @return If part of s matches up to the limit, return |limit -
 639  * start|.  If all of s matches before reaching the limit, return
 640  * s.length().  If there is a mismatch between s and text, return
 641  * 0
 642  */
 643 int32_t UnicodeSet::matchRest(const Replaceable& text,
 644                               int32_t start, int32_t limit,
 645                               const UnicodeString& s) {
 646     int32_t i;
 647     int32_t maxLen;
 648     int32_t slen = s.length();
 649     if (start < limit) {
 650         maxLen = limit - start;
 651         if (maxLen > slen) maxLen = slen;
 652         for (i = 1; i < maxLen; ++i) {
 653             if (text.charAt(start + i) != s.charAt(i)) return 0;
 654         }
 655     } else {
 656         maxLen = start - limit;
 657         if (maxLen > slen) maxLen = slen;
 658         --slen; // <=> slen = s.length() - 1;
 659         for (i = 1; i < maxLen; ++i) {
 660             if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
 661         }
 662     }
 663     return maxLen;
 664 }
 665
 666 /**
 667  * Implement of UnicodeMatcher
 668  */
 669 void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
 670     toUnionTo.addAll(*this);
 671 }
 672
 673 /**
 674  * Returns the index of the given character within this set, where
 675  * the set is ordered by ascending code point.  If the character
 676  * is not in this set, return -1.  The inverse of this method is
 677  * <code>charAt()</code>.
 678  * @return an index from 0..size()-1, or -1
 679  */
 680 int32_t UnicodeSet::indexOf(UChar32 c) const {
 681     if (c < MIN_VALUE || c > MAX_VALUE) {
 682         return -1;
 683     }
 684     int32_t i = 0;
 685     int32_t n = 0;
 686     for (;;) {
 687         UChar32 start = list[i++];
 688         if (c < start) {
 689             return -1;
 690         }
 691         UChar32 limit = list[i++];
 692         if (c < limit) {
 693             return n + c - start;
 694         }
 695         n += limit - start;
 696     }
 697 }
 698
 699 /**
 700  * Returns the character at the given index within this set, where
 701  * the set is ordered by ascending code point.  If the index is
 702  * out of range, return (UChar32)-1.  The inverse of this method is
 703  * <code>indexOf()</code>.
 704  * @param index an index from 0..size()-1
 705  * @return the character at the given index, or (UChar32)-1.
 706  */
 707 UChar32 UnicodeSet::charAt(int32_t index) const {
 708     if (index >= 0) {
 709         // len2 is the largest even integer <= len, that is, it is len
 710         // for even values and len-1 for odd values.  With odd values
 711         // the last entry is UNICODESET_HIGH.
 712         int32_t len2 = len & ~1;
 713         for (int32_t i=0; i < len2;) {
 714             UChar32 start = list[i++];
 715             int32_t count = list[i++] - start;
 716             if (index < count) {
 717                 return (UChar32)(start + index);
 718             }
 719             index -= count;
 720         }
 721     }
 722     return (UChar32)-1;
 723 }
 724
 725 /**
 726  * Adds the specified range to this set if it is not already
 727  * present.  If this set already contains the specified range,
 728  * the call leaves this set unchanged.  If <code>end > start</code>
 729  * then an empty range is added, leaving the set unchanged.
 730  *
 731  * @param start first character, inclusive, of range to be added
 732  * to this set.
 733  * @param end last character, inclusive, of range to be added
 734  * to this set.
 735  */
 736 UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
 737     if (pinCodePoint(start) < pinCodePoint(end)) {
 738         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
 739         add(range, 2, 0);
 740     } else if (start == end) {
 741         add(start);
 742     }
 743     return *this;
 744 }
 745
 746 // #define DEBUG_US_ADD
 747
 748 #ifdef DEBUG_US_ADD
 749 #include <stdio.h>
 750 void dump(UChar32 c) {
 751     if (c <= 0xFF) {
 752         printf("%c", (char)c);
 753     } else {
 754         printf("U+%04X", c);
 755     }
 756 }
 757 void dump(const UChar32* list, int32_t len) {
 758     printf("[");
 759     for (int32_t i=0; i<len; ++i) {
 760         if (i != 0) printf(", ");
 761         dump(list[i]);
 762     }
 763     printf("]");
 764 }
 765 #endif
 766
 767 /**
 768  * Adds the specified character to this set if it is not already
 769  * present.  If this set already contains the specified character,
 770  * the call leaves this set unchanged.
 771  */
 772 UnicodeSet& UnicodeSet::add(UChar32 c) {
 773     // find smallest i such that c < list[i]
 774     // if odd, then it is IN the set
 775     // if even, then it is OUT of the set
 776     int32_t i = findCodePoint(pinCodePoint(c));
 777
 778     // already in set?
 779     if ((i & 1) != 0) return *this;
 780
 781     // HIGH is 0x110000
 782     // assert(list[len-1] == HIGH);
 783
 784     // empty = [HIGH]
 785     // [start_0, limit_0, start_1, limit_1, HIGH]
 786
 787     // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
 788     //                             ^
 789     //                             list[i]
 790
 791     // i == 0 means c is before the first range
 792
 793 #ifdef DEBUG_US_ADD
 794     printf("Add of ");
 795     dump(c);
 796     printf(" found at %d", i);
 797     printf(": ");
 798     dump(list, len);
 799     printf(" => ");
 800 #endif
 801
 802     if (c == list[i]-1) {
 803         // c is before start of next range
 804         list[i] = c;
 805         // if we touched the HIGH mark, then add a new one
 806         if (c == (UNICODESET_HIGH - 1)) {
 807             ensureCapacity(len+1);
 808             list[len++] = UNICODESET_HIGH;
 809         }
 810         if (i > 0 && c == list[i-1]) {
 811             // collapse adjacent ranges
 812
 813             // [..., start_k-1, c, c, limit_k, ..., HIGH]
 814             //                     ^
 815             //                     list[i]
 816
 817             //for (int32_t k=i-1; k<len-2; ++k) {
 818             //    list[k] = list[k+2];
 819             //}
 820             UChar32* dst = list + i - 1;
 821             UChar32* src = dst + 2;
 822             UChar32* srclimit = list + len;
 823             while (src < srclimit) *(dst++) = *(src++);
 824
 825             len -= 2;
 826         }
 827     }
 828
 829     else if (i > 0 && c == list[i-1]) {
 830         // c is after end of prior range
 831         list[i-1]++;
 832         // no need to check for collapse here
 833     }
 834
 835     else {
 836         // At this point we know the new char is not adjacent to
 837         // any existing ranges, and it is not 10FFFF.
 838
 839
 840         // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
 841         //                             ^
 842         //                             list[i]
 843
 844         // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
 845         //                             ^
 846         //                             list[i]
 847
 848         ensureCapacity(len+2);
 849
 850         //for (int32_t k=len-1; k>=i; --k) {
 851         //    list[k+2] = list[k];
 852         //}
 853         UChar32* src = list + len;
 854         UChar32* dst = src + 2;
 855         UChar32* srclimit = list + i;
 856         while (src > srclimit) *(--dst) = *(--src);
 857
 858         list[i] = c;
 859         list[i+1] = c+1;
 860         len += 2;
 861     }
 862
 863 #ifdef DEBUG_US_ADD
 864     dump(list, len);
 865     printf("\n");
 866
 867     for (i=1; i<len; ++i) {
 868         if (list[i] <= list[i-1]) {
 869             // Corrupt array!
 870             printf("ERROR: list has been corrupted\n");
 871             exit(1);
 872         }
 873     }
 874 #endif
 875
 876     pat.truncate(0);
 877     return *this;
 878 }
 879
 880 /**
 881  * Adds the specified multicharacter to this set if it is not already
 882  * present.  If this set already contains the multicharacter,
 883  * the call leaves this set unchanged.
 884  * Thus "ch" => {"ch"}
 885  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
 886  * @param s the source string
 887  * @return the modified set, for chaining
 888  */
 889 UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
 890     if (s.length() == 0) return *this;
 891     int32_t cp = getSingleCP(s);
 892     if (cp < 0) {
 893         if (!strings->contains((void*) &s)) {
 894             _add(s);
 895             pat.truncate(0);
 896         }
 897     } else {
 898         add((UChar32)cp, (UChar32)cp);
 899     }
 900     return *this;
 901 }
 902
 903 /**
 904  * Adds the given string, in order, to 'strings'.  The given string
 905  * must have been checked by the caller to not be empty and to not
 906  * already be in 'strings'.
 907  */
 908 void UnicodeSet::_add(const UnicodeString& s) {
 909     UnicodeString* t = new UnicodeString(s);
 910     UErrorCode ec = U_ZERO_ERROR;
 911     strings->sortedInsert(t, compareUnicodeString, ec);
 912 }
 913
 914 /**
 915  * @return a code point IF the string consists of a single one.
 916  * otherwise returns -1.
 917  * @param string to test
 918  */
 919 int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
 920     //if (s.length() < 1) {
 921     //    throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
 922     //}
 923     if (s.length() > 2) return -1;
 924     if (s.length() == 1) return s.charAt(0);
 925
 926     // at this point, len = 2
 927     UChar32 cp = s.char32At(0);
 928     if (cp > 0xFFFF) { // is surrogate pair
 929         return cp;
 930     }
 931     return -1;
 932 }
 933
 934 /**
 935  * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
 936  * If this set already any particular character, it has no effect on that character.
 937  * @param the source string
 938  * @return the modified set, for chaining
 939  */
 940 UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
 941     UChar32 cp;
 942     for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
 943         cp = s.char32At(i);
 944         add(cp, cp);
 945     }
 946     return *this;
 947 }
 948
 949 /**
 950  * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
 951  * If this set already any particular character, it has no effect on that character.
 952  * @param the source string
 953  * @return the modified set, for chaining
 954  */
 955 UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
 956     UnicodeSet set;
 957     set.addAll(s);
 958     retainAll(set);
 959     return *this;
 960 }
 961
 962 /**
 963  * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
 964  * If this set already any particular character, it has no effect on that character.
 965  * @param the source string
 966  * @return the modified set, for chaining
 967  */
 968 UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
 969     UnicodeSet set;
 970     set.addAll(s);
 971     complementAll(set);
 972     return *this;
 973 }
 974
 975 /**
 976  * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
 977  * If this set already any particular character, it has no effect on that character.
 978  * @param the source string
 979  * @return the modified set, for chaining
 980  */
 981 UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
 982     UnicodeSet set;
 983     set.addAll(s);
 984     removeAll(set);
 985     return *this;
 986 }
 987
 988 /**
 989  * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
 990  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
 991  * @param the source string
 992  * @return a newly created set containing the given string
 993  */
 994 UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
 995     UnicodeSet *set = new UnicodeSet();
 996     set->add(s);
 997     return set;
 998 }
 999
1000
1001 /**
1002  * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1003  * @param the source string
1004  * @return a newly created set containing the given characters
1005  */
1006 UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
1007     UnicodeSet *set = new UnicodeSet();
1008     set->addAll(s);
1009     return set;
1010 }
1011
1012 /**
1013  * Retain only the elements in this set that are contained in the
1014  * specified range.  If <code>end > start</code> then an empty range is
1015  * retained, leaving the set empty.
1016  *
1017  * @param start first character, inclusive, of range to be retained
1018  * to this set.
1019  * @param end last character, inclusive, of range to be retained
1020  * to this set.
1021  */
1022 UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
1023     if (pinCodePoint(start) <= pinCodePoint(end)) {
1024         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1025         retain(range, 2, 0);
1026     } else {
1027         clear();
1028     }
1029     return *this;
1030 }
1031
1032 UnicodeSet& UnicodeSet::retain(UChar32 c) {
1033     return retain(c, c);
1034 }
1035
1036 /**
1037  * Removes the specified range from this set if it is present.
1038  * The set will not contain the specified range once the call
1039  * returns.  If <code>end > start</code> then an empty range is
1040  * removed, leaving the set unchanged.
1041  *
1042  * @param start first character, inclusive, of range to be removed
1043  * from this set.
1044  * @param end last character, inclusive, of range to be removed
1045  * from this set.
1046  */
1047 UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
1048     if (pinCodePoint(start) <= pinCodePoint(end)) {
1049         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1050         retain(range, 2, 2);
1051     }
1052     return *this;
1053 }
1054
1055 /**
1056  * Removes the specified character from this set if it is present.
1057  * The set will not contain the specified range once the call
1058  * returns.
1059  */
1060 UnicodeSet& UnicodeSet::remove(UChar32 c) {
1061     return remove(c, c);
1062 }
1063
1064 /**
1065  * Removes the specified string from this set if it is present.
1066  * The set will not contain the specified character once the call
1067  * returns.
1068  * @param the source string
1069  * @return the modified set, for chaining
1070  */
1071 UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
1072     if (s.length() == 0) return *this;
1073     int32_t cp = getSingleCP(s);
1074     if (cp < 0) {
1075         strings->removeElement((void*) &s);
1076         pat.truncate(0);
1077     } else {
1078         remove((UChar32)cp, (UChar32)cp);
1079     }
1080     return *this;
1081 }
1082
1083 /**
1084  * Complements the specified range in this set.  Any character in
1085  * the range will be removed if it is in this set, or will be
1086  * added if it is not in this set.  If <code>end > start</code>
1087  * then an empty range is xor'ed, leaving the set unchanged.
1088  *
1089  * @param start first character, inclusive, of range to be removed
1090  * from this set.
1091  * @param end last character, inclusive, of range to be removed
1092  * from this set.
1093  */
1094 UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
1095     if (pinCodePoint(start) <= pinCodePoint(end)) {
1096         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1097         exclusiveOr(range, 2, 0);
1098     }
1099     pat.truncate(0);
1100     return *this;
1101 }
1102
1103 UnicodeSet& UnicodeSet::complement(UChar32 c) {
1104     return complement(c, c);
1105 }
1106
1107 /**
1108  * This is equivalent to
1109  * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1110  */
1111 UnicodeSet& UnicodeSet::complement(void) {
1112     if (list[0] == UNICODESET_LOW) {
1113         ensureBufferCapacity(len-1);
1114         uprv_memcpy(buffer, list + 1, (len-1)*sizeof(UChar32));
1115         --len;
1116     } else {
1117         ensureBufferCapacity(len+1);
1118         uprv_memcpy(buffer + 1, list, len*sizeof(UChar32));
1119         buffer[0] = UNICODESET_LOW;
1120         ++len;
1121     }
1122     swapBuffers();
1123     pat.truncate(0);
1124     return *this;
1125 }
1126
1127 /**
1128  * Complement the specified string in this set.
1129  * The set will not contain the specified string once the call
1130  * returns.
1131  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1132  * @param s the string to complement
1133  * @return this object, for chaining
1134  */
1135 UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
1136     if (s.length() == 0) return *this;
1137     int32_t cp = getSingleCP(s);
1138     if (cp < 0) {
1139         if (strings->contains((void*) &s)) {
1140             strings->removeElement((void*) &s);
1141         } else {
1142             _add(s);
1143         }
1144         pat.truncate(0);
1145     } else {
1146         complement((UChar32)cp, (UChar32)cp);
1147     }
1148     return *this;
1149 }
1150
1151 /**
1152  * Adds all of the elements in the specified set to this set if
1153  * they're not already present.  This operation effectively
1154  * modifies this set so that its value is the <i>union</i> of the two
1155  * sets.  The behavior of this operation is unspecified if the specified
1156  * collection is modified while the operation is in progress.
1157  *
1158  * @param c set whose elements are to be added to this set.
1159  * @see #add(char, char)
1160  */
1161 UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
1162     add(c.list, c.len, 0);
1163
1164     // Add strings in order
1165     for (int32_t i=0; i<c.strings->size(); ++i) {
1166         const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
1167         if (!strings->contains((void*) s)) {
1168             _add(*s);
1169         }
1170     }
1171     return *this;
1172 }
1173
1174 /**
1175  * Retains only the elements in this set that are contained in the
1176  * specified set.  In other words, removes from this set all of
1177  * its elements that are not contained in the specified set.  This
1178  * operation effectively modifies this set so that its value is
1179  * the <i>intersection</i> of the two sets.
1180  *
1181  * @param c set that defines which elements this set will retain.
1182  */
1183 UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
1184     retain(c.list, c.len, 0);
1185     strings->retainAll(*c.strings);
1186     return *this;
1187 }
1188
1189 /**
1190  * Removes from this set all of its elements that are contained in the
1191  * specified set.  This operation effectively modifies this
1192  * set so that its value is the <i>asymmetric set difference</i> of
1193  * the two sets.
1194  *
1195  * @param c set that defines which elements will be removed from
1196  *          this set.
1197  */
1198 UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
1199     retain(c.list, c.len, 2);
1200     strings->removeAll(*c.strings);
1201     return *this;
1202 }
1203
1204 /**
1205  * Complements in this set all elements contained in the specified
1206  * set.  Any character in the other set will be removed if it is
1207  * in this set, or will be added if it is not in this set.
1208  *
1209  * @param c set that defines which elements will be xor'ed from
1210  *          this set.
1211  */
1212 UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
1213     exclusiveOr(c.list, c.len, 0);
1214
1215     for (int32_t i=0; i<c.strings->size(); ++i) {
1216         void* e = c.strings->elementAt(i);
1217         if (!strings->removeElement(e)) {
1218             _add(*(const UnicodeString*)e);
1219         }
1220     }
1221     return *this;
1222 }
1223
1224 /**
1225  * Removes all of the elements from this set.  This set will be
1226  * empty after this call returns.
1227  */
1228 UnicodeSet& UnicodeSet::clear(void) {
1229     list[0] = UNICODESET_HIGH;
1230     len = 1;
1231     pat.truncate(0);
1232     strings->removeAllElements();
1233     return *this;
1234 }
1235
1236 /**
1237  * Iteration method that returns the number of ranges contained in
1238  * this set.
1239  * @see #getRangeStart
1240  * @see #getRangeEnd
1241  */
1242 int32_t UnicodeSet::getRangeCount() const {
1243     return len/2;
1244 }
1245
1246 /**
1247  * Iteration method that returns the first character in the
1248  * specified range of this set.
1249  * @see #getRangeCount
1250  * @see #getRangeEnd
1251  */
1252 UChar32 UnicodeSet::getRangeStart(int32_t index) const {
1253     return list[index*2];
1254 }
1255
1256 /**
1257  * Iteration method that returns the last character in the
1258  * specified range of this set.
1259  * @see #getRangeStart
1260  * @see #getRangeEnd
1261  */
1262 UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
1263     return list[index*2 + 1] - 1;
1264 }
1265
1266 int32_t UnicodeSet::getStringCount() const {
1267     return strings->size();
1268 }
1269
1270 const UnicodeString* UnicodeSet::getString(int32_t index) const {
1271     return (const UnicodeString*) strings->elementAt(index);
1272 }
1273
1274 /**
1275  * Reallocate this objects internal structures to take up the least
1276  * possible space, without changing this object's value.
1277  */
1278 UnicodeSet& UnicodeSet::compact() {
1279     if (len != capacity) {
1280         capacity = len;
1281         UChar32* temp = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
1282         uprv_memcpy(temp, list, len*sizeof(UChar32));
1283         uprv_free(list);
1284         list = temp;
1285     }
1286     uprv_free(buffer);
1287     buffer = NULL;
1288     return *this;
1289 }
1290
1291 int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
1292     int32_t bmpLength, length, destLength;
1293
1294     if (U_FAILURE(ec)) {
1295         return 0;
1296     }
1297
1298     if (destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1299         ec=U_ILLEGAL_ARGUMENT_ERROR;
1300         return 0;
1301     }
1302
1303     /* count necessary 16-bit units */
1304     length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
1305     // assert(length>=0);
1306     if (length==0) {
1307         /* empty set */
1308         if (destCapacity>0) {
1309             *dest=0;
1310         } else {
1311             ec=U_BUFFER_OVERFLOW_ERROR;
1312         }
1313         return 1;
1314     }
1315     /* now length>0 */
1316
1317     if (this->list[length-1]<=0xffff) {
1318         /* all BMP */
1319         bmpLength=length;
1320     } else if (this->list[0]>=0x10000) {
1321         /* all supplementary */
1322         bmpLength=0;
1323         length*=2;
1324     } else {
1325         /* some BMP, some supplementary */
1326         for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
1327         length=bmpLength+2*(length-bmpLength);
1328     }
1329
1330     /* length: number of 16-bit array units */
1331     if (length>0x7fff) {
1332         /* there are only 15 bits for the length in the first serialized word */
1333         ec=U_INDEX_OUTOFBOUNDS_ERROR;
1334         return 0;
1335     }
1336
1337     /*
1338      * total serialized length:
1339      * number of 16-bit array units (length) +
1340      * 1 length unit (always) +
1341      * 1 bmpLength unit (if there are supplementary values)
1342      */
1343     destLength=length+((length>bmpLength)?2:1);
1344     if (destLength<=destCapacity) {
1345         const UChar32 *p;
1346         int32_t i;
1347
1348         *dest=(uint16_t)length;
1349         if (length>bmpLength) {
1350             *dest|=0x8000;
1351             *++dest=(uint16_t)bmpLength;
1352         }
1353         ++dest;
1354
1355         /* write the BMP part of the array */
1356         p=this->list;
1357         for (i=0; i<bmpLength; ++i) {
1358             *dest++=(uint16_t)*p++;
1359         }
1360
1361         /* write the supplementary part of the array */
1362         for (; i<length; i+=2) {
1363             *dest++=(uint16_t)(*p>>16);
1364             *dest++=(uint16_t)*p++;
1365         }
1366     } else {
1367         ec=U_BUFFER_OVERFLOW_ERROR;
1368     }
1369     return destLength;
1370 }
1371
1372 //----------------------------------------------------------------
1373 // Implementation: Utility methods
1374 //----------------------------------------------------------------
1375
1376 /**
1377  * Allocate our strings vector and return TRUE if successful.
1378  */
1379 UBool UnicodeSet::allocateStrings() {
1380     UErrorCode ec = U_ZERO_ERROR;
1381     strings = new UVector(uhash_deleteUnicodeString,
1382                           uhash_compareUnicodeString, ec);
1383     if (U_FAILURE(ec)) {
1384         delete strings;
1385         strings = NULL;
1386         return FALSE;
1387     }
1388     return TRUE;
1389 }
1390
1391 void UnicodeSet::ensureCapacity(int32_t newLen) {
1392     if (newLen <= capacity)
1393         return;
1394     capacity = newLen + GROW_EXTRA;
1395     UChar32* temp = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
1396     uprv_memcpy(temp, list, len*sizeof(UChar32));
1397     uprv_free(list);
1398     list = temp;
1399 }
1400
1401 void UnicodeSet::ensureBufferCapacity(int32_t newLen) {
1402     if (buffer != NULL && newLen <= bufferCapacity)
1403         return;
1404     if (buffer) {
1405         uprv_free(buffer);
1406     }
1407     bufferCapacity = newLen + GROW_EXTRA;
1408     buffer = (UChar32*) uprv_malloc(sizeof(UChar32) * bufferCapacity);
1409 }
1410
1411 /**
1412  * Swap list and buffer.
1413  */
1414 void UnicodeSet::swapBuffers(void) {
1415     // swap list and buffer
1416     UChar32* temp = list;
1417     list = buffer;
1418     buffer = temp;
1419
1420     int32_t c = capacity;
1421     capacity = bufferCapacity;
1422     bufferCapacity = c;
1423 }
1424
1425 //----------------------------------------------------------------
1426 // Implementation: Fundamental operators
1427 //----------------------------------------------------------------
1428
1429 static inline UChar32 max(UChar32 a, UChar32 b) {
1430     return (a > b) ? a : b;
1431 }
1432
1433 // polarity = 0, 3 is normal: x xor y
1434 // polarity = 1, 2: x xor ~y == x === y
1435
1436 void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
1437     ensureBufferCapacity(len + otherLen);
1438     int32_t i = 0, j = 0, k = 0;
1439     UChar32 a = list[i++];
1440     UChar32 b;
1441     if (polarity == 1 || polarity == 2) {
1442         b = UNICODESET_LOW;
1443         if (other[j] == UNICODESET_LOW) { // skip base if already LOW
1444             ++j;
1445             b = other[j];
1446         }
1447     } else {
1448         b = other[j++];
1449     }
1450     // simplest of all the routines
1451     // sort the values, discarding identicals!
1452     for (;;) {
1453         if (a < b) {
1454             buffer[k++] = a;
1455             a = list[i++];
1456         } else if (b < a) {
1457             buffer[k++] = b;
1458             b = other[j++];
1459         } else if (a != UNICODESET_HIGH) { // at this point, a == b
1460             // discard both values!
1461             a = list[i++];
1462             b = other[j++];
1463         } else { // DONE!
1464             buffer[k++] = UNICODESET_HIGH;
1465             len = k;
1466             break;
1467         }
1468     }
1469     swapBuffers();
1470     pat.truncate(0);
1471 }
1472
1473 // polarity = 0 is normal: x union y
1474 // polarity = 2: x union ~y
1475 // polarity = 1: ~x union y
1476 // polarity = 3: ~x union ~y
1477
1478 void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
1479     ensureBufferCapacity(len + otherLen);
1480     int32_t i = 0, j = 0, k = 0;
1481     UChar32 a = list[i++];
1482     UChar32 b = other[j++];
1483     // change from xor is that we have to check overlapping pairs
1484     // polarity bit 1 means a is second, bit 2 means b is.
1485     for (;;) {
1486         switch (polarity) {
1487           case 0: // both first; take lower if unequal
1488             if (a < b) { // take a
1489                 // Back up over overlapping ranges in buffer[]
1490                 if (k > 0 && a <= buffer[k-1]) {
1491                     // Pick latter end value in buffer[] vs. list[]
1492                     a = max(list[i], buffer[--k]);
1493                 } else {
1494                     // No overlap
1495                     buffer[k++] = a;
1496                     a = list[i];
1497                 }
1498                 i++; // Common if/else code factored out
1499                 polarity ^= 1;
1500             } else if (b < a) { // take b
1501                 if (k > 0 && b <= buffer[k-1]) {
1502                     b = max(other[j], buffer[--k]);
1503                 } else {
1504                     buffer[k++] = b;
1505                     b = other[j];
1506                 }
1507                 j++;
1508                 polarity ^= 2;
1509             } else { // a == b, take a, drop b
1510                 if (a == UNICODESET_HIGH) goto loop_end;
1511                 // This is symmetrical; it doesn't matter if
1512                 // we backtrack with a or b. - liu
1513                 if (k > 0 && a <= buffer[k-1]) {
1514                     a = max(list[i], buffer[--k]);
1515                 } else {
1516                     // No overlap
1517                     buffer[k++] = a;
1518                     a = list[i];
1519                 }
1520                 i++;
1521                 polarity ^= 1;
1522                 b = other[j++];
1523                 polarity ^= 2;
1524             }
1525             break;
1526           case 3: // both second; take higher if unequal, and drop other
1527             if (b <= a) { // take a
1528                 if (a == UNICODESET_HIGH) goto loop_end;
1529                 buffer[k++] = a;
1530             } else { // take b
1531                 if (b == UNICODESET_HIGH) goto loop_end;
1532                 buffer[k++] = b;
1533             }
1534             a = list[i++];
1535             polarity ^= 1;   // factored common code
1536             b = other[j++];
1537             polarity ^= 2;
1538             break;
1539           case 1: // a second, b first; if b < a, overlap
1540             if (a < b) { // no overlap, take a
1541                 buffer[k++] = a; a = list[i++]; polarity ^= 1;
1542             } else if (b < a) { // OVERLAP, drop b
1543                 b = other[j++];
1544                 polarity ^= 2;
1545             } else { // a == b, drop both!
1546                 if (a == UNICODESET_HIGH) goto loop_end;
1547                 a = list[i++];
1548                 polarity ^= 1;
1549                 b = other[j++];
1550                 polarity ^= 2;
1551             }
1552             break;
1553           case 2: // a first, b second; if a < b, overlap
1554             if (b < a) { // no overlap, take b
1555                 buffer[k++] = b;
1556                 b = other[j++];
1557                 polarity ^= 2;
1558             } else  if (a < b) { // OVERLAP, drop a
1559                 a = list[i++];
1560                 polarity ^= 1;
1561             } else { // a == b, drop both!
1562                 if (a == UNICODESET_HIGH) goto loop_end;
1563                 a = list[i++];
1564                 polarity ^= 1;
1565                 b = other[j++];
1566                 polarity ^= 2;
1567             }
1568             break;
1569         }
1570     }
1571  loop_end:
1572     buffer[k++] = UNICODESET_HIGH;    // terminate
1573     len = k;
1574     swapBuffers();
1575     pat.truncate(0);
1576 }
1577
1578 // polarity = 0 is normal: x intersect y
1579 // polarity = 2: x intersect ~y == set-minus
1580 // polarity = 1: ~x intersect y
1581 // polarity = 3: ~x intersect ~y
1582
1583 void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
1584     ensureBufferCapacity(len + otherLen);
1585     int32_t i = 0, j = 0, k = 0;
1586     UChar32 a = list[i++];
1587     UChar32 b = other[j++];
1588     // change from xor is that we have to check overlapping pairs
1589     // polarity bit 1 means a is second, bit 2 means b is.
1590     for (;;) {
1591         switch (polarity) {
1592           case 0: // both first; drop the smaller
1593             if (a < b) { // drop a
1594                 a = list[i++];
1595                 polarity ^= 1;
1596             } else if (b < a) { // drop b
1597                 b = other[j++];
1598                 polarity ^= 2;
1599             } else { // a == b, take one, drop other
1600                 if (a == UNICODESET_HIGH) goto loop_end;
1601                 buffer[k++] = a;
1602                 a = list[i++];
1603                 polarity ^= 1;
1604                 b = other[j++];
1605                 polarity ^= 2;
1606             }
1607             break;
1608           case 3: // both second; take lower if unequal
1609             if (a < b) { // take a
1610                 buffer[k++] = a;
1611                 a = list[i++];
1612                 polarity ^= 1;
1613             } else if (b < a) { // take b
1614                 buffer[k++] = b;
1615                 b = other[j++];
1616                 polarity ^= 2;
1617             } else { // a == b, take one, drop other
1618                 if (a == UNICODESET_HIGH) goto loop_end;
1619                 buffer[k++] = a;
1620                 a = list[i++];
1621                 polarity ^= 1;
1622                 b = other[j++];
1623                 polarity ^= 2;
1624             }
1625             break;
1626           case 1: // a second, b first;
1627             if (a < b) { // NO OVERLAP, drop a
1628                 a = list[i++];
1629                 polarity ^= 1;
1630             } else if (b < a) { // OVERLAP, take b
1631                 buffer[k++] = b;
1632                 b = other[j++];
1633                 polarity ^= 2;
1634             } else { // a == b, drop both!
1635                 if (a == UNICODESET_HIGH) goto loop_end;
1636                 a = list[i++];
1637                 polarity ^= 1;
1638                 b = other[j++];
1639                 polarity ^= 2;
1640             }
1641             break;
1642           case 2: // a first, b second; if a < b, overlap
1643             if (b < a) { // no overlap, drop b
1644                 b = other[j++];
1645                 polarity ^= 2;
1646             } else  if (a < b) { // OVERLAP, take a
1647                 buffer[k++] = a;
1648                 a = list[i++];
1649                 polarity ^= 1;
1650             } else { // a == b, drop both!
1651                 if (a == UNICODESET_HIGH) goto loop_end;
1652                 a = list[i++];
1653                 polarity ^= 1;
1654                 b = other[j++];
1655                 polarity ^= 2;
1656             }
1657             break;
1658         }
1659     }
1660  loop_end:
1661     buffer[k++] = UNICODESET_HIGH;    // terminate
1662     len = k;
1663     swapBuffers();
1664     pat.truncate(0);
1665 }
1666
1667 /**
1668  * Append the <code>toPattern()</code> representation of a
1669  * string to the given <code>StringBuffer</code>.
1670  */
1671 void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
1672 escapeUnprintable) {
1673     UChar32 cp;
1674     for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
1675         _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
1676     }
1677 }
1678
1679 /**
1680  * Append the <code>toPattern()</code> representation of a
1681  * character to the given <code>StringBuffer</code>.
1682  */
1683 void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
1684 escapeUnprintable) {
1685     if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1686         // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1687         // unprintable
1688         if (ICU_Utility::escapeUnprintable(buf, c)) {
1689             return;
1690         }
1691     }
1692     // Okay to let ':' pass through
1693     switch (c) {
1694     case SET_OPEN:
1695     case SET_CLOSE:
1696     case HYPHEN:
1697     case COMPLEMENT:
1698     case INTERSECTION:
1699     case BACKSLASH:
1700     case OPEN_BRACE:
1701     case CLOSE_BRACE:
1702     case COLON:
1703     case SymbolTable::SYMBOL_REF:
1704         buf.append(BACKSLASH);
1705         break;
1706     default:
1707         // Escape whitespace
1708         if (uprv_isRuleWhiteSpace(c)) {
1709             buf.append(BACKSLASH);
1710         }
1711         break;
1712     }
1713     buf.append(c);
1714 }
1715
1716 /**
1717  * Append a string representation of this set to result.  This will be
1718  * a cleaned version of the string passed to applyPattern(), if there
1719  * is one.  Otherwise it will be generated.
1720  */
1721 UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
1722                                       UBool escapeUnprintable) const {
1723     if (pat.length() > 0) {
1724         int32_t i;
1725         int32_t backslashCount = 0;
1726         for (i=0; i<pat.length(); ) {
1727             UChar32 c = pat.char32At(i);
1728             i += UTF_CHAR_LENGTH(c);
1729             if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1730                 // If the unprintable character is preceded by an odd
1731                 // number of backslashes, then it has been escaped.
1732                 // Before unescaping it, we delete the final
1733                 // backslash.
1734                 if ((backslashCount % 2) == 1) {
1735                     result.truncate(result.length() - 1);
1736                 }
1737                 ICU_Utility::escapeUnprintable(result, c);
1738                 backslashCount = 0;
1739             } else {
1740                 result.append(c);
1741                 if (c == BACKSLASH) {
1742                     ++backslashCount;
1743                 } else {
1744                     backslashCount = 0;
1745                 }
1746             }
1747         }
1748         return result;
1749     }
1750
1751     return _generatePattern(result, escapeUnprintable);
1752 }
1753
1754 /**
1755  * Returns a string representation of this set.  If the result of
1756  * calling this function is passed to a UnicodeSet constructor, it
1757  * will produce another set that is equal to this one.
1758  */
1759 UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
1760                                      UBool escapeUnprintable) const {
1761     result.truncate(0);
1762     return _toPattern(result, escapeUnprintable);
1763 }
1764
1765 /**
1766  * Generate and append a string representation of this set to result.
1767  * This does not use this.pat, the cleaned up copy of the string
1768  * passed to applyPattern().
1769  */
1770 UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
1771                                             UBool escapeUnprintable) const {
1772     result.append(SET_OPEN);
1773
1774 //  // Check against the predefined categories.  We implicitly build
1775 //  // up ALL category sets the first time toPattern() is called.
1776 //  for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
1777 //      if (*this == getCategorySet(cat)) {
1778 //          result.append(COLON);
1779 //          result.append(CATEGORY_NAMES, cat*2, 2);
1780 //          return result.append(CATEGORY_CLOSE);
1781 //      }
1782 //  }
1783
1784     int32_t count = getRangeCount();
1785
1786     // If the set contains at least 2 intervals and includes both
1787     // MIN_VALUE and MAX_VALUE, then the inverse representation will
1788     // be more economical.
1789     if (count > 1 &&
1790         getRangeStart(0) == MIN_VALUE &&
1791         getRangeEnd(count-1) == MAX_VALUE) {
1792
1793         // Emit the inverse
1794         result.append(COMPLEMENT);
1795
1796         for (int32_t i = 1; i < count; ++i) {
1797             UChar32 start = getRangeEnd(i-1)+1;
1798             UChar32 end = getRangeStart(i)-1;
1799             _appendToPat(result, start, escapeUnprintable);
1800             if (start != end) {
1801                 if ((start+1) != end) {
1802                     result.append(HYPHEN);
1803                 }
1804                 _appendToPat(result, end, escapeUnprintable);
1805             }
1806         }
1807     }
1808
1809     // Default; emit the ranges as pairs
1810     else {
1811         for (int32_t i = 0; i < count; ++i) {
1812             UChar32 start = getRangeStart(i);
1813             UChar32 end = getRangeEnd(i);
1814             _appendToPat(result, start, escapeUnprintable);
1815             if (start != end) {
1816                 if ((start+1) != end) {
1817                     result.append(HYPHEN);
1818                 }
1819                 _appendToPat(result, end, escapeUnprintable);
1820             }
1821         }
1822     }
1823
1824     for (int32_t i = 0; i<strings->size(); ++i) {
1825         result.append(OPEN_BRACE);
1826         _appendToPat(result,
1827                      *(const UnicodeString*) strings->elementAt(i),
1828                      escapeUnprintable);
1829         result.append(CLOSE_BRACE);
1830     }
1831     return result.append(SET_CLOSE);
1832 }
1833
1834
1835 U_NAMESPACE_END