icuSources/common/uniset.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 1999-2006, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   Date        Name        Description
   7 *   10/20/99    alan        Creation.
   8 **********************************************************************
   9 */
  10
  11 #include "unicode/utypes.h"
  12 #include "unicode/uniset.h"
  13 #include "unicode/parsepos.h"
  14 #include "unicode/symtable.h"
  15 #include "ruleiter.h"
  16 #include "cmemory.h"
  17 #include "uhash.h"
  18 #include "util.h"
  19 #include "uvector.h"
  20 #include "charstr.h"
  21 #include "ustrfmt.h"
  22 #include "mutex.h"
  23 #include "uassert.h"
  24 #include "hash.h"
  25
  26 // Define UChar constants using hex for EBCDIC compatibility
  27 // Used #define to reduce private static exports and memory access time.
  28 #define SET_OPEN        ((UChar)0x005B) /*[*/
  29 #define SET_CLOSE       ((UChar)0x005D) /*]*/
  30 #define HYPHEN          ((UChar)0x002D) /*-*/
  31 #define COMPLEMENT      ((UChar)0x005E) /*^*/
  32 #define COLON           ((UChar)0x003A) /*:*/
  33 #define BACKSLASH       ((UChar)0x005C) /*\*/
  34 #define INTERSECTION    ((UChar)0x0026) /*&*/
  35 #define UPPER_U         ((UChar)0x0055) /*U*/
  36 #define LOWER_U         ((UChar)0x0075) /*u*/
  37 #define OPEN_BRACE      ((UChar)123)    /*{*/
  38 #define CLOSE_BRACE     ((UChar)125)    /*}*/
  39 #define UPPER_P         ((UChar)0x0050) /*P*/
  40 #define LOWER_P         ((UChar)0x0070) /*p*/
  41 #define UPPER_N         ((UChar)78)     /*N*/
  42 #define EQUALS          ((UChar)0x003D) /*=*/
  43
  44 // HIGH_VALUE > all valid values. 110000 for codepoints
  45 #define UNICODESET_HIGH 0x0110000
  46
  47 // LOW <= all valid values. ZERO for codepoints
  48 #define UNICODESET_LOW 0x000000
  49
  50 // initial storage. Must be >= 0
  51 #define START_EXTRA 16
  52
  53 // extra amount for growth. Must be >= 0
  54 #define GROW_EXTRA START_EXTRA
  55
  56 U_NAMESPACE_BEGIN
  57
  58 SymbolTable::~SymbolTable() {}
  59
  60 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
  61
  62 /**
  63  * Modify the given UChar32 variable so that it is in range, by
  64  * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
  65  * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
  66  * It modifies its argument in-place and also returns it.
  67  */
  68 static inline UChar32 pinCodePoint(UChar32& c) {
  69     if (c < UNICODESET_LOW) {
  70         c = UNICODESET_LOW;
  71     } else if (c > (UNICODESET_HIGH-1)) {
  72         c = (UNICODESET_HIGH-1);
  73     }
  74     return c;
  75 }
  76
  77 //----------------------------------------------------------------
  78 // Debugging
  79 //----------------------------------------------------------------
  80
  81 // DO NOT DELETE THIS CODE.  This code is used to debug memory leaks.
  82 // To enable the debugging, define the symbol DEBUG_MEM in the line
  83 // below.  This will result in text being sent to stdout that looks
  84 // like this:
  85 //   DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
  86 //   DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
  87 // Each line lists a construction (ct) or destruction (dt) event, the
  88 // object address, the number of outstanding objects after the event,
  89 // and the pattern of the object in question.
  90
  91 // #define DEBUG_MEM
  92
  93 #ifdef DEBUG_MEM
  94 #include <stdio.h>
  95 static int32_t _dbgCount = 0;
  96
  97 static inline void _dbgct(UnicodeSet* set) {
  98     UnicodeString str;
  99     set->toPattern(str, TRUE);
 100     char buf[40];
 101     str.extract(0, 39, buf, "");
 102     printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
 103 }
 104
 105 static inline void _dbgdt(UnicodeSet* set) {
 106     UnicodeString str;
 107     set->toPattern(str, TRUE);
 108     char buf[40];
 109     str.extract(0, 39, buf, "");
 110     printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
 111 }
 112
 113 #else
 114
 115 #define _dbgct(set)
 116 #define _dbgdt(set)
 117
 118 #endif
 119
 120 //----------------------------------------------------------------
 121 // UnicodeString in UVector support
 122 //----------------------------------------------------------------
 123
 124 static void U_CALLCONV cloneUnicodeString(UHashTok *dst, UHashTok *src) {
 125     dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
 126 }
 127
 128 static int8_t U_CALLCONV compareUnicodeString(UHashTok t1, UHashTok t2) {
 129     const UnicodeString &a = *(const UnicodeString*)t1.pointer;
 130     const UnicodeString &b = *(const UnicodeString*)t2.pointer;
 131     return a.compare(b);
 132 }
 133
 134 //----------------------------------------------------------------
 135 // Constructors &c
 136 //----------------------------------------------------------------
 137
 138 /**
 139  * Constructs an empty set.
 140  */
 141 UnicodeSet::UnicodeSet() :
 142     len(1), capacity(1 + START_EXTRA), bufferCapacity(0),
 143     list(0), buffer(0), strings(0)
 144 {
 145     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 146     if(list!=NULL){
 147         list[0] = UNICODESET_HIGH;
 148     }
 149     allocateStrings();
 150     _dbgct(this);
 151 }
 152
 153 /**
 154  * Constructs a set containing the given range. If <code>end >
 155  * start</code> then an empty set is created.
 156  *
 157  * @param start first character, inclusive, of range
 158  * @param end last character, inclusive, of range
 159  */
 160 UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
 161     len(1), capacity(1 + START_EXTRA), bufferCapacity(0),
 162     list(0), buffer(0), strings(0)
 163 {
 164     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 165     if(list!=NULL){
 166         list[0] = UNICODESET_HIGH;
 167     }
 168     allocateStrings();
 169     complement(start, end);
 170     _dbgct(this);
 171 }
 172
 173 /**
 174  * Constructs a set that is identical to the given UnicodeSet.
 175  */
 176 UnicodeSet::UnicodeSet(const UnicodeSet& o) :
 177     UnicodeFilter(o),
 178     len(0), capacity(o.len + GROW_EXTRA), bufferCapacity(0),
 179     list(0), buffer(0), strings(0)
 180 {
 181     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 182     if(list!=NULL){
 183         allocateStrings();
 184         *this = o;
 185     }
 186     _dbgct(this);
 187 }
 188
 189 /**
 190  * Destructs the set.
 191  */
 192 UnicodeSet::~UnicodeSet() {
 193     _dbgdt(this); // first!
 194     uprv_free(list);
 195     if (buffer) {
 196         uprv_free(buffer);
 197     }
 198     delete strings;
 199 }
 200
 201 /**
 202  * Assigns this object to be a copy of another.
 203  */
 204 UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
 205     ensureCapacity(o.len);
 206     len = o.len;
 207     uprv_memcpy(list, o.list, len*sizeof(UChar32));
 208     UErrorCode ec = U_ZERO_ERROR;
 209     strings->assign(*o.strings, cloneUnicodeString, ec);
 210     pat = o.pat;
 211     return *this;
 212 }
 213
 214 /**
 215  * Compares the specified object with this set for equality.  Returns
 216  * <tt>true</tt> if the two sets
 217  * have the same size, and every member of the specified set is
 218  * contained in this set (or equivalently, every member of this set is
 219  * contained in the specified set).
 220  *
 221  * @param o set to be compared for equality with this set.
 222  * @return <tt>true</tt> if the specified set is equal to this set.
 223  */
 224 UBool UnicodeSet::operator==(const UnicodeSet& o) const {
 225     if (len != o.len) return FALSE;
 226     for (int32_t i = 0; i < len; ++i) {
 227         if (list[i] != o.list[i]) return FALSE;
 228     }
 229     if (*strings != *o.strings) return FALSE;
 230     return TRUE;
 231 }
 232
 233 /**
 234  * Returns a copy of this object.  All UnicodeMatcher objects have
 235  * to support cloning in order to allow classes using
 236  * UnicodeMatchers, such as Transliterator, to implement cloning.
 237  */
 238 UnicodeFunctor* UnicodeSet::clone() const {
 239     return new UnicodeSet(*this);
 240 }
 241
 242 /**
 243  * Returns the hash code value for this set.
 244  *
 245  * @return the hash code value for this set.
 246  * @see Object#hashCode()
 247  */
 248 int32_t UnicodeSet::hashCode(void) const {
 249     int32_t result = len;
 250     for (int32_t i = 0; i < len; ++i) {
 251         result *= 1000003;
 252         result += list[i];
 253     }
 254     return result;
 255 }
 256
 257 //----------------------------------------------------------------
 258 // Public API
 259 //----------------------------------------------------------------
 260
 261 /**
 262  * Make this object represent the range <code>start - end</code>.
 263  * If <code>end > start</code> then this object is set to an
 264  * an empty range.
 265  *
 266  * @param start first character in the set, inclusive
 267  * @rparam end last character in the set, inclusive
 268  */
 269 UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
 270     clear();
 271     complement(start, end);
 272     return *this;
 273 }
 274
 275 /**
 276  * Returns the number of elements in this set (its cardinality),
 277  * Note than the elements of a set may include both individual
 278  * codepoints and strings.
 279  *
 280  * @return the number of elements in this set (its cardinality).
 281  */
 282 int32_t UnicodeSet::size(void) const {
 283     int32_t n = 0;
 284     int32_t count = getRangeCount();
 285     for (int32_t i = 0; i < count; ++i) {
 286         n += getRangeEnd(i) - getRangeStart(i) + 1;
 287     }
 288     return n + strings->size();
 289 }
 290
 291 /**
 292  * Returns <tt>true</tt> if this set contains no elements.
 293  *
 294  * @return <tt>true</tt> if this set contains no elements.
 295  */
 296 UBool UnicodeSet::isEmpty(void) const {
 297     return len == 1 && strings->size() == 0;
 298 }
 299
 300 /**
 301  * Returns true if this set contains the given character.
 302  * @param c character to be checked for containment
 303  * @return true if the test condition is met
 304  */
 305 UBool UnicodeSet::contains(UChar32 c) const {
 306     // Set i to the index of the start item greater than ch
 307     // We know we will terminate without length test!
 308     // LATER: for large sets, add binary search
 309     //int32_t i = -1;
 310     //for (;;) {
 311     //    if (c < list[++i]) break;
 312     //}
 313     if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
 314         return FALSE;
 315     }
 316     int32_t i = findCodePoint(c);
 317     return ((i & 1) != 0); // return true if odd
 318 }
 319
 320 /**
 321  * Returns the smallest value i such that c < list[i].  Caller
 322  * must ensure that c is a legal value or this method will enter
 323  * an infinite loop.  This method performs a binary search.
 324  * @param c a character in the range MIN_VALUE..MAX_VALUE
 325  * inclusive
 326  * @return the smallest integer i in the range 0..len-1,
 327  * inclusive, such that c < list[i]
 328  */
 329 int32_t UnicodeSet::findCodePoint(UChar32 c) const {
 330     /* Examples:
 331                                        findCodePoint(c)
 332        set              list[]         c=0 1 3 4 7 8
 333        ===              ==============   ===========
 334        []               [110000]         0 0 0 0 0 0
 335        [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
 336        [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
 337        [:Any:]          [0, 110000]      1 1 1 1 1 1
 338      */
 339
 340     // Return the smallest i such that c < list[i].  Assume
 341     // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
 342     if (c < list[0])
 343         return 0;
 344     // High runner test.  c is often after the last range, so an
 345     // initial check for this condition pays off.
 346     if (len >= 2 && c >= list[len-2])
 347         return len-1;
 348     int32_t lo = 0;
 349     int32_t hi = len - 1;
 350     // invariant: c >= list[lo]
 351     // invariant: c < list[hi]
 352     for (;;) {
 353         int32_t i = (lo + hi) >> 1;
 354         if (i == lo) {
 355             break; // Found!
 356         } else if (c < list[i]) {
 357             hi = i;
 358         } else {
 359             lo = i;
 360         }
 361     }
 362     return hi;
 363 }
 364
 365 /**
 366  * Returns true if this set contains every character
 367  * of the given range.
 368  * @param start first character, inclusive, of the range
 369  * @param end last character, inclusive, of the range
 370  * @return true if the test condition is met
 371  */
 372 UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
 373     //int32_t i = -1;
 374     //for (;;) {
 375     //    if (start < list[++i]) break;
 376     //}
 377     int32_t i = findCodePoint(start);
 378     return ((i & 1) != 0 && end < list[i]);
 379 }
 380
 381 /**
 382  * Returns <tt>true</tt> if this set contains the given
 383  * multicharacter string.
 384  * @param s string to be checked for containment
 385  * @return <tt>true</tt> if this set contains the specified string
 386  */
 387 UBool UnicodeSet::contains(const UnicodeString& s) const {
 388     if (s.length() == 0) return FALSE;
 389     int32_t cp = getSingleCP(s);
 390     if (cp < 0) {
 391         return strings->contains((void*) &s);
 392     } else {
 393         return contains((UChar32) cp);
 394     }
 395 }
 396
 397 /**
 398  * Returns true if this set contains all the characters and strings
 399  * of the given set.
 400  * @param c set to be checked for containment
 401  * @return true if the test condition is met
 402  */
 403 UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
 404     // The specified set is a subset if all of its pairs are contained in
 405     // this set.  It's possible to code this more efficiently in terms of
 406     // direct manipulation of the inversion lists if the need arises.
 407     int32_t n = c.getRangeCount();
 408     for (int i=0; i<n; ++i) {
 409         if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
 410             return FALSE;
 411         }
 412     }
 413     if (!strings->containsAll(*c.strings)) return FALSE;
 414     return TRUE;
 415 }
 416
 417 /**
 418  * Returns true if this set contains all the characters
 419  * of the given string.
 420  * @param s string containing characters to be checked for containment
 421  * @return true if the test condition is met
 422  */
 423 UBool UnicodeSet::containsAll(const UnicodeString& s) const {
 424     UChar32 cp;
 425     for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
 426         cp = s.char32At(i);
 427         if (!contains(cp)) return FALSE;
 428     }
 429     return TRUE;
 430 }
 431
 432 /**
 433  * Returns true if this set contains none of the characters
 434  * of the given range.
 435  * @param start first character, inclusive, of the range
 436  * @param end last character, inclusive, of the range
 437  * @return true if the test condition is met
 438  */
 439 UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
 440     //int32_t i = -1;
 441     //for (;;) {
 442     //    if (start < list[++i]) break;
 443     //}
 444     int32_t i = findCodePoint(start);
 445     return ((i & 1) == 0 && end < list[i]);
 446 }
 447
 448 /**
 449  * Returns true if this set contains none of the characters and strings
 450  * of the given set.
 451  * @param c set to be checked for containment
 452  * @return true if the test condition is met
 453  */
 454 UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
 455     // The specified set is a subset if all of its pairs are contained in
 456     // this set.  It's possible to code this more efficiently in terms of
 457     // direct manipulation of the inversion lists if the need arises.
 458     int32_t n = c.getRangeCount();
 459     for (int32_t i=0; i<n; ++i) {
 460         if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
 461             return FALSE;
 462         }
 463     }
 464     if (!strings->containsNone(*c.strings)) return FALSE;
 465     return TRUE;
 466 }
 467
 468 /**
 469  * Returns true if this set contains none of the characters
 470  * of the given string.
 471  * @param s string containing characters to be checked for containment
 472  * @return true if the test condition is met
 473  */
 474 UBool UnicodeSet::containsNone(const UnicodeString& s) const {
 475     UChar32 cp;
 476     for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
 477         cp = s.char32At(i);
 478         if (contains(cp)) return FALSE;
 479     }
 480     return TRUE;
 481 }
 482
 483 /**
 484  * Returns <tt>true</tt> if this set contains any character whose low byte
 485  * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
 486  * indexing.
 487  */
 488 UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
 489     /* The index value v, in the range [0,255], is contained in this set if
 490      * it is contained in any pair of this set.  Pairs either have the high
 491      * bytes equal, or unequal.  If the high bytes are equal, then we have
 492      * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
 493      * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
 494      * Then v is contained if xx <= v || v <= yy.  (This is identical to the
 495      * time zone month containment logic.)
 496      */
 497     int32_t i;
 498     for (i=0; i<getRangeCount(); ++i) {
 499         UChar32 low = getRangeStart(i);
 500         UChar32 high = getRangeEnd(i);
 501         if ((low & ~0xFF) == (high & ~0xFF)) {
 502             if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
 503                 return TRUE;
 504             }
 505         } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
 506             return TRUE;
 507         }
 508     }
 509     if (strings->size() != 0) {
 510         for (i=0; i<strings->size(); ++i) {
 511             const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
 512             //if (s.length() == 0) {
 513             //    // Empty strings match everything
 514             //    return TRUE;
 515             //}
 516             // assert(s.length() != 0); // We enforce this elsewhere
 517             UChar32 c = s.char32At(0);
 518             if ((c & 0xFF) == v) {
 519                 return TRUE;
 520             }
 521         }
 522     }
 523     return FALSE;
 524 }
 525
 526 /**
 527  * Implementation of UnicodeMatcher::matches().  Always matches the
 528  * longest possible multichar string.
 529  */
 530 UMatchDegree UnicodeSet::matches(const Replaceable& text,
 531                                  int32_t& offset,
 532                                  int32_t limit,
 533                                  UBool incremental) {
 534     if (offset == limit) {
 535         // Strings, if any, have length != 0, so we don't worry
 536         // about them here.  If we ever allow zero-length strings
 537         // we much check for them here.
 538         if (contains(U_ETHER)) {
 539             return incremental ? U_PARTIAL_MATCH : U_MATCH;
 540         } else {
 541             return U_MISMATCH;
 542         }
 543     } else {
 544         if (strings->size() != 0) { // try strings first
 545
 546             // might separate forward and backward loops later
 547             // for now they are combined
 548
 549             // TODO Improve efficiency of this, at least in the forward
 550             // direction, if not in both.  In the forward direction we
 551             // can assume the strings are sorted.
 552
 553             int32_t i;
 554             UBool forward = offset < limit;
 555
 556             // firstChar is the leftmost char to match in the
 557             // forward direction or the rightmost char to match in
 558             // the reverse direction.
 559             UChar firstChar = text.charAt(offset);
 560
 561             // If there are multiple strings that can match we
 562             // return the longest match.
 563             int32_t highWaterLength = 0;
 564
 565             for (i=0; i<strings->size(); ++i) {
 566                 const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
 567
 568                 //if (trial.length() == 0) {
 569                 //    return U_MATCH; // null-string always matches
 570                 //}
 571                 // assert(trial.length() != 0); // We ensure this elsewhere
 572
 573                 UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
 574
 575                 // Strings are sorted, so we can optimize in the
 576                 // forward direction.
 577                 if (forward && c > firstChar) break;
 578                 if (c != firstChar) continue;
 579
 580                 int32_t matchLen = matchRest(text, offset, limit, trial);
 581
 582                 if (incremental) {
 583                     int32_t maxLen = forward ? limit-offset : offset-limit;
 584                     if (matchLen == maxLen) {
 585                         // We have successfully matched but only up to limit.
 586                         return U_PARTIAL_MATCH;
 587                     }
 588                 }
 589
 590                 if (matchLen == trial.length()) {
 591                     // We have successfully matched the whole string.
 592                     if (matchLen > highWaterLength) {
 593                         highWaterLength = matchLen;
 594                     }
 595                     // In the forward direction we know strings
 596                     // are sorted so we can bail early.
 597                     if (forward && matchLen < highWaterLength) {
 598                         break;
 599                     }
 600                     continue;
 601                 }
 602             }
 603
 604             // We've checked all strings without a partial match.
 605             // If we have full matches, return the longest one.
 606             if (highWaterLength != 0) {
 607                 offset += forward ? highWaterLength : -highWaterLength;
 608                 return U_MATCH;
 609             }
 610         }
 611         return UnicodeFilter::matches(text, offset, limit, incremental);
 612     }
 613 }
 614
 615 /**
 616  * Returns the longest match for s in text at the given position.
 617  * If limit > start then match forward from start+1 to limit
 618  * matching all characters except s.charAt(0).  If limit < start,
 619  * go backward starting from start-1 matching all characters
 620  * except s.charAt(s.length()-1).  This method assumes that the
 621  * first character, text.charAt(start), matches s, so it does not
 622  * check it.
 623  * @param text the text to match
 624  * @param start the first character to match.  In the forward
 625  * direction, text.charAt(start) is matched against s.charAt(0).
 626  * In the reverse direction, it is matched against
 627  * s.charAt(s.length()-1).
 628  * @param limit the limit offset for matching, either last+1 in
 629  * the forward direction, or last-1 in the reverse direction,
 630  * where last is the index of the last character to match.
 631  * @return If part of s matches up to the limit, return |limit -
 632  * start|.  If all of s matches before reaching the limit, return
 633  * s.length().  If there is a mismatch between s and text, return
 634  * 0
 635  */
 636 int32_t UnicodeSet::matchRest(const Replaceable& text,
 637                               int32_t start, int32_t limit,
 638                               const UnicodeString& s) {
 639     int32_t i;
 640     int32_t maxLen;
 641     int32_t slen = s.length();
 642     if (start < limit) {
 643         maxLen = limit - start;
 644         if (maxLen > slen) maxLen = slen;
 645         for (i = 1; i < maxLen; ++i) {
 646             if (text.charAt(start + i) != s.charAt(i)) return 0;
 647         }
 648     } else {
 649         maxLen = start - limit;
 650         if (maxLen > slen) maxLen = slen;
 651         --slen; // <=> slen = s.length() - 1;
 652         for (i = 1; i < maxLen; ++i) {
 653             if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
 654         }
 655     }
 656     return maxLen;
 657 }
 658
 659 /**
 660  * Implement of UnicodeMatcher
 661  */
 662 void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
 663     toUnionTo.addAll(*this);
 664 }
 665
 666 /**
 667  * Returns the index of the given character within this set, where
 668  * the set is ordered by ascending code point.  If the character
 669  * is not in this set, return -1.  The inverse of this method is
 670  * <code>charAt()</code>.
 671  * @return an index from 0..size()-1, or -1
 672  */
 673 int32_t UnicodeSet::indexOf(UChar32 c) const {
 674     if (c < MIN_VALUE || c > MAX_VALUE) {
 675         return -1;
 676     }
 677     int32_t i = 0;
 678     int32_t n = 0;
 679     for (;;) {
 680         UChar32 start = list[i++];
 681         if (c < start) {
 682             return -1;
 683         }
 684         UChar32 limit = list[i++];
 685         if (c < limit) {
 686             return n + c - start;
 687         }
 688         n += limit - start;
 689     }
 690 }
 691
 692 /**
 693  * Returns the character at the given index within this set, where
 694  * the set is ordered by ascending code point.  If the index is
 695  * out of range, return (UChar32)-1.  The inverse of this method is
 696  * <code>indexOf()</code>.
 697  * @param index an index from 0..size()-1
 698  * @return the character at the given index, or (UChar32)-1.
 699  */
 700 UChar32 UnicodeSet::charAt(int32_t index) const {
 701     if (index >= 0) {
 702         // len2 is the largest even integer <= len, that is, it is len
 703         // for even values and len-1 for odd values.  With odd values
 704         // the last entry is UNICODESET_HIGH.
 705         int32_t len2 = len & ~1;
 706         for (int32_t i=0; i < len2;) {
 707             UChar32 start = list[i++];
 708             int32_t count = list[i++] - start;
 709             if (index < count) {
 710                 return (UChar32)(start + index);
 711             }
 712             index -= count;
 713         }
 714     }
 715     return (UChar32)-1;
 716 }
 717
 718 /**
 719  * Adds the specified range to this set if it is not already
 720  * present.  If this set already contains the specified range,
 721  * the call leaves this set unchanged.  If <code>end > start</code>
 722  * then an empty range is added, leaving the set unchanged.
 723  *
 724  * @param start first character, inclusive, of range to be added
 725  * to this set.
 726  * @param end last character, inclusive, of range to be added
 727  * to this set.
 728  */
 729 UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
 730     if (pinCodePoint(start) < pinCodePoint(end)) {
 731         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
 732         add(range, 2, 0);
 733     } else if (start == end) {
 734         add(start);
 735     }
 736     return *this;
 737 }
 738
 739 // #define DEBUG_US_ADD
 740
 741 #ifdef DEBUG_US_ADD
 742 #include <stdio.h>
 743 void dump(UChar32 c) {
 744     if (c <= 0xFF) {
 745         printf("%c", (char)c);
 746     } else {
 747         printf("U+%04X", c);
 748     }
 749 }
 750 void dump(const UChar32* list, int32_t len) {
 751     printf("[");
 752     for (int32_t i=0; i<len; ++i) {
 753         if (i != 0) printf(", ");
 754         dump(list[i]);
 755     }
 756     printf("]");
 757 }
 758 #endif
 759
 760 /**
 761  * Adds the specified character to this set if it is not already
 762  * present.  If this set already contains the specified character,
 763  * the call leaves this set unchanged.
 764  */
 765 UnicodeSet& UnicodeSet::add(UChar32 c) {
 766     // find smallest i such that c < list[i]
 767     // if odd, then it is IN the set
 768     // if even, then it is OUT of the set
 769     int32_t i = findCodePoint(pinCodePoint(c));
 770
 771     // already in set?
 772     if ((i & 1) != 0) return *this;
 773
 774     // HIGH is 0x110000
 775     // assert(list[len-1] == HIGH);
 776
 777     // empty = [HIGH]
 778     // [start_0, limit_0, start_1, limit_1, HIGH]
 779
 780     // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
 781     //                             ^
 782     //                             list[i]
 783
 784     // i == 0 means c is before the first range
 785
 786 #ifdef DEBUG_US_ADD
 787     printf("Add of ");
 788     dump(c);
 789     printf(" found at %d", i);
 790     printf(": ");
 791     dump(list, len);
 792     printf(" => ");
 793 #endif
 794
 795     if (c == list[i]-1) {
 796         // c is before start of next range
 797         list[i] = c;
 798         // if we touched the HIGH mark, then add a new one
 799         if (c == (UNICODESET_HIGH - 1)) {
 800             ensureCapacity(len+1);
 801             list[len++] = UNICODESET_HIGH;
 802         }
 803         if (i > 0 && c == list[i-1]) {
 804             // collapse adjacent ranges
 805
 806             // [..., start_k-1, c, c, limit_k, ..., HIGH]
 807             //                     ^
 808             //                     list[i]
 809
 810             //for (int32_t k=i-1; k<len-2; ++k) {
 811             //    list[k] = list[k+2];
 812             //}
 813             UChar32* dst = list + i - 1;
 814             UChar32* src = dst + 2;
 815             UChar32* srclimit = list + len;
 816             while (src < srclimit) *(dst++) = *(src++);
 817
 818             len -= 2;
 819         }
 820     }
 821
 822     else if (i > 0 && c == list[i-1]) {
 823         // c is after end of prior range
 824         list[i-1]++;
 825         // no need to check for collapse here
 826     }
 827
 828     else {
 829         // At this point we know the new char is not adjacent to
 830         // any existing ranges, and it is not 10FFFF.
 831
 832
 833         // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
 834         //                             ^
 835         //                             list[i]
 836
 837         // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
 838         //                             ^
 839         //                             list[i]
 840
 841         ensureCapacity(len+2);
 842
 843         //for (int32_t k=len-1; k>=i; --k) {
 844         //    list[k+2] = list[k];
 845         //}
 846         UChar32* src = list + len;
 847         UChar32* dst = src + 2;
 848         UChar32* srclimit = list + i;
 849         while (src > srclimit) *(--dst) = *(--src);
 850
 851         list[i] = c;
 852         list[i+1] = c+1;
 853         len += 2;
 854     }
 855
 856 #ifdef DEBUG_US_ADD
 857     dump(list, len);
 858     printf("\n");
 859
 860     for (i=1; i<len; ++i) {
 861         if (list[i] <= list[i-1]) {
 862             // Corrupt array!
 863             printf("ERROR: list has been corrupted\n");
 864             exit(1);
 865         }
 866     }
 867 #endif
 868
 869     pat.truncate(0);
 870     return *this;
 871 }
 872
 873 /**
 874  * Adds the specified multicharacter to this set if it is not already
 875  * present.  If this set already contains the multicharacter,
 876  * the call leaves this set unchanged.
 877  * Thus "ch" => {"ch"}
 878  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
 879  * @param s the source string
 880  * @return the modified set, for chaining
 881  */
 882 UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
 883     if (s.length() == 0) return *this;
 884     int32_t cp = getSingleCP(s);
 885     if (cp < 0) {
 886         if (!strings->contains((void*) &s)) {
 887             _add(s);
 888             pat.truncate(0);
 889         }
 890     } else {
 891         add((UChar32)cp, (UChar32)cp);
 892     }
 893     return *this;
 894 }
 895
 896 /**
 897  * Adds the given string, in order, to 'strings'.  The given string
 898  * must have been checked by the caller to not be empty and to not
 899  * already be in 'strings'.
 900  */
 901 void UnicodeSet::_add(const UnicodeString& s) {
 902     UnicodeString* t = new UnicodeString(s);
 903     UErrorCode ec = U_ZERO_ERROR;
 904     strings->sortedInsert(t, compareUnicodeString, ec);
 905 }
 906
 907 /**
 908  * @return a code point IF the string consists of a single one.
 909  * otherwise returns -1.
 910  * @param string to test
 911  */
 912 int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
 913     //if (s.length() < 1) {
 914     //    throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
 915     //}
 916     if (s.length() > 2) return -1;
 917     if (s.length() == 1) return s.charAt(0);
 918
 919     // at this point, len = 2
 920     UChar32 cp = s.char32At(0);
 921     if (cp > 0xFFFF) { // is surrogate pair
 922         return cp;
 923     }
 924     return -1;
 925 }
 926
 927 /**
 928  * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
 929  * If this set already any particular character, it has no effect on that character.
 930  * @param the source string
 931  * @return the modified set, for chaining
 932  */
 933 UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
 934     UChar32 cp;
 935     for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
 936         cp = s.char32At(i);
 937         add(cp, cp);
 938     }
 939     return *this;
 940 }
 941
 942 /**
 943  * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
 944  * If this set already any particular character, it has no effect on that character.
 945  * @param the source string
 946  * @return the modified set, for chaining
 947  */
 948 UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
 949     UnicodeSet set;
 950     set.addAll(s);
 951     retainAll(set);
 952     return *this;
 953 }
 954
 955 /**
 956  * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
 957  * If this set already any particular character, it has no effect on that character.
 958  * @param the source string
 959  * @return the modified set, for chaining
 960  */
 961 UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
 962     UnicodeSet set;
 963     set.addAll(s);
 964     complementAll(set);
 965     return *this;
 966 }
 967
 968 /**
 969  * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
 970  * If this set already any particular character, it has no effect on that character.
 971  * @param the source string
 972  * @return the modified set, for chaining
 973  */
 974 UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
 975     UnicodeSet set;
 976     set.addAll(s);
 977     removeAll(set);
 978     return *this;
 979 }
 980
 981 /**
 982  * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
 983  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
 984  * @param the source string
 985  * @return a newly created set containing the given string
 986  */
 987 UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
 988     UnicodeSet *set = new UnicodeSet();
 989     set->add(s);
 990     return set;
 991 }
 992
 993
 994 /**
 995  * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
 996  * @param the source string
 997  * @return a newly created set containing the given characters
 998  */
 999 UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
1000     UnicodeSet *set = new UnicodeSet();
1001     set->addAll(s);
1002     return set;
1003 }
1004
1005 /**
1006  * Retain only the elements in this set that are contained in the
1007  * specified range.  If <code>end > start</code> then an empty range is
1008  * retained, leaving the set empty.
1009  *
1010  * @param start first character, inclusive, of range to be retained
1011  * to this set.
1012  * @param end last character, inclusive, of range to be retained
1013  * to this set.
1014  */
1015 UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
1016     if (pinCodePoint(start) <= pinCodePoint(end)) {
1017         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1018         retain(range, 2, 0);
1019     } else {
1020         clear();
1021     }
1022     return *this;
1023 }
1024
1025 UnicodeSet& UnicodeSet::retain(UChar32 c) {
1026     return retain(c, c);
1027 }
1028
1029 /**
1030  * Removes the specified range from this set if it is present.
1031  * The set will not contain the specified range once the call
1032  * returns.  If <code>end > start</code> then an empty range is
1033  * removed, leaving the set unchanged.
1034  *
1035  * @param start first character, inclusive, of range to be removed
1036  * from this set.
1037  * @param end last character, inclusive, of range to be removed
1038  * from this set.
1039  */
1040 UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
1041     if (pinCodePoint(start) <= pinCodePoint(end)) {
1042         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1043         retain(range, 2, 2);
1044     }
1045     return *this;
1046 }
1047
1048 /**
1049  * Removes the specified character from this set if it is present.
1050  * The set will not contain the specified range once the call
1051  * returns.
1052  */
1053 UnicodeSet& UnicodeSet::remove(UChar32 c) {
1054     return remove(c, c);
1055 }
1056
1057 /**
1058  * Removes the specified string from this set if it is present.
1059  * The set will not contain the specified character once the call
1060  * returns.
1061  * @param the source string
1062  * @return the modified set, for chaining
1063  */
1064 UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
1065     if (s.length() == 0) return *this;
1066     int32_t cp = getSingleCP(s);
1067     if (cp < 0) {
1068         strings->removeElement((void*) &s);
1069         pat.truncate(0);
1070     } else {
1071         remove((UChar32)cp, (UChar32)cp);
1072     }
1073     return *this;
1074 }
1075
1076 /**
1077  * Complements the specified range in this set.  Any character in
1078  * the range will be removed if it is in this set, or will be
1079  * added if it is not in this set.  If <code>end > start</code>
1080  * then an empty range is xor'ed, leaving the set unchanged.
1081  *
1082  * @param start first character, inclusive, of range to be removed
1083  * from this set.
1084  * @param end last character, inclusive, of range to be removed
1085  * from this set.
1086  */
1087 UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
1088     if (pinCodePoint(start) <= pinCodePoint(end)) {
1089         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1090         exclusiveOr(range, 2, 0);
1091     }
1092     pat.truncate(0);
1093     return *this;
1094 }
1095
1096 UnicodeSet& UnicodeSet::complement(UChar32 c) {
1097     return complement(c, c);
1098 }
1099
1100 /**
1101  * This is equivalent to
1102  * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1103  */
1104 UnicodeSet& UnicodeSet::complement(void) {
1105     if (list[0] == UNICODESET_LOW) {
1106         ensureBufferCapacity(len-1);
1107         uprv_memcpy(buffer, list + 1, (len-1)*sizeof(UChar32));
1108         --len;
1109     } else {
1110         ensureBufferCapacity(len+1);
1111         uprv_memcpy(buffer + 1, list, len*sizeof(UChar32));
1112         buffer[0] = UNICODESET_LOW;
1113         ++len;
1114     }
1115     swapBuffers();
1116     pat.truncate(0);
1117     return *this;
1118 }
1119
1120 /**
1121  * Complement the specified string in this set.
1122  * The set will not contain the specified string once the call
1123  * returns.
1124  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1125  * @param s the string to complement
1126  * @return this object, for chaining
1127  */
1128 UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
1129     if (s.length() == 0) return *this;
1130     int32_t cp = getSingleCP(s);
1131     if (cp < 0) {
1132         if (strings->contains((void*) &s)) {
1133             strings->removeElement((void*) &s);
1134         } else {
1135             _add(s);
1136         }
1137         pat.truncate(0);
1138     } else {
1139         complement((UChar32)cp, (UChar32)cp);
1140     }
1141     return *this;
1142 }
1143
1144 /**
1145  * Adds all of the elements in the specified set to this set if
1146  * they're not already present.  This operation effectively
1147  * modifies this set so that its value is the <i>union</i> of the two
1148  * sets.  The behavior of this operation is unspecified if the specified
1149  * collection is modified while the operation is in progress.
1150  *
1151  * @param c set whose elements are to be added to this set.
1152  * @see #add(char, char)
1153  */
1154 UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
1155     add(c.list, c.len, 0);
1156
1157     // Add strings in order
1158     for (int32_t i=0; i<c.strings->size(); ++i) {
1159         const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
1160         if (!strings->contains((void*) s)) {
1161             _add(*s);
1162         }
1163     }
1164     return *this;
1165 }
1166
1167 /**
1168  * Retains only the elements in this set that are contained in the
1169  * specified set.  In other words, removes from this set all of
1170  * its elements that are not contained in the specified set.  This
1171  * operation effectively modifies this set so that its value is
1172  * the <i>intersection</i> of the two sets.
1173  *
1174  * @param c set that defines which elements this set will retain.
1175  */
1176 UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
1177     retain(c.list, c.len, 0);
1178     strings->retainAll(*c.strings);
1179     return *this;
1180 }
1181
1182 /**
1183  * Removes from this set all of its elements that are contained in the
1184  * specified set.  This operation effectively modifies this
1185  * set so that its value is the <i>asymmetric set difference</i> of
1186  * the two sets.
1187  *
1188  * @param c set that defines which elements will be removed from
1189  *          this set.
1190  */
1191 UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
1192     retain(c.list, c.len, 2);
1193     strings->removeAll(*c.strings);
1194     return *this;
1195 }
1196
1197 /**
1198  * Complements in this set all elements contained in the specified
1199  * set.  Any character in the other set will be removed if it is
1200  * in this set, or will be added if it is not in this set.
1201  *
1202  * @param c set that defines which elements will be xor'ed from
1203  *          this set.
1204  */
1205 UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
1206     exclusiveOr(c.list, c.len, 0);
1207
1208     for (int32_t i=0; i<c.strings->size(); ++i) {
1209         void* e = c.strings->elementAt(i);
1210         if (!strings->removeElement(e)) {
1211             _add(*(const UnicodeString*)e);
1212         }
1213     }
1214     return *this;
1215 }
1216
1217 /**
1218  * Removes all of the elements from this set.  This set will be
1219  * empty after this call returns.
1220  */
1221 UnicodeSet& UnicodeSet::clear(void) {
1222     list[0] = UNICODESET_HIGH;
1223     len = 1;
1224     pat.truncate(0);
1225     strings->removeAllElements();
1226     return *this;
1227 }
1228
1229 /**
1230  * Iteration method that returns the number of ranges contained in
1231  * this set.
1232  * @see #getRangeStart
1233  * @see #getRangeEnd
1234  */
1235 int32_t UnicodeSet::getRangeCount() const {
1236     return len/2;
1237 }
1238
1239 /**
1240  * Iteration method that returns the first character in the
1241  * specified range of this set.
1242  * @see #getRangeCount
1243  * @see #getRangeEnd
1244  */
1245 UChar32 UnicodeSet::getRangeStart(int32_t index) const {
1246     return list[index*2];
1247 }
1248
1249 /**
1250  * Iteration method that returns the last character in the
1251  * specified range of this set.
1252  * @see #getRangeStart
1253  * @see #getRangeEnd
1254  */
1255 UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
1256     return list[index*2 + 1] - 1;
1257 }
1258
1259 int32_t UnicodeSet::getStringCount() const {
1260     return strings->size();
1261 }
1262
1263 const UnicodeString* UnicodeSet::getString(int32_t index) const {
1264     return (const UnicodeString*) strings->elementAt(index);
1265 }
1266
1267 /**
1268  * Reallocate this objects internal structures to take up the least
1269  * possible space, without changing this object's value.
1270  */
1271 UnicodeSet& UnicodeSet::compact() {
1272     if (len != capacity) {
1273         capacity = len;
1274         UChar32* temp = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
1275         uprv_memcpy(temp, list, len*sizeof(UChar32));
1276         uprv_free(list);
1277         list = temp;
1278     }
1279     uprv_free(buffer);
1280     buffer = NULL;
1281     return *this;
1282 }
1283
1284 int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
1285     int32_t bmpLength, length, destLength;
1286
1287     if (U_FAILURE(ec)) {
1288         return 0;
1289     }
1290
1291     if (destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1292         ec=U_ILLEGAL_ARGUMENT_ERROR;
1293         return 0;
1294     }
1295
1296     /* count necessary 16-bit units */
1297     length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
1298     // assert(length>=0);
1299     if (length==0) {
1300         /* empty set */
1301         if (destCapacity>0) {
1302             *dest=0;
1303         } else {
1304             ec=U_BUFFER_OVERFLOW_ERROR;
1305         }
1306         return 1;
1307     }
1308     /* now length>0 */
1309
1310     if (this->list[length-1]<=0xffff) {
1311         /* all BMP */
1312         bmpLength=length;
1313     } else if (this->list[0]>=0x10000) {
1314         /* all supplementary */
1315         bmpLength=0;
1316         length*=2;
1317     } else {
1318         /* some BMP, some supplementary */
1319         for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
1320         length=bmpLength+2*(length-bmpLength);
1321     }
1322
1323     /* length: number of 16-bit array units */
1324     if (length>0x7fff) {
1325         /* there are only 15 bits for the length in the first serialized word */
1326         ec=U_INDEX_OUTOFBOUNDS_ERROR;
1327         return 0;
1328     }
1329
1330     /*
1331      * total serialized length:
1332      * number of 16-bit array units (length) +
1333      * 1 length unit (always) +
1334      * 1 bmpLength unit (if there are supplementary values)
1335      */
1336     destLength=length+((length>bmpLength)?2:1);
1337     if (destLength<=destCapacity) {
1338         const UChar32 *p;
1339         int32_t i;
1340
1341         *dest=(uint16_t)length;
1342         if (length>bmpLength) {
1343             *dest|=0x8000;
1344             *++dest=(uint16_t)bmpLength;
1345         }
1346         ++dest;
1347
1348         /* write the BMP part of the array */
1349         p=this->list;
1350         for (i=0; i<bmpLength; ++i) {
1351             *dest++=(uint16_t)*p++;
1352         }
1353
1354         /* write the supplementary part of the array */
1355         for (; i<length; i+=2) {
1356             *dest++=(uint16_t)(*p>>16);
1357             *dest++=(uint16_t)*p++;
1358         }
1359     } else {
1360         ec=U_BUFFER_OVERFLOW_ERROR;
1361     }
1362     return destLength;
1363 }
1364
1365 //----------------------------------------------------------------
1366 // Implementation: Utility methods
1367 //----------------------------------------------------------------
1368
1369 /**
1370  * Allocate our strings vector and return TRUE if successful.
1371  */
1372 UBool UnicodeSet::allocateStrings() {
1373     UErrorCode ec = U_ZERO_ERROR;
1374     strings = new UVector(uhash_deleteUnicodeString,
1375                           uhash_compareUnicodeString, ec);
1376     if (U_FAILURE(ec)) {
1377         delete strings;
1378         strings = NULL;
1379         return FALSE;
1380     }
1381     return TRUE;
1382 }
1383
1384 void UnicodeSet::ensureCapacity(int32_t newLen) {
1385     if (newLen <= capacity)
1386         return;
1387     capacity = newLen + GROW_EXTRA;
1388     UChar32* temp = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
1389     uprv_memcpy(temp, list, len*sizeof(UChar32));
1390     uprv_free(list);
1391     list = temp;
1392 }
1393
1394 void UnicodeSet::ensureBufferCapacity(int32_t newLen) {
1395     if (buffer != NULL && newLen <= bufferCapacity)
1396         return;
1397     if (buffer) {
1398         uprv_free(buffer);
1399     }
1400     bufferCapacity = newLen + GROW_EXTRA;
1401     buffer = (UChar32*) uprv_malloc(sizeof(UChar32) * bufferCapacity);
1402 }
1403
1404 /**
1405  * Swap list and buffer.
1406  */
1407 void UnicodeSet::swapBuffers(void) {
1408     // swap list and buffer
1409     UChar32* temp = list;
1410     list = buffer;
1411     buffer = temp;
1412
1413     int32_t c = capacity;
1414     capacity = bufferCapacity;
1415     bufferCapacity = c;
1416 }
1417
1418 //----------------------------------------------------------------
1419 // Implementation: Fundamental operators
1420 //----------------------------------------------------------------
1421
1422 static inline UChar32 max(UChar32 a, UChar32 b) {
1423     return (a > b) ? a : b;
1424 }
1425
1426 // polarity = 0, 3 is normal: x xor y
1427 // polarity = 1, 2: x xor ~y == x === y
1428
1429 void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
1430     ensureBufferCapacity(len + otherLen);
1431     int32_t i = 0, j = 0, k = 0;
1432     UChar32 a = list[i++];
1433     UChar32 b;
1434     if (polarity == 1 || polarity == 2) {
1435         b = UNICODESET_LOW;
1436         if (other[j] == UNICODESET_LOW) { // skip base if already LOW
1437             ++j;
1438             b = other[j];
1439         }
1440     } else {
1441         b = other[j++];
1442     }
1443     // simplest of all the routines
1444     // sort the values, discarding identicals!
1445     for (;;) {
1446         if (a < b) {
1447             buffer[k++] = a;
1448             a = list[i++];
1449         } else if (b < a) {
1450             buffer[k++] = b;
1451             b = other[j++];
1452         } else if (a != UNICODESET_HIGH) { // at this point, a == b
1453             // discard both values!
1454             a = list[i++];
1455             b = other[j++];
1456         } else { // DONE!
1457             buffer[k++] = UNICODESET_HIGH;
1458             len = k;
1459             break;
1460         }
1461     }
1462     swapBuffers();
1463     pat.truncate(0);
1464 }
1465
1466 // polarity = 0 is normal: x union y
1467 // polarity = 2: x union ~y
1468 // polarity = 1: ~x union y
1469 // polarity = 3: ~x union ~y
1470
1471 void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
1472     ensureBufferCapacity(len + otherLen);
1473     int32_t i = 0, j = 0, k = 0;
1474     UChar32 a = list[i++];
1475     UChar32 b = other[j++];
1476     // change from xor is that we have to check overlapping pairs
1477     // polarity bit 1 means a is second, bit 2 means b is.
1478     for (;;) {
1479         switch (polarity) {
1480           case 0: // both first; take lower if unequal
1481             if (a < b) { // take a
1482                 // Back up over overlapping ranges in buffer[]
1483                 if (k > 0 && a <= buffer[k-1]) {
1484                     // Pick latter end value in buffer[] vs. list[]
1485                     a = max(list[i], buffer[--k]);
1486                 } else {
1487                     // No overlap
1488                     buffer[k++] = a;
1489                     a = list[i];
1490                 }
1491                 i++; // Common if/else code factored out
1492                 polarity ^= 1;
1493             } else if (b < a) { // take b
1494                 if (k > 0 && b <= buffer[k-1]) {
1495                     b = max(other[j], buffer[--k]);
1496                 } else {
1497                     buffer[k++] = b;
1498                     b = other[j];
1499                 }
1500                 j++;
1501                 polarity ^= 2;
1502             } else { // a == b, take a, drop b
1503                 if (a == UNICODESET_HIGH) goto loop_end;
1504                 // This is symmetrical; it doesn't matter if
1505                 // we backtrack with a or b. - liu
1506                 if (k > 0 && a <= buffer[k-1]) {
1507                     a = max(list[i], buffer[--k]);
1508                 } else {
1509                     // No overlap
1510                     buffer[k++] = a;
1511                     a = list[i];
1512                 }
1513                 i++;
1514                 polarity ^= 1;
1515                 b = other[j++];
1516                 polarity ^= 2;
1517             }
1518             break;
1519           case 3: // both second; take higher if unequal, and drop other
1520             if (b <= a) { // take a
1521                 if (a == UNICODESET_HIGH) goto loop_end;
1522                 buffer[k++] = a;
1523             } else { // take b
1524                 if (b == UNICODESET_HIGH) goto loop_end;
1525                 buffer[k++] = b;
1526             }
1527             a = list[i++];
1528             polarity ^= 1;   // factored common code
1529             b = other[j++];
1530             polarity ^= 2;
1531             break;
1532           case 1: // a second, b first; if b < a, overlap
1533             if (a < b) { // no overlap, take a
1534                 buffer[k++] = a; a = list[i++]; polarity ^= 1;
1535             } else if (b < a) { // OVERLAP, drop b
1536                 b = other[j++];
1537                 polarity ^= 2;
1538             } else { // a == b, drop both!
1539                 if (a == UNICODESET_HIGH) goto loop_end;
1540                 a = list[i++];
1541                 polarity ^= 1;
1542                 b = other[j++];
1543                 polarity ^= 2;
1544             }
1545             break;
1546           case 2: // a first, b second; if a < b, overlap
1547             if (b < a) { // no overlap, take b
1548                 buffer[k++] = b;
1549                 b = other[j++];
1550                 polarity ^= 2;
1551             } else  if (a < b) { // OVERLAP, drop a
1552                 a = list[i++];
1553                 polarity ^= 1;
1554             } else { // a == b, drop both!
1555                 if (a == UNICODESET_HIGH) goto loop_end;
1556                 a = list[i++];
1557                 polarity ^= 1;
1558                 b = other[j++];
1559                 polarity ^= 2;
1560             }
1561             break;
1562         }
1563     }
1564  loop_end:
1565     buffer[k++] = UNICODESET_HIGH;    // terminate
1566     len = k;
1567     swapBuffers();
1568     pat.truncate(0);
1569 }
1570
1571 // polarity = 0 is normal: x intersect y
1572 // polarity = 2: x intersect ~y == set-minus
1573 // polarity = 1: ~x intersect y
1574 // polarity = 3: ~x intersect ~y
1575
1576 void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
1577     ensureBufferCapacity(len + otherLen);
1578     int32_t i = 0, j = 0, k = 0;
1579     UChar32 a = list[i++];
1580     UChar32 b = other[j++];
1581     // change from xor is that we have to check overlapping pairs
1582     // polarity bit 1 means a is second, bit 2 means b is.
1583     for (;;) {
1584         switch (polarity) {
1585           case 0: // both first; drop the smaller
1586             if (a < b) { // drop a
1587                 a = list[i++];
1588                 polarity ^= 1;
1589             } else if (b < a) { // drop b
1590                 b = other[j++];
1591                 polarity ^= 2;
1592             } else { // a == b, take one, drop other
1593                 if (a == UNICODESET_HIGH) goto loop_end;
1594                 buffer[k++] = a;
1595                 a = list[i++];
1596                 polarity ^= 1;
1597                 b = other[j++];
1598                 polarity ^= 2;
1599             }
1600             break;
1601           case 3: // both second; take lower if unequal
1602             if (a < b) { // take a
1603                 buffer[k++] = a;
1604                 a = list[i++];
1605                 polarity ^= 1;
1606             } else if (b < a) { // take b
1607                 buffer[k++] = b;
1608                 b = other[j++];
1609                 polarity ^= 2;
1610             } else { // a == b, take one, drop other
1611                 if (a == UNICODESET_HIGH) goto loop_end;
1612                 buffer[k++] = a;
1613                 a = list[i++];
1614                 polarity ^= 1;
1615                 b = other[j++];
1616                 polarity ^= 2;
1617             }
1618             break;
1619           case 1: // a second, b first;
1620             if (a < b) { // NO OVERLAP, drop a
1621                 a = list[i++];
1622                 polarity ^= 1;
1623             } else if (b < a) { // OVERLAP, take b
1624                 buffer[k++] = b;
1625                 b = other[j++];
1626                 polarity ^= 2;
1627             } else { // a == b, drop both!
1628                 if (a == UNICODESET_HIGH) goto loop_end;
1629                 a = list[i++];
1630                 polarity ^= 1;
1631                 b = other[j++];
1632                 polarity ^= 2;
1633             }
1634             break;
1635           case 2: // a first, b second; if a < b, overlap
1636             if (b < a) { // no overlap, drop b
1637                 b = other[j++];
1638                 polarity ^= 2;
1639             } else  if (a < b) { // OVERLAP, take a
1640                 buffer[k++] = a;
1641                 a = list[i++];
1642                 polarity ^= 1;
1643             } else { // a == b, drop both!
1644                 if (a == UNICODESET_HIGH) goto loop_end;
1645                 a = list[i++];
1646                 polarity ^= 1;
1647                 b = other[j++];
1648                 polarity ^= 2;
1649             }
1650             break;
1651         }
1652     }
1653  loop_end:
1654     buffer[k++] = UNICODESET_HIGH;    // terminate
1655     len = k;
1656     swapBuffers();
1657     pat.truncate(0);
1658 }
1659
1660 /**
1661  * Append the <code>toPattern()</code> representation of a
1662  * string to the given <code>StringBuffer</code>.
1663  */
1664 void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
1665 escapeUnprintable) {
1666     UChar32 cp;
1667     for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
1668         _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
1669     }
1670 }
1671
1672 /**
1673  * Append the <code>toPattern()</code> representation of a
1674  * character to the given <code>StringBuffer</code>.
1675  */
1676 void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
1677 escapeUnprintable) {
1678     if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1679         // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1680         // unprintable
1681         if (ICU_Utility::escapeUnprintable(buf, c)) {
1682             return;
1683         }
1684     }
1685     // Okay to let ':' pass through
1686     switch (c) {
1687     case SET_OPEN:
1688     case SET_CLOSE:
1689     case HYPHEN:
1690     case COMPLEMENT:
1691     case INTERSECTION:
1692     case BACKSLASH:
1693     case OPEN_BRACE:
1694     case CLOSE_BRACE:
1695     case COLON:
1696     case SymbolTable::SYMBOL_REF:
1697         buf.append(BACKSLASH);
1698         break;
1699     default:
1700         // Escape whitespace
1701         if (uprv_isRuleWhiteSpace(c)) {
1702             buf.append(BACKSLASH);
1703         }
1704         break;
1705     }
1706     buf.append(c);
1707 }
1708
1709 /**
1710  * Append a string representation of this set to result.  This will be
1711  * a cleaned version of the string passed to applyPattern(), if there
1712  * is one.  Otherwise it will be generated.
1713  */
1714 UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
1715                                       UBool escapeUnprintable) const {
1716     if (pat.length() > 0) {
1717         int32_t i;
1718         int32_t backslashCount = 0;
1719         for (i=0; i<pat.length(); ) {
1720             UChar32 c = pat.char32At(i);
1721             i += UTF_CHAR_LENGTH(c);
1722             if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1723                 // If the unprintable character is preceded by an odd
1724                 // number of backslashes, then it has been escaped.
1725                 // Before unescaping it, we delete the final
1726                 // backslash.
1727                 if ((backslashCount % 2) == 1) {
1728                     result.truncate(result.length() - 1);
1729                 }
1730                 ICU_Utility::escapeUnprintable(result, c);
1731                 backslashCount = 0;
1732             } else {
1733                 result.append(c);
1734                 if (c == BACKSLASH) {
1735                     ++backslashCount;
1736                 } else {
1737                     backslashCount = 0;
1738                 }
1739             }
1740         }
1741         return result;
1742     }
1743
1744     return _generatePattern(result, escapeUnprintable);
1745 }
1746
1747 /**
1748  * Returns a string representation of this set.  If the result of
1749  * calling this function is passed to a UnicodeSet constructor, it
1750  * will produce another set that is equal to this one.
1751  */
1752 UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
1753                                      UBool escapeUnprintable) const {
1754     result.truncate(0);
1755     return _toPattern(result, escapeUnprintable);
1756 }
1757
1758 /**
1759  * Generate and append a string representation of this set to result.
1760  * This does not use this.pat, the cleaned up copy of the string
1761  * passed to applyPattern().
1762  */
1763 UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
1764                                             UBool escapeUnprintable) const {
1765     result.append(SET_OPEN);
1766
1767 //  // Check against the predefined categories.  We implicitly build
1768 //  // up ALL category sets the first time toPattern() is called.
1769 //  for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
1770 //      if (*this == getCategorySet(cat)) {
1771 //          result.append(COLON);
1772 //          result.append(CATEGORY_NAMES, cat*2, 2);
1773 //          return result.append(CATEGORY_CLOSE);
1774 //      }
1775 //  }
1776
1777     int32_t count = getRangeCount();
1778
1779     // If the set contains at least 2 intervals and includes both
1780     // MIN_VALUE and MAX_VALUE, then the inverse representation will
1781     // be more economical.
1782     if (count > 1 &&
1783         getRangeStart(0) == MIN_VALUE &&
1784         getRangeEnd(count-1) == MAX_VALUE) {
1785
1786         // Emit the inverse
1787         result.append(COMPLEMENT);
1788
1789         for (int32_t i = 1; i < count; ++i) {
1790             UChar32 start = getRangeEnd(i-1)+1;
1791             UChar32 end = getRangeStart(i)-1;
1792             _appendToPat(result, start, escapeUnprintable);
1793             if (start != end) {
1794                 if ((start+1) != end) {
1795                     result.append(HYPHEN);
1796                 }
1797                 _appendToPat(result, end, escapeUnprintable);
1798             }
1799         }
1800     }
1801
1802     // Default; emit the ranges as pairs
1803     else {
1804         for (int32_t i = 0; i < count; ++i) {
1805             UChar32 start = getRangeStart(i);
1806             UChar32 end = getRangeEnd(i);
1807             _appendToPat(result, start, escapeUnprintable);
1808             if (start != end) {
1809                 if ((start+1) != end) {
1810                     result.append(HYPHEN);
1811                 }
1812                 _appendToPat(result, end, escapeUnprintable);
1813             }
1814         }
1815     }
1816
1817     for (int32_t i = 0; i<strings->size(); ++i) {
1818         result.append(OPEN_BRACE);
1819         _appendToPat(result,
1820                      *(const UnicodeString*) strings->elementAt(i),
1821                      escapeUnprintable);
1822         result.append(CLOSE_BRACE);
1823     }
1824     return result.append(SET_CLOSE);
1825 }
1826
1827
1828 U_NAMESPACE_END