icuSources/common/uniset_props.cpp

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 1999-2008, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  uniset_props.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2004aug25
  14 *   created by: Markus W. Scherer
  15 *
  16 *   Character property dependent functions moved here from uniset.cpp
  17 */
  18
  19 #include "unicode/utypes.h"
  20 #include "unicode/uniset.h"
  21 #include "unicode/parsepos.h"
  22 #include "unicode/uchar.h"
  23 #include "unicode/uscript.h"
  24 #include "unicode/symtable.h"
  25 #include "unicode/uset.h"
  26 #include "unicode/locid.h"
  27 #include "unicode/brkiter.h"
  28 #include "uset_imp.h"
  29 #include "ruleiter.h"
  30 #include "cmemory.h"
  31 #include "ucln_cmn.h"
  32 #include "util.h"
  33 #include "uvector.h"
  34 #include "uprops.h"
  35 #include "propname.h"
  36 #include "unormimp.h"
  37 #include "ucase.h"
  38 #include "ubidi_props.h"
  39 #include "uinvchar.h"
  40 #include "charstr.h"
  41 #include "cstring.h"
  42 #include "umutex.h"
  43 #include "uassert.h"
  44 #include "hash.h"
  45
  46 U_NAMESPACE_USE
  47
  48 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
  49
  50 // initial storage. Must be >= 0
  51 // *** same as in uniset.cpp ! ***
  52 #define START_EXTRA 16
  53
  54 // Define UChar constants using hex for EBCDIC compatibility
  55 // Used #define to reduce private static exports and memory access time.
  56 #define SET_OPEN        ((UChar)0x005B) /*[*/
  57 #define SET_CLOSE       ((UChar)0x005D) /*]*/
  58 #define HYPHEN          ((UChar)0x002D) /*-*/
  59 #define COMPLEMENT      ((UChar)0x005E) /*^*/
  60 #define COLON           ((UChar)0x003A) /*:*/
  61 #define BACKSLASH       ((UChar)0x005C) /*\*/
  62 #define INTERSECTION    ((UChar)0x0026) /*&*/
  63 #define UPPER_U         ((UChar)0x0055) /*U*/
  64 #define LOWER_U         ((UChar)0x0075) /*u*/
  65 #define OPEN_BRACE      ((UChar)123)    /*{*/
  66 #define CLOSE_BRACE     ((UChar)125)    /*}*/
  67 #define UPPER_P         ((UChar)0x0050) /*P*/
  68 #define LOWER_P         ((UChar)0x0070) /*p*/
  69 #define UPPER_N         ((UChar)78)     /*N*/
  70 #define EQUALS          ((UChar)0x003D) /*=*/
  71
  72 //static const UChar POSIX_OPEN[]  = { SET_OPEN,COLON,0 };  // "[:"
  73 static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 };  // ":]"
  74 //static const UChar PERL_OPEN[]   = { BACKSLASH,LOWER_P,0 }; // "\\p"
  75 static const UChar PERL_CLOSE[]  = { CLOSE_BRACE,0 };    // "}"
  76 //static const UChar NAME_OPEN[]   = { BACKSLASH,UPPER_N,0 };  // "\\N"
  77 static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/
  78
  79 // Special property set IDs
  80 static const char ANY[]   = "ANY";   // [\u0000-\U0010FFFF]
  81 static const char ASCII[] = "ASCII"; // [\u0000-\u007F]
  82 static const char ASSIGNED[] = "Assigned"; // [:^Cn:]
  83
  84 // Unicode name property alias
  85 #define NAME_PROP "na"
  86 #define NAME_PROP_LENGTH 2
  87
  88 /**
  89  * Delimiter string used in patterns to close a category reference:
  90  * ":]".  Example: "[:Lu:]".
  91  */
  92 //static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */
  93
  94 U_CDECL_BEGIN
  95
  96 static UnicodeSet *INCLUSIONS[UPROPS_SRC_COUNT] = { NULL }; // cached getInclusions()
  97
  98 //----------------------------------------------------------------
  99 // Inclusions list
 100 //----------------------------------------------------------------
 101
 102 // USetAdder implementation
 103 // Does not use uset.h to reduce code dependencies
 104 static void U_CALLCONV
 105 _set_add(USet *set, UChar32 c) {
 106     ((UnicodeSet *)set)->add(c);
 107 }
 108
 109 static void U_CALLCONV
 110 _set_addRange(USet *set, UChar32 start, UChar32 end) {
 111     ((UnicodeSet *)set)->add(start, end);
 112 }
 113
 114 static void U_CALLCONV
 115 _set_addString(USet *set, const UChar *str, int32_t length) {
 116     ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length));
 117 }
 118
 119 /**
 120  * Cleanup function for UnicodeSet
 121  */
 122 static UBool U_CALLCONV uset_cleanup(void) {
 123     int32_t i;
 124
 125     for(i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) {
 126         if (INCLUSIONS[i] != NULL) {
 127             delete INCLUSIONS[i];
 128             INCLUSIONS[i] = NULL;
 129         }
 130     }
 131
 132     return TRUE;
 133 }
 134
 135 U_CDECL_END
 136
 137 U_NAMESPACE_BEGIN
 138
 139 /*
 140 Reduce excessive reallocation, and make it easier to detect initialization
 141 problems.
 142 Usually you don't see smaller sets than this for Unicode 5.0.
 143 */
 144 #define DEFAULT_INCLUSION_CAPACITY 3072
 145
 146 const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) {
 147     UBool needInit;
 148     UMTX_CHECK(NULL, (INCLUSIONS[src] == NULL), needInit);
 149     if (needInit) {
 150         UnicodeSet* incl = new UnicodeSet();
 151         USetAdder sa = {
 152             (USet *)incl,
 153             _set_add,
 154             _set_addRange,
 155             _set_addString,
 156             NULL, // don't need remove()
 157             NULL // don't need removeRange()
 158         };
 159         incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status);
 160         if (incl != NULL) {
 161             switch(src) {
 162             case UPROPS_SRC_CHAR:
 163                 uchar_addPropertyStarts(&sa, &status);
 164                 break;
 165             case UPROPS_SRC_PROPSVEC:
 166                 upropsvec_addPropertyStarts(&sa, &status);
 167                 break;
 168             case UPROPS_SRC_CHAR_AND_PROPSVEC:
 169                 uchar_addPropertyStarts(&sa, &status);
 170                 upropsvec_addPropertyStarts(&sa, &status);
 171                 break;
 172             case UPROPS_SRC_HST:
 173                 uhst_addPropertyStarts(&sa, &status);
 174                 break;
 175 #if !UCONFIG_NO_NORMALIZATION
 176             case UPROPS_SRC_NORM:
 177                 unorm_addPropertyStarts(&sa, &status);
 178                 break;
 179 #endif
 180             case UPROPS_SRC_CASE:
 181                 ucase_addPropertyStarts(ucase_getSingleton(&status), &sa, &status);
 182                 break;
 183             case UPROPS_SRC_BIDI:
 184                 ubidi_addPropertyStarts(ubidi_getSingleton(&status), &sa, &status);
 185                 break;
 186             default:
 187                 status = U_INTERNAL_PROGRAM_ERROR;
 188                 break;
 189             }
 190             if (U_SUCCESS(status)) {
 191                 // Compact for caching
 192                 incl->compact();
 193                 umtx_lock(NULL);
 194                 if (INCLUSIONS[src] == NULL) {
 195                     INCLUSIONS[src] = incl;
 196                     incl = NULL;
 197                     ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
 198                 }
 199                 umtx_unlock(NULL);
 200             }
 201             delete incl;
 202         } else {
 203             status = U_MEMORY_ALLOCATION_ERROR;
 204         }
 205     }
 206     return INCLUSIONS[src];
 207 }
 208
 209 // helper functions for matching of pattern syntax pieces ------------------ ***
 210 // these functions are parallel to the PERL_OPEN etc. strings above
 211
 212 // using these functions is not only faster than UnicodeString::compare() and
 213 // caseCompare(), but they also make UnicodeSet work for simple patterns when
 214 // no Unicode properties data is available - when caseCompare() fails
 215
 216 static inline UBool
 217 isPerlOpen(const UnicodeString &pattern, int32_t pos) {
 218     UChar c;
 219     return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P);
 220 }
 221
 222 /*static inline UBool
 223 isPerlClose(const UnicodeString &pattern, int32_t pos) {
 224     return pattern.charAt(pos)==CLOSE_BRACE;
 225 }*/
 226
 227 static inline UBool
 228 isNameOpen(const UnicodeString &pattern, int32_t pos) {
 229     return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N;
 230 }
 231
 232 static inline UBool
 233 isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
 234     return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON;
 235 }
 236
 237 /*static inline UBool
 238 isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
 239     return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE;
 240 }*/
 241
 242 // TODO memory debugging provided inside uniset.cpp
 243 // could be made available here but probably obsolete with use of modern
 244 // memory leak checker tools
 245 #define _dbgct(me)
 246
 247 //----------------------------------------------------------------
 248 // Constructors &c
 249 //----------------------------------------------------------------
 250
 251 /**
 252  * Constructs a set from the given pattern, optionally ignoring
 253  * white space.  See the class description for the syntax of the
 254  * pattern language.
 255  * @param pattern a string specifying what characters are in the set
 256  */
 257 UnicodeSet::UnicodeSet(const UnicodeString& pattern,
 258                        UErrorCode& status) :
 259     len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
 260     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
 261     fFlags(0)
 262 {
 263     if(U_SUCCESS(status)){
 264         list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 265         /* test for NULL */
 266         if(list == NULL) {
 267             status = U_MEMORY_ALLOCATION_ERROR;
 268         }else{
 269             allocateStrings(status);
 270             applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
 271         }
 272     }
 273     _dbgct(this);
 274 }
 275
 276 /**
 277  * Constructs a set from the given pattern, optionally ignoring
 278  * white space.  See the class description for the syntax of the
 279  * pattern language.
 280  * @param pattern a string specifying what characters are in the set
 281  * @param options bitmask for options to apply to the pattern.
 282  * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
 283  */
 284 UnicodeSet::UnicodeSet(const UnicodeString& pattern,
 285                        uint32_t options,
 286                        const SymbolTable* symbols,
 287                        UErrorCode& status) :
 288     len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
 289     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
 290     fFlags(0)
 291 {
 292     if(U_SUCCESS(status)){
 293         list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 294         /* test for NULL */
 295         if(list == NULL) {
 296             status = U_MEMORY_ALLOCATION_ERROR;
 297         }else{
 298             allocateStrings(status);
 299             applyPattern(pattern, options, symbols, status);
 300         }
 301     }
 302     _dbgct(this);
 303 }
 304
 305 UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
 306                        uint32_t options,
 307                        const SymbolTable* symbols,
 308                        UErrorCode& status) :
 309     len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
 310     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
 311     fFlags(0)
 312 {
 313     if(U_SUCCESS(status)){
 314         list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 315         /* test for NULL */
 316         if(list == NULL) {
 317             status = U_MEMORY_ALLOCATION_ERROR;
 318         }else{
 319             allocateStrings(status);
 320             applyPattern(pattern, pos, options, symbols, status);
 321         }
 322     }
 323     _dbgct(this);
 324 }
 325
 326 //----------------------------------------------------------------
 327 // Public API
 328 //----------------------------------------------------------------
 329
 330 /**
 331  * Modifies this set to represent the set specified by the given
 332  * pattern, optionally ignoring white space.  See the class
 333  * description for the syntax of the pattern language.
 334  * @param pattern a string specifying what characters are in the set
 335  * @param ignoreSpaces if <code>true</code>, all spaces in the
 336  * pattern are ignored.  Spaces are those characters for which
 337  * <code>uprv_isRuleWhiteSpace()</code> is <code>true</code>.
 338  * Characters preceded by '\\' are escaped, losing any special
 339  * meaning they otherwise have.  Spaces may be included by
 340  * escaping them.
 341  * @exception <code>IllegalArgumentException</code> if the pattern
 342  * contains a syntax error.
 343  */
 344 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
 345                                      UErrorCode& status) {
 346     return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
 347 }
 348
 349
 350 /**
 351  * Modifies this set to represent the set specified by the given
 352  * pattern, optionally ignoring white space.  See the class
 353  * description for the syntax of the pattern language.
 354  * @param pattern a string specifying what characters are in the set
 355  * @param options bitmask for options to apply to the pattern.
 356  * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
 357  */
 358 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
 359                                      uint32_t options,
 360                                      const SymbolTable* symbols,
 361                                      UErrorCode& status) {
 362     if (U_FAILURE(status) || isFrozen()) {
 363         return *this;
 364     }
 365
 366     ParsePosition pos(0);
 367     applyPattern(pattern, pos, options, symbols, status);
 368     if (U_FAILURE(status)) return *this;
 369
 370     int32_t i = pos.getIndex();
 371
 372     if (options & USET_IGNORE_SPACE) {
 373         // Skip over trailing whitespace
 374         ICU_Utility::skipWhitespace(pattern, i, TRUE);
 375     }
 376
 377     if (i != pattern.length()) {
 378         status = U_ILLEGAL_ARGUMENT_ERROR;
 379     }
 380     return *this;
 381 }
 382
 383 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
 384                               ParsePosition& pos,
 385                               uint32_t options,
 386                               const SymbolTable* symbols,
 387                               UErrorCode& status) {
 388     if (U_FAILURE(status) || isFrozen()) {
 389         return *this;
 390     }
 391     // Need to build the pattern in a temporary string because
 392     // _applyPattern calls add() etc., which set pat to empty.
 393     UnicodeString rebuiltPat;
 394     RuleCharacterIterator chars(pattern, symbols, pos);
 395     applyPattern(chars, symbols, rebuiltPat, options, status);
 396     if (U_FAILURE(status)) return *this;
 397     if (chars.inVariable()) {
 398         // syntaxError(chars, "Extra chars in variable value");
 399         status = U_MALFORMED_SET;
 400         return *this;
 401     }
 402     setPattern(rebuiltPat);
 403     return *this;
 404 }
 405
 406 /**
 407  * Return true if the given position, in the given pattern, appears
 408  * to be the start of a UnicodeSet pattern.
 409  */
 410 UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
 411     return ((pos+1) < pattern.length() &&
 412             pattern.charAt(pos) == (UChar)91/*[*/) ||
 413         resemblesPropertyPattern(pattern, pos);
 414 }
 415
 416 //----------------------------------------------------------------
 417 // Implementation: Pattern parsing
 418 //----------------------------------------------------------------
 419
 420 /**
 421  * A small all-inline class to manage a UnicodeSet pointer.  Add
 422  * operator->() etc. as needed.
 423  */
 424 class UnicodeSetPointer {
 425     UnicodeSet* p;
 426 public:
 427     inline UnicodeSetPointer() : p(0) {}
 428     inline ~UnicodeSetPointer() { delete p; }
 429     inline UnicodeSet* pointer() { return p; }
 430     inline UBool allocate() {
 431         if (p == 0) {
 432             p = new UnicodeSet();
 433         }
 434         return p != 0;
 435     }
 436 };
 437
 438 /**
 439  * Parse the pattern from the given RuleCharacterIterator.  The
 440  * iterator is advanced over the parsed pattern.
 441  * @param chars iterator over the pattern characters.  Upon return
 442  * it will be advanced to the first character after the parsed
 443  * pattern, or the end of the iteration if all characters are
 444  * parsed.
 445  * @param symbols symbol table to use to parse and dereference
 446  * variables, or null if none.
 447  * @param rebuiltPat the pattern that was parsed, rebuilt or
 448  * copied from the input pattern, as appropriate.
 449  * @param options a bit mask of zero or more of the following:
 450  * IGNORE_SPACE, CASE.
 451  */
 452 void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
 453                               const SymbolTable* symbols,
 454                               UnicodeString& rebuiltPat,
 455                               uint32_t options,
 456                               UErrorCode& ec) {
 457     if (U_FAILURE(ec)) return;
 458
 459     // Syntax characters: [ ] ^ - & { }
 460
 461     // Recognized special forms for chars, sets: c-c s-s s&s
 462
 463     int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
 464                    RuleCharacterIterator::PARSE_ESCAPES;
 465     if ((options & USET_IGNORE_SPACE) != 0) {
 466         opts |= RuleCharacterIterator::SKIP_WHITESPACE;
 467     }
 468
 469     UnicodeString patLocal, buf;
 470     UBool usePat = FALSE;
 471     UnicodeSetPointer scratch;
 472     RuleCharacterIterator::Pos backup;
 473
 474     // mode: 0=before [, 1=between [...], 2=after ]
 475     // lastItem: 0=none, 1=char, 2=set
 476     int8_t lastItem = 0, mode = 0;
 477     UChar32 lastChar = 0;
 478     UChar op = 0;
 479
 480     UBool invert = FALSE;
 481
 482     clear();
 483
 484     while (mode != 2 && !chars.atEnd()) {
 485         U_ASSERT((lastItem == 0 && op == 0) ||
 486                  (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) ||
 487                  (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ ||
 488                                     op == INTERSECTION /*'&'*/)));
 489
 490         UChar32 c = 0;
 491         UBool literal = FALSE;
 492         UnicodeSet* nested = 0; // alias - do not delete
 493
 494         // -------- Check for property pattern
 495
 496         // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
 497         int8_t setMode = 0;
 498         if (resemblesPropertyPattern(chars, opts)) {
 499             setMode = 2;
 500         }
 501
 502         // -------- Parse '[' of opening delimiter OR nested set.
 503         // If there is a nested set, use `setMode' to define how
 504         // the set should be parsed.  If the '[' is part of the
 505         // opening delimiter for this pattern, parse special
 506         // strings "[", "[^", "[-", and "[^-".  Check for stand-in
 507         // characters representing a nested set in the symbol
 508         // table.
 509
 510         else {
 511             // Prepare to backup if necessary
 512             chars.getPos(backup);
 513             c = chars.next(opts, literal, ec);
 514             if (U_FAILURE(ec)) return;
 515
 516             if (c == 0x5B /*'['*/ && !literal) {
 517                 if (mode == 1) {
 518                     chars.setPos(backup); // backup
 519                     setMode = 1;
 520                 } else {
 521                     // Handle opening '[' delimiter
 522                     mode = 1;
 523                     patLocal.append((UChar) 0x5B /*'['*/);
 524                     chars.getPos(backup); // prepare to backup
 525                     c = chars.next(opts, literal, ec);
 526                     if (U_FAILURE(ec)) return;
 527                     if (c == 0x5E /*'^'*/ && !literal) {
 528                         invert = TRUE;
 529                         patLocal.append((UChar) 0x5E /*'^'*/);
 530                         chars.getPos(backup); // prepare to backup
 531                         c = chars.next(opts, literal, ec);
 532                         if (U_FAILURE(ec)) return;
 533                     }
 534                     // Fall through to handle special leading '-';
 535                     // otherwise restart loop for nested [], \p{}, etc.
 536                     if (c == HYPHEN /*'-'*/) {
 537                         literal = TRUE;
 538                         // Fall through to handle literal '-' below
 539                     } else {
 540                         chars.setPos(backup); // backup
 541                         continue;
 542                     }
 543                 }
 544             } else if (symbols != 0) {
 545                 const UnicodeFunctor *m = symbols->lookupMatcher(c);
 546                 if (m != 0) {
 547                     if (m->getDynamicClassID() != UnicodeSet::getStaticClassID()) {
 548                         ec = U_MALFORMED_SET;
 549                         return;
 550                     }
 551                     // casting away const, but `nested' won't be modified
 552                     // (important not to modify stored set)
 553                     nested = (UnicodeSet*) m;
 554                     setMode = 3;
 555                 }
 556             }
 557         }
 558
 559         // -------- Handle a nested set.  This either is inline in
 560         // the pattern or represented by a stand-in that has
 561         // previously been parsed and was looked up in the symbol
 562         // table.
 563
 564         if (setMode != 0) {
 565             if (lastItem == 1) {
 566                 if (op != 0) {
 567                     // syntaxError(chars, "Char expected after operator");
 568                     ec = U_MALFORMED_SET;
 569                     return;
 570                 }
 571                 add(lastChar, lastChar);
 572                 _appendToPat(patLocal, lastChar, FALSE);
 573                 lastItem = 0;
 574                 op = 0;
 575             }
 576
 577             if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) {
 578                 patLocal.append(op);
 579             }
 580
 581             if (nested == 0) {
 582                 // lazy allocation
 583                 if (!scratch.allocate()) {
 584                     ec = U_MEMORY_ALLOCATION_ERROR;
 585                     return;
 586                 }
 587                 nested = scratch.pointer();
 588             }
 589             switch (setMode) {
 590             case 1:
 591                 nested->applyPattern(chars, symbols, patLocal, options, ec);
 592                 break;
 593             case 2:
 594                 chars.skipIgnored(opts);
 595                 nested->applyPropertyPattern(chars, patLocal, ec);
 596                 if (U_FAILURE(ec)) return;
 597                 break;
 598             case 3: // `nested' already parsed
 599                 nested->_toPattern(patLocal, FALSE);
 600                 break;
 601             }
 602
 603             usePat = TRUE;
 604
 605             if (mode == 0) {
 606                 // Entire pattern is a category; leave parse loop
 607                 *this = *nested;
 608                 mode = 2;
 609                 break;
 610             }
 611
 612             switch (op) {
 613             case HYPHEN: /*'-'*/
 614                 removeAll(*nested);
 615                 break;
 616             case INTERSECTION: /*'&'*/
 617                 retainAll(*nested);
 618                 break;
 619             case 0:
 620                 addAll(*nested);
 621                 break;
 622             }
 623
 624             op = 0;
 625             lastItem = 2;
 626
 627             continue;
 628         }
 629
 630         if (mode == 0) {
 631             // syntaxError(chars, "Missing '['");
 632             ec = U_MALFORMED_SET;
 633             return;
 634         }
 635
 636         // -------- Parse special (syntax) characters.  If the
 637         // current character is not special, or if it is escaped,
 638         // then fall through and handle it below.
 639
 640         if (!literal) {
 641             switch (c) {
 642             case 0x5D /*']'*/:
 643                 if (lastItem == 1) {
 644                     add(lastChar, lastChar);
 645                     _appendToPat(patLocal, lastChar, FALSE);
 646                 }
 647                 // Treat final trailing '-' as a literal
 648                 if (op == HYPHEN /*'-'*/) {
 649                     add(op, op);
 650                     patLocal.append(op);
 651                 } else if (op == INTERSECTION /*'&'*/) {
 652                     // syntaxError(chars, "Trailing '&'");
 653                     ec = U_MALFORMED_SET;
 654                     return;
 655                 }
 656                 patLocal.append((UChar) 0x5D /*']'*/);
 657                 mode = 2;
 658                 continue;
 659             case HYPHEN /*'-'*/:
 660                 if (op == 0) {
 661                     if (lastItem != 0) {
 662                         op = (UChar) c;
 663                         continue;
 664                     } else {
 665                         // Treat final trailing '-' as a literal
 666                         add(c, c);
 667                         c = chars.next(opts, literal, ec);
 668                         if (U_FAILURE(ec)) return;
 669                         if (c == 0x5D /*']'*/ && !literal) {
 670                             patLocal.append(HYPHEN_RIGHT_BRACE);
 671                             mode = 2;
 672                             continue;
 673                         }
 674                     }
 675                 }
 676                 // syntaxError(chars, "'-' not after char or set");
 677                 ec = U_MALFORMED_SET;
 678                 return;
 679             case INTERSECTION /*'&'*/:
 680                 if (lastItem == 2 && op == 0) {
 681                     op = (UChar) c;
 682                     continue;
 683                 }
 684                 // syntaxError(chars, "'&' not after set");
 685                 ec = U_MALFORMED_SET;
 686                 return;
 687             case 0x5E /*'^'*/:
 688                 // syntaxError(chars, "'^' not after '['");
 689                 ec = U_MALFORMED_SET;
 690                 return;
 691             case 0x7B /*'{'*/:
 692                 if (op != 0) {
 693                     // syntaxError(chars, "Missing operand after operator");
 694                     ec = U_MALFORMED_SET;
 695                     return;
 696                 }
 697                 if (lastItem == 1) {
 698                     add(lastChar, lastChar);
 699                     _appendToPat(patLocal, lastChar, FALSE);
 700                 }
 701                 lastItem = 0;
 702                 buf.truncate(0);
 703                 {
 704                     UBool ok = FALSE;
 705                     while (!chars.atEnd()) {
 706                         c = chars.next(opts, literal, ec);
 707                         if (U_FAILURE(ec)) return;
 708                         if (c == 0x7D /*'}'*/ && !literal) {
 709                             ok = TRUE;
 710                             break;
 711                         }
 712                         buf.append(c);
 713                     }
 714                     if (buf.length() < 1 || !ok) {
 715                         // syntaxError(chars, "Invalid multicharacter string");
 716                         ec = U_MALFORMED_SET;
 717                         return;
 718                     }
 719                 }
 720                 // We have new string. Add it to set and continue;
 721                 // we don't need to drop through to the further
 722                 // processing
 723                 add(buf);
 724                 patLocal.append((UChar) 0x7B /*'{'*/);
 725                 _appendToPat(patLocal, buf, FALSE);
 726                 patLocal.append((UChar) 0x7D /*'}'*/);
 727                 continue;
 728             case SymbolTable::SYMBOL_REF:
 729                 //         symbols  nosymbols
 730                 // [a-$]   error    error (ambiguous)
 731                 // [a$]    anchor   anchor
 732                 // [a-$x]  var "x"* literal '$'
 733                 // [a-$.]  error    literal '$'
 734                 // *We won't get here in the case of var "x"
 735                 {
 736                     chars.getPos(backup);
 737                     c = chars.next(opts, literal, ec);
 738                     if (U_FAILURE(ec)) return;
 739                     UBool anchor = (c == 0x5D /*']'*/ && !literal);
 740                     if (symbols == 0 && !anchor) {
 741                         c = SymbolTable::SYMBOL_REF;
 742                         chars.setPos(backup);
 743                         break; // literal '$'
 744                     }
 745                     if (anchor && op == 0) {
 746                         if (lastItem == 1) {
 747                             add(lastChar, lastChar);
 748                             _appendToPat(patLocal, lastChar, FALSE);
 749                         }
 750                         add(U_ETHER);
 751                         usePat = TRUE;
 752                         patLocal.append((UChar) SymbolTable::SYMBOL_REF);
 753                         patLocal.append((UChar) 0x5D /*']'*/);
 754                         mode = 2;
 755                         continue;
 756                     }
 757                     // syntaxError(chars, "Unquoted '$'");
 758                     ec = U_MALFORMED_SET;
 759                     return;
 760                 }
 761             default:
 762                 break;
 763             }
 764         }
 765
 766         // -------- Parse literal characters.  This includes both
 767         // escaped chars ("\u4E01") and non-syntax characters
 768         // ("a").
 769
 770         switch (lastItem) {
 771         case 0:
 772             lastItem = 1;
 773             lastChar = c;
 774             break;
 775         case 1:
 776             if (op == HYPHEN /*'-'*/) {
 777                 if (lastChar >= c) {
 778                     // Don't allow redundant (a-a) or empty (b-a) ranges;
 779                     // these are most likely typos.
 780                     // syntaxError(chars, "Invalid range");
 781                     ec = U_MALFORMED_SET;
 782                     return;
 783                 }
 784                 add(lastChar, c);
 785                 _appendToPat(patLocal, lastChar, FALSE);
 786                 patLocal.append(op);
 787                 _appendToPat(patLocal, c, FALSE);
 788                 lastItem = 0;
 789                 op = 0;
 790             } else {
 791                 add(lastChar, lastChar);
 792                 _appendToPat(patLocal, lastChar, FALSE);
 793                 lastChar = c;
 794             }
 795             break;
 796         case 2:
 797             if (op != 0) {
 798                 // syntaxError(chars, "Set expected after operator");
 799                 ec = U_MALFORMED_SET;
 800                 return;
 801             }
 802             lastChar = c;
 803             lastItem = 1;
 804             break;
 805         }
 806     }
 807
 808     if (mode != 2) {
 809         // syntaxError(chars, "Missing ']'");
 810         ec = U_MALFORMED_SET;
 811         return;
 812     }
 813
 814     chars.skipIgnored(opts);
 815
 816     /**
 817      * Handle global flags (invert, case insensitivity).  If this
 818      * pattern should be compiled case-insensitive, then we need
 819      * to close over case BEFORE COMPLEMENTING.  This makes
 820      * patterns like /[^abc]/i work.
 821      */
 822     if ((options & USET_CASE_INSENSITIVE) != 0) {
 823         closeOver(USET_CASE_INSENSITIVE);
 824     }
 825     else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
 826         closeOver(USET_ADD_CASE_MAPPINGS);
 827     }
 828     if (invert) {
 829         complement();
 830     }
 831
 832     // Use the rebuilt pattern (patLocal) only if necessary.  Prefer the
 833     // generated pattern.
 834     if (usePat) {
 835         rebuiltPat.append(patLocal);
 836     } else {
 837         _generatePattern(rebuiltPat, FALSE);
 838     }
 839     if (isBogus() && U_SUCCESS(ec)) {
 840         // We likely ran out of memory. AHHH!
 841         ec = U_MEMORY_ALLOCATION_ERROR;
 842     }
 843 }
 844
 845 //----------------------------------------------------------------
 846 // Property set implementation
 847 //----------------------------------------------------------------
 848
 849 static UBool numericValueFilter(UChar32 ch, void* context) {
 850     return u_getNumericValue(ch) == *(double*)context;
 851 }
 852
 853 static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
 854     int32_t value = *(int32_t*)context;
 855     return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
 856 }
 857
 858 static UBool versionFilter(UChar32 ch, void* context) {
 859     UVersionInfo v, none = { 0, 0, 0, 0};
 860     UVersionInfo* version = (UVersionInfo*)context;
 861     u_charAge(ch, v);
 862     return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
 863 }
 864
 865 typedef struct {
 866     UProperty prop;
 867     int32_t value;
 868 } IntPropertyContext;
 869
 870 static UBool intPropertyFilter(UChar32 ch, void* context) {
 871     IntPropertyContext* c = (IntPropertyContext*)context;
 872     return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
 873 }
 874
 875
 876 /**
 877  * Generic filter-based scanning code for UCD property UnicodeSets.
 878  */
 879 void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
 880                              void* context,
 881                              int32_t src,
 882                              UErrorCode &status) {
 883     // Walk through all Unicode characters, noting the start
 884     // and end of each range for which filter.contain(c) is
 885     // true.  Add each range to a set.
 886     //
 887     // To improve performance, use the INCLUSIONS set, which
 888     // encodes information about character ranges that are known
 889     // to have identical properties. INCLUSIONS contains
 890     // only the first characters of such ranges.
 891     //
 892     // TODO Where possible, instead of scanning over code points,
 893     // use internal property data to initialize UnicodeSets for
 894     // those properties.  Scanning code points is slow.
 895     if (U_FAILURE(status)) return;
 896
 897     const UnicodeSet* inclusions = getInclusions(src, status);
 898     if (U_FAILURE(status)) {
 899         return;
 900     }
 901
 902     clear();
 903
 904     UChar32 startHasProperty = -1;
 905     int32_t limitRange = inclusions->getRangeCount();
 906
 907     for (int j=0; j<limitRange; ++j) {
 908         // get current range
 909         UChar32 start = inclusions->getRangeStart(j);
 910         UChar32 end = inclusions->getRangeEnd(j);
 911
 912         // for all the code points in the range, process
 913         for (UChar32 ch = start; ch <= end; ++ch) {
 914             // only add to this UnicodeSet on inflection points --
 915             // where the hasProperty value changes to false
 916             if ((*filter)(ch, context)) {
 917                 if (startHasProperty < 0) {
 918                     startHasProperty = ch;
 919                 }
 920             } else if (startHasProperty >= 0) {
 921                 add(startHasProperty, ch-1);
 922                 startHasProperty = -1;
 923             }
 924         }
 925     }
 926     if (startHasProperty >= 0) {
 927         add((UChar32)startHasProperty, (UChar32)0x10FFFF);
 928     }
 929     if (isBogus() && U_SUCCESS(status)) {
 930         // We likely ran out of memory. AHHH!
 931         status = U_MEMORY_ALLOCATION_ERROR;
 932     }
 933 }
 934
 935 static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
 936     /* Note: we use ' ' in compiler code page */
 937     int32_t j = 0;
 938     char ch;
 939     --dstCapacity; /* make room for term. zero */
 940     while ((ch = *src++) != 0) {
 941         if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
 942             continue;
 943         }
 944         if (j >= dstCapacity) return FALSE;
 945         dst[j++] = ch;
 946     }
 947     if (j > 0 && dst[j-1] == ' ') --j;
 948     dst[j] = 0;
 949     return TRUE;
 950 }
 951
 952 //----------------------------------------------------------------
 953 // Property set API
 954 //----------------------------------------------------------------
 955
// Error exit used by the UnicodeSet member functions below: sets ec to
// U_ILLEGAL_ARGUMENT_ERROR and returns *this from the enclosing function.
// Because the expansion contains "return *this;", it can only be used
// inside a member function whose return type accepts *this.
#define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;}
 957
 958 UnicodeSet&
 959 UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
 960     if (U_FAILURE(ec) || isFrozen()) return *this;
 961
 962     if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
 963         applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec);
 964     } else {
 965         IntPropertyContext c = {prop, value};
 966         applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec);
 967     }
 968     return *this;
 969 }
 970
 971 UnicodeSet&
 972 UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
 973                                const UnicodeString& value,
 974                                UErrorCode& ec) {
 975     if (U_FAILURE(ec) || isFrozen()) return *this;
 976
 977     // prop and value used to be converted to char * using the default
 978     // converter instead of the invariant conversion.
 979     // This should not be necessary because all Unicode property and value
 980     // names use only invariant characters.
 981     // If there are any variant characters, then we won't find them anyway.
 982     // Checking first avoids assertion failures in the conversion.
 983     if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
 984         !uprv_isInvariantUString(value.getBuffer(), value.length())
 985     ) {
 986         FAIL(ec);
 987     }
 988     CharString pname(prop);
 989     CharString vname(value);
 990
 991     UProperty p;
 992     int32_t v;
 993     UBool mustNotBeEmpty = FALSE, invert = FALSE;
 994
 995     if (value.length() > 0) {
 996         p = u_getPropertyEnum(pname);
 997         if (p == UCHAR_INVALID_CODE) FAIL(ec);
 998
 999         // Treat gc as gcm
1000         if (p == UCHAR_GENERAL_CATEGORY) {
1001             p = UCHAR_GENERAL_CATEGORY_MASK;
1002         }
1003
1004         if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
1005             (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
1006             (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
1007             v = u_getPropertyValueEnum(p, vname);
1008             if (v == UCHAR_INVALID_CODE) {
1009                 // Handle numeric CCC
1010                 if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
1011                     p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
1012                     p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
1013                     char* end;
1014                     double value = uprv_strtod(vname, &end);
1015                     v = (int32_t) value;
1016                     if (v != value || v < 0 || *end != 0) {
1017                         // non-integral or negative value, or trailing junk
1018                         FAIL(ec);
1019                     }
1020                     // If the resultant set is empty then the numeric value
1021                     // was invalid.
1022                     mustNotBeEmpty = TRUE;
1023                 } else {
1024                     FAIL(ec);
1025                 }
1026             }
1027         }
1028
1029         else {
1030
1031             switch (p) {
1032             case UCHAR_NUMERIC_VALUE:
1033                 {
1034                     char* end;
1035                     double value = uprv_strtod(vname, &end);
1036                     if (*end != 0) {
1037                         FAIL(ec);
1038                     }
1039                     applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec);
1040                     return *this;
1041                 }
1042                 break;
1043             case UCHAR_NAME:
1044             case UCHAR_UNICODE_1_NAME:
1045                 {
1046                     // Must munge name, since u_charFromName() does not do
1047                     // 'loose' matching.
1048                     char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
1049                     if (!mungeCharName(buf, vname, sizeof(buf))) FAIL(ec);
1050                     UCharNameChoice choice = (p == UCHAR_NAME) ?
1051                         U_EXTENDED_CHAR_NAME : U_UNICODE_10_CHAR_NAME;
1052                     UChar32 ch = u_charFromName(choice, buf, &ec);
1053                     if (U_SUCCESS(ec)) {
1054                         clear();
1055                         add(ch);
1056                         return *this;
1057                     } else {
1058                         FAIL(ec);
1059                     }
1060                 }
1061                 break;
1062             case UCHAR_AGE:
1063                 {
1064                     // Must munge name, since u_versionFromString() does not do
1065                     // 'loose' matching.
1066                     char buf[128];
1067                     if (!mungeCharName(buf, vname, sizeof(buf))) FAIL(ec);
1068                     UVersionInfo version;
1069                     u_versionFromString(version, buf);
1070                     applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec);
1071                     return *this;
1072                 }
1073                 break;
1074             default:
1075                 // p is a non-binary, non-enumerated property that we
1076                 // don't support (yet).
1077                 FAIL(ec);
1078             }
1079         }
1080     }
1081
1082     else {
1083         // value is empty.  Interpret as General Category, Script, or
1084         // Binary property.
1085         p = UCHAR_GENERAL_CATEGORY_MASK;
1086         v = u_getPropertyValueEnum(p, pname);
1087         if (v == UCHAR_INVALID_CODE) {
1088             p = UCHAR_SCRIPT;
1089             v = u_getPropertyValueEnum(p, pname);
1090             if (v == UCHAR_INVALID_CODE) {
1091                 p = u_getPropertyEnum(pname);
1092                 if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
1093                     v = 1;
1094                 } else if (0 == uprv_comparePropertyNames(ANY, pname)) {
1095                     set(MIN_VALUE, MAX_VALUE);
1096                     return *this;
1097                 } else if (0 == uprv_comparePropertyNames(ASCII, pname)) {
1098                     set(0, 0x7F);
1099                     return *this;
1100                 } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname)) {
1101                     // [:Assigned:]=[:^Cn:]
1102                     p = UCHAR_GENERAL_CATEGORY_MASK;
1103                     v = U_GC_CN_MASK;
1104                     invert = TRUE;
1105                 } else {
1106                     FAIL(ec);
1107                 }
1108             }
1109         }
1110     }
1111
1112     applyIntPropertyValue(p, v, ec);
1113     if(invert) {
1114         complement();
1115     }
1116
1117     if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) {
1118         // mustNotBeEmpty is set to true if an empty set indicates
1119         // invalid input.
1120         ec = U_ILLEGAL_ARGUMENT_ERROR;
1121     }
1122
1123     if (isBogus() && U_SUCCESS(ec)) {
1124         // We likely ran out of memory. AHHH!
1125         ec = U_MEMORY_ALLOCATION_ERROR;
1126     }
1127     return *this;
1128 }
1129
1130 //----------------------------------------------------------------
1131 // Property set patterns
1132 //----------------------------------------------------------------
1133
1134 /**
1135  * Return true if the given position, in the given pattern, appears
1136  * to be the start of a property set pattern.
1137  */
1138 UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
1139                                            int32_t pos) {
1140     // Patterns are at least 5 characters long
1141     if ((pos+5) > pattern.length()) {
1142         return FALSE;
1143     }
1144
1145     // Look for an opening [:, [:^, \p, or \P
1146     return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
1147 }
1148
1149 /**
1150  * Return true if the given iterator appears to point at a
1151  * property pattern.  Regardless of the result, return with the
1152  * iterator unchanged.
1153  * @param chars iterator over the pattern characters.  Upon return
1154  * it will be unchanged.
1155  * @param iterOpts RuleCharacterIterator options
1156  */
1157 UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
1158                                            int32_t iterOpts) {
1159     // NOTE: literal will always be FALSE, because we don't parse escapes.
1160     UBool result = FALSE, literal;
1161     UErrorCode ec = U_ZERO_ERROR;
1162     iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
1163     RuleCharacterIterator::Pos pos;
1164     chars.getPos(pos);
1165     UChar32 c = chars.next(iterOpts, literal, ec);
1166     if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) {
1167         UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
1168                                literal, ec);
1169         result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) :
1170                  (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/);
1171     }
1172     chars.setPos(pos);
1173     return result && U_SUCCESS(ec);
1174 }
1175
1176 /**
1177  * Parse the given property pattern at the given parse position.
1178  */
1179 UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
1180                                              ParsePosition& ppos,
1181                                              UErrorCode &ec) {
1182     int32_t pos = ppos.getIndex();
1183
1184     UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
1185     UBool isName = FALSE; // true for \N{pat}, o/w false
1186     UBool invert = FALSE;
1187
1188     if (U_FAILURE(ec)) return *this;
1189
1190     // Minimum length is 5 characters, e.g. \p{L}
1191     if ((pos+5) > pattern.length()) {
1192         FAIL(ec);
1193     }
1194
1195     // On entry, ppos should point to one of the following locations:
1196     // Look for an opening [:, [:^, \p, or \P
1197     if (isPOSIXOpen(pattern, pos)) {
1198         posix = TRUE;
1199         pos += 2;
1200         pos = ICU_Utility::skipWhitespace(pattern, pos);
1201         if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) {
1202             ++pos;
1203             invert = TRUE;
1204         }
1205     } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {
1206         UChar c = pattern.charAt(pos+1);
1207         invert = (c == UPPER_P);
1208         isName = (c == UPPER_N);
1209         pos += 2;
1210         pos = ICU_Utility::skipWhitespace(pattern, pos);
1211         if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) {
1212             // Syntax error; "\p" or "\P" not followed by "{"
1213             FAIL(ec);
1214         }
1215     } else {
1216         // Open delimiter not seen
1217         FAIL(ec);
1218     }
1219
1220     // Look for the matching close delimiter, either :] or }
1221     int32_t close = pattern.indexOf(posix ? POSIX_CLOSE : PERL_CLOSE, pos);
1222     if (close < 0) {
1223         // Syntax error; close delimiter missing
1224         FAIL(ec);
1225     }
1226
1227     // Look for an '=' sign.  If this is present, we will parse a
1228     // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
1229     // pattern.
1230     int32_t equals = pattern.indexOf(EQUALS, pos);
1231     UnicodeString propName, valueName;
1232     if (equals >= 0 && equals < close && !isName) {
1233         // Equals seen; parse medium/long pattern
1234         pattern.extractBetween(pos, equals, propName);
1235         pattern.extractBetween(equals+1, close, valueName);
1236     }
1237
1238     else {
1239         // Handle case where no '=' is seen, and \N{}
1240         pattern.extractBetween(pos, close, propName);
1241
1242         // Handle \N{name}
1243         if (isName) {
1244             // This is a little inefficient since it means we have to
1245             // parse NAME_PROP back to UCHAR_NAME even though we already
1246             // know it's UCHAR_NAME.  If we refactor the API to
1247             // support args of (UProperty, char*) then we can remove
1248             // NAME_PROP and make this a little more efficient.
1249             valueName = propName;
1250             propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);
1251         }
1252     }
1253
1254     applyPropertyAlias(propName, valueName, ec);
1255
1256     if (U_SUCCESS(ec)) {
1257         if (invert) {
1258             complement();
1259         }
1260
1261         // Move to the limit position after the close delimiter if the
1262         // parse succeeded.
1263         ppos.setIndex(close + (posix ? 2 : 1));
1264     }
1265
1266     return *this;
1267 }
1268
1269 /**
1270  * Parse a property pattern.
1271  * @param chars iterator over the pattern characters.  Upon return
1272  * it will be advanced to the first character after the parsed
1273  * pattern, or the end of the iteration if all characters are
1274  * parsed.
1275  * @param rebuiltPat the pattern that was parsed, rebuilt or
1276  * copied from the input pattern, as appropriate.
1277  */
1278 void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
1279                                       UnicodeString& rebuiltPat,
1280                                       UErrorCode& ec) {
1281     if (U_FAILURE(ec)) return;
1282     UnicodeString pattern;
1283     chars.lookahead(pattern);
1284     ParsePosition pos(0);
1285     applyPropertyPattern(pattern, pos, ec);
1286     if (U_FAILURE(ec)) return;
1287     if (pos.getIndex() == 0) {
1288         // syntaxError(chars, "Invalid property pattern");
1289         ec = U_MALFORMED_SET;
1290         return;
1291     }
1292     chars.jumpahead(pos.getIndex());
1293     rebuiltPat.append(pattern, 0, pos.getIndex());
1294 }
1295
1296 //----------------------------------------------------------------
1297 // Case folding API
1298 //----------------------------------------------------------------
1299
1300 // add the result of a full case mapping to the set
1301 // use str as a temporary string to avoid constructing one
1302 static inline void
1303 addCaseMapping(UnicodeSet &set, int32_t result, const UChar *full, UnicodeString &str) {
1304     if(result >= 0) {
1305         if(result > UCASE_MAX_STRING_LENGTH) {
1306             // add a single-code point case mapping
1307             set.add(result);
1308         } else {
1309             // add a string case mapping from full with length result
1310             str.setTo((UBool)FALSE, full, result);
1311             set.add(str);
1312         }
1313     }
1314     // result < 0: the code point mapped to itself, no need to add it
1315     // see ucase.h
1316 }
1317
1318 UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
1319     if (isFrozen() || isBogus()) {
1320         return *this;
1321     }
1322     if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) {
1323         UErrorCode status = U_ZERO_ERROR;
1324         const UCaseProps *csp = ucase_getSingleton(&status);
1325         if (U_SUCCESS(status)) {
1326             UnicodeSet foldSet(*this);
1327             UnicodeString str;
1328             USetAdder sa = {
1329                 (USet *)&foldSet,
1330                 _set_add,
1331                 _set_addRange,
1332                 _set_addString,
1333                 NULL, // don't need remove()
1334                 NULL // don't need removeRange()
1335             };
1336
1337             // start with input set to guarantee inclusion
1338             // USET_CASE: remove strings because the strings will actually be reduced (folded);
1339             //            therefore, start with no strings and add only those needed
1340             if (attribute & USET_CASE_INSENSITIVE) {
1341                 foldSet.strings->removeAllElements();
1342             }
1343
1344             int32_t n = getRangeCount();
1345             UChar32 result;
1346             const UChar *full;
1347             int32_t locCache = 0;
1348
1349             for (int32_t i=0; i<n; ++i) {
1350                 UChar32 start = getRangeStart(i);
1351                 UChar32 end   = getRangeEnd(i);
1352
1353                 if (attribute & USET_CASE_INSENSITIVE) {
1354                     // full case closure
1355                     for (UChar32 cp=start; cp<=end; ++cp) {
1356                         ucase_addCaseClosure(csp, cp, &sa);
1357                     }
1358                 } else {
1359                     // add case mappings
1360                     // (does not add long s for regular s, or Kelvin for k, for example)
1361                     for (UChar32 cp=start; cp<=end; ++cp) {
1362                         result = ucase_toFullLower(csp, cp, NULL, NULL, &full, "", &locCache);
1363                         addCaseMapping(foldSet, result, full, str);
1364
1365                         result = ucase_toFullTitle(csp, cp, NULL, NULL, &full, "", &locCache);
1366                         addCaseMapping(foldSet, result, full, str);
1367
1368                         result = ucase_toFullUpper(csp, cp, NULL, NULL, &full, "", &locCache);
1369                         addCaseMapping(foldSet, result, full, str);
1370
1371                         result = ucase_toFullFolding(csp, cp, &full, 0);
1372                         addCaseMapping(foldSet, result, full, str);
1373                     }
1374                 }
1375             }
1376             if (strings != NULL && strings->size() > 0) {
1377                 if (attribute & USET_CASE_INSENSITIVE) {
1378                     for (int32_t j=0; j<strings->size(); ++j) {
1379                         str = *(const UnicodeString *) strings->elementAt(j);
1380                         str.foldCase();
1381                         if(!ucase_addStringCaseClosure(csp, str.getBuffer(), str.length(), &sa)) {
1382                             foldSet.add(str); // does not map to code points: add the folded string itself
1383                         }
1384                     }
1385                 } else {
1386                     Locale root("");
1387 #if !UCONFIG_NO_BREAK_ITERATION
1388                     BreakIterator *bi = BreakIterator::createWordInstance(root, status);
1389 #endif
1390                     if (U_SUCCESS(status)) {
1391                         const UnicodeString *pStr;
1392
1393                         for (int32_t j=0; j<strings->size(); ++j) {
1394                             pStr = (const UnicodeString *) strings->elementAt(j);
1395                             (str = *pStr).toLower(root);
1396                             foldSet.add(str);
1397 #if !UCONFIG_NO_BREAK_ITERATION
1398                             (str = *pStr).toTitle(bi, root);
1399                             foldSet.add(str);
1400 #endif
1401                             (str = *pStr).toUpper(root);
1402                             foldSet.add(str);
1403                             (str = *pStr).foldCase();
1404                             foldSet.add(str);
1405                         }
1406                     }
1407 #if !UCONFIG_NO_BREAK_ITERATION
1408                     delete bi;
1409 #endif
1410                 }
1411             }
1412             *this = foldSet;
1413         }
1414     }
1415     return *this;
1416 }
1417
1418 U_NAMESPACE_END