icuSources/common/uniset_props.cpp

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 1999-2012, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  uniset_props.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2004aug25
  14 *   created by: Markus W. Scherer
  15 *
  16 *   Character property dependent functions moved here from uniset.cpp
  17 */
  18
  19 #include "unicode/utypes.h"
  20 #include "unicode/uniset.h"
  21 #include "unicode/parsepos.h"
  22 #include "unicode/uchar.h"
  23 #include "unicode/uscript.h"
  24 #include "unicode/symtable.h"
  25 #include "unicode/uset.h"
  26 #include "unicode/locid.h"
  27 #include "unicode/brkiter.h"
  28 #include "uset_imp.h"
  29 #include "ruleiter.h"
  30 #include "cmemory.h"
  31 #include "ucln_cmn.h"
  32 #include "util.h"
  33 #include "uvector.h"
  34 #include "uprops.h"
  35 #include "propname.h"
  36 #include "normalizer2impl.h"
  37 #include "ucase.h"
  38 #include "ubidi_props.h"
  39 #include "uinvchar.h"
  40 #include "uprops.h"
  41 #include "charstr.h"
  42 #include "cstring.h"
  43 #include "mutex.h"
  44 #include "umutex.h"
  45 #include "uassert.h"
  46 #include "hash.h"
  47
  48 U_NAMESPACE_USE
  49
  50 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
  51
  52 // initial storage. Must be >= 0
  53 // *** same as in uniset.cpp ! ***
  54 #define START_EXTRA 16
  55
  56 // Define UChar constants using hex for EBCDIC compatibility
  57 // Used #define to reduce private static exports and memory access time.
  58 #define SET_OPEN        ((UChar)0x005B) /*[*/
  59 #define SET_CLOSE       ((UChar)0x005D) /*]*/
  60 #define HYPHEN          ((UChar)0x002D) /*-*/
  61 #define COMPLEMENT      ((UChar)0x005E) /*^*/
  62 #define COLON           ((UChar)0x003A) /*:*/
  63 #define BACKSLASH       ((UChar)0x005C) /*\*/
  64 #define INTERSECTION    ((UChar)0x0026) /*&*/
  65 #define UPPER_U         ((UChar)0x0055) /*U*/
  66 #define LOWER_U         ((UChar)0x0075) /*u*/
  67 #define OPEN_BRACE      ((UChar)123)    /*{*/
  68 #define CLOSE_BRACE     ((UChar)125)    /*}*/
  69 #define UPPER_P         ((UChar)0x0050) /*P*/
  70 #define LOWER_P         ((UChar)0x0070) /*p*/
  71 #define UPPER_N         ((UChar)78)     /*N*/
  72 #define EQUALS          ((UChar)0x003D) /*=*/
  73
  74 //static const UChar POSIX_OPEN[]  = { SET_OPEN,COLON,0 };  // "[:"
  75 static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 };  // ":]"
  76 //static const UChar PERL_OPEN[]   = { BACKSLASH,LOWER_P,0 }; // "\\p"
  77 //static const UChar PERL_CLOSE[]  = { CLOSE_BRACE,0 };    // "}"
  78 //static const UChar NAME_OPEN[]   = { BACKSLASH,UPPER_N,0 };  // "\\N"
  79 static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/
  80
  81 // Special property set IDs
  82 static const char ANY[]   = "ANY";   // [\u0000-\U0010FFFF]
  83 static const char ASCII[] = "ASCII"; // [\u0000-\u007F]
  84 static const char ASSIGNED[] = "Assigned"; // [:^Cn:]
  85
  86 // Unicode name property alias
  87 #define NAME_PROP "na"
  88 #define NAME_PROP_LENGTH 2
  89
  90 /**
  91  * Delimiter string used in patterns to close a category reference:
  92  * ":]".  Example: "[:Lu:]".
  93  */
  94 //static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */
  95
  96 // Cached sets ------------------------------------------------------------- ***
  97
  98 U_CDECL_BEGIN
  99 static UBool U_CALLCONV uset_cleanup();
 100 U_CDECL_END
 101
 102 // Not a TriStateSingletonWrapper because we think the UnicodeSet constructor
 103 // can only fail with an out-of-memory error
 104 // if we have a correct pattern and the properties data is hardcoded and always available.
 105 class UnicodeSetSingleton : public SimpleSingletonWrapper<UnicodeSet> {
 106 public:
 107     UnicodeSetSingleton(SimpleSingleton &s, const char *pattern) :
 108             SimpleSingletonWrapper<UnicodeSet>(s), fPattern(pattern) {}
 109     UnicodeSet *getInstance(UErrorCode &errorCode) {
 110         return SimpleSingletonWrapper<UnicodeSet>::getInstance(createInstance, fPattern, errorCode);
 111     }
 112 private:
 113     static void *createInstance(const void *context, UErrorCode &errorCode) {
 114         UnicodeString pattern((const char *)context, -1, US_INV);
 115         UnicodeSet *set=new UnicodeSet(pattern, errorCode);
 116         if(set==NULL) {
 117             errorCode=U_MEMORY_ALLOCATION_ERROR;
 118             return NULL;
 119         }
 120         set->freeze();
 121         ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
 122         return set;
 123     }
 124
 125     const char *fPattern;
 126 };
 127
 128 U_CDECL_BEGIN
 129
// Cache of the sets built by UnicodeSet::getInclusions(), indexed by property
// source. Entries start out NULL, are filled in by getInclusions() and are
// deleted again by uset_cleanup().
static UnicodeSet *INCLUSIONS[UPROPS_SRC_COUNT] = { NULL }; // cached getInclusions()

// Storage for the singleton used by uniset_getUnicode32Instance().
STATIC_SIMPLE_SINGLETON(uni32Singleton);
 133
 134 //----------------------------------------------------------------
 135 // Inclusions list
 136 //----------------------------------------------------------------
 137
 138 // USetAdder implementation
 139 // Does not use uset.h to reduce code dependencies
 140 static void U_CALLCONV
 141 _set_add(USet *set, UChar32 c) {
 142     ((UnicodeSet *)set)->add(c);
 143 }
 144
 145 static void U_CALLCONV
 146 _set_addRange(USet *set, UChar32 start, UChar32 end) {
 147     ((UnicodeSet *)set)->add(start, end);
 148 }
 149
 150 static void U_CALLCONV
 151 _set_addString(USet *set, const UChar *str, int32_t length) {
 152     ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length));
 153 }
 154
 155 /**
 156  * Cleanup function for UnicodeSet
 157  */
 158 static UBool U_CALLCONV uset_cleanup(void) {
 159     int32_t i;
 160
 161     for(i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) {
 162         if (INCLUSIONS[i] != NULL) {
 163             delete INCLUSIONS[i];
 164             INCLUSIONS[i] = NULL;
 165         }
 166     }
 167     UnicodeSetSingleton(uni32Singleton, NULL).deleteInstance();
 168     return TRUE;
 169 }
 170
 171 U_CDECL_END
 172
 173 U_NAMESPACE_BEGIN
 174
 175 /*
 176 Reduce excessive reallocation, and make it easier to detect initialization
 177 problems.
 178 Usually you don't see smaller sets than this for Unicode 5.0.
 179 */
 180 #define DEFAULT_INCLUSION_CAPACITY 3072
 181
 182 const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) {
 183     UBool needInit;
 184     UMTX_CHECK(NULL, (INCLUSIONS[src] == NULL), needInit);
 185     if (needInit) {
 186         UnicodeSet* incl = new UnicodeSet();
 187         USetAdder sa = {
 188             (USet *)incl,
 189             _set_add,
 190             _set_addRange,
 191             _set_addString,
 192             NULL, // don't need remove()
 193             NULL // don't need removeRange()
 194         };
 195         if (incl != NULL) {
 196             incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status);
 197             switch(src) {
 198             case UPROPS_SRC_CHAR:
 199                 uchar_addPropertyStarts(&sa, &status);
 200                 break;
 201             case UPROPS_SRC_PROPSVEC:
 202                 upropsvec_addPropertyStarts(&sa, &status);
 203                 break;
 204             case UPROPS_SRC_CHAR_AND_PROPSVEC:
 205                 uchar_addPropertyStarts(&sa, &status);
 206                 upropsvec_addPropertyStarts(&sa, &status);
 207                 break;
 208 #if !UCONFIG_NO_NORMALIZATION
 209             case UPROPS_SRC_CASE_AND_NORM: {
 210                 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
 211                 if(U_SUCCESS(status)) {
 212                     impl->addPropertyStarts(&sa, status);
 213                 }
 214                 ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status);
 215                 break;
 216             }
 217             case UPROPS_SRC_NFC: {
 218                 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
 219                 if(U_SUCCESS(status)) {
 220                     impl->addPropertyStarts(&sa, status);
 221                 }
 222                 break;
 223             }
 224             case UPROPS_SRC_NFKC: {
 225                 const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status);
 226                 if(U_SUCCESS(status)) {
 227                     impl->addPropertyStarts(&sa, status);
 228                 }
 229                 break;
 230             }
 231             case UPROPS_SRC_NFKC_CF: {
 232                 const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status);
 233                 if(U_SUCCESS(status)) {
 234                     impl->addPropertyStarts(&sa, status);
 235                 }
 236                 break;
 237             }
 238             case UPROPS_SRC_NFC_CANON_ITER: {
 239                 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
 240                 if(U_SUCCESS(status)) {
 241                     impl->addCanonIterPropertyStarts(&sa, status);
 242                 }
 243                 break;
 244             }
 245 #endif
 246             case UPROPS_SRC_CASE:
 247                 ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status);
 248                 break;
 249             case UPROPS_SRC_BIDI:
 250                 ubidi_addPropertyStarts(ubidi_getSingleton(), &sa, &status);
 251                 break;
 252             default:
 253                 status = U_INTERNAL_PROGRAM_ERROR;
 254                 break;
 255             }
 256             if (U_SUCCESS(status)) {
 257                 // Compact for caching
 258                 incl->compact();
 259                 umtx_lock(NULL);
 260                 if (INCLUSIONS[src] == NULL) {
 261                     INCLUSIONS[src] = incl;
 262                     incl = NULL;
 263                     ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
 264                 }
 265                 umtx_unlock(NULL);
 266             }
 267             delete incl;
 268         } else {
 269             status = U_MEMORY_ALLOCATION_ERROR;
 270         }
 271     }
 272     return INCLUSIONS[src];
 273 }
 274
 275 // Cache some sets for other services -------------------------------------- ***
 276
 277 U_CFUNC UnicodeSet *
 278 uniset_getUnicode32Instance(UErrorCode &errorCode) {
 279     return UnicodeSetSingleton(uni32Singleton, "[:age=3.2:]").getInstance(errorCode);
 280 }
 281
 282 // helper functions for matching of pattern syntax pieces ------------------ ***
 283 // these functions are parallel to the PERL_OPEN etc. strings above
 284
 285 // using these functions is not only faster than UnicodeString::compare() and
 286 // caseCompare(), but they also make UnicodeSet work for simple patterns when
 287 // no Unicode properties data is available - when caseCompare() fails
 288
 289 static inline UBool
 290 isPerlOpen(const UnicodeString &pattern, int32_t pos) {
 291     UChar c;
 292     return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P);
 293 }
 294
 295 /*static inline UBool
 296 isPerlClose(const UnicodeString &pattern, int32_t pos) {
 297     return pattern.charAt(pos)==CLOSE_BRACE;
 298 }*/
 299
 300 static inline UBool
 301 isNameOpen(const UnicodeString &pattern, int32_t pos) {
 302     return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N;
 303 }
 304
 305 static inline UBool
 306 isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
 307     return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON;
 308 }
 309
 310 /*static inline UBool
 311 isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
 312     return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE;
 313 }*/
 314
 315 // TODO memory debugging provided inside uniset.cpp
 316 // could be made available here but probably obsolete with use of modern
 317 // memory leak checker tools
 318 #define _dbgct(me)
 319
 320 //----------------------------------------------------------------
 321 // Constructors &c
 322 //----------------------------------------------------------------
 323
 324 /**
 325  * Constructs a set from the given pattern, optionally ignoring
 326  * white space.  See the class description for the syntax of the
 327  * pattern language.
 328  * @param pattern a string specifying what characters are in the set
 329  */
 330 UnicodeSet::UnicodeSet(const UnicodeString& pattern,
 331                        UErrorCode& status) :
 332     len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
 333     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
 334     fFlags(0)
 335 {
 336     if(U_SUCCESS(status)){
 337         list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 338         /* test for NULL */
 339         if(list == NULL) {
 340             status = U_MEMORY_ALLOCATION_ERROR;
 341         }else{
 342             allocateStrings(status);
 343             applyPattern(pattern, status);
 344         }
 345     }
 346     _dbgct(this);
 347 }
 348
 349 //----------------------------------------------------------------
 350 // Public API
 351 //----------------------------------------------------------------
 352
 353 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
 354                                      UErrorCode& status) {
 355     // Equivalent to
 356     //   return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
 357     // but without dependency on closeOver().
 358     ParsePosition pos(0);
 359     applyPatternIgnoreSpace(pattern, pos, NULL, status);
 360     if (U_FAILURE(status)) return *this;
 361
 362     int32_t i = pos.getIndex();
 363     // Skip over trailing whitespace
 364     ICU_Utility::skipWhitespace(pattern, i, TRUE);
 365     if (i != pattern.length()) {
 366         status = U_ILLEGAL_ARGUMENT_ERROR;
 367     }
 368     return *this;
 369 }
 370
 371 void
 372 UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,
 373                                     ParsePosition& pos,
 374                                     const SymbolTable* symbols,
 375                                     UErrorCode& status) {
 376     if (U_FAILURE(status)) {
 377         return;
 378     }
 379     if (isFrozen()) {
 380         status = U_NO_WRITE_PERMISSION;
 381         return;
 382     }
 383     // Need to build the pattern in a temporary string because
 384     // _applyPattern calls add() etc., which set pat to empty.
 385     UnicodeString rebuiltPat;
 386     RuleCharacterIterator chars(pattern, symbols, pos);
 387     applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, status);
 388     if (U_FAILURE(status)) return;
 389     if (chars.inVariable()) {
 390         // syntaxError(chars, "Extra chars in variable value");
 391         status = U_MALFORMED_SET;
 392         return;
 393     }
 394     setPattern(rebuiltPat);
 395 }
 396
 397 /**
 398  * Return true if the given position, in the given pattern, appears
 399  * to be the start of a UnicodeSet pattern.
 400  */
 401 UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
 402     return ((pos+1) < pattern.length() &&
 403             pattern.charAt(pos) == (UChar)91/*[*/) ||
 404         resemblesPropertyPattern(pattern, pos);
 405 }
 406
 407 //----------------------------------------------------------------
 408 // Implementation: Pattern parsing
 409 //----------------------------------------------------------------
 410
 411 /**
 412  * A small all-inline class to manage a UnicodeSet pointer.  Add
 413  * operator->() etc. as needed.
 414  */
 415 class UnicodeSetPointer {
 416     UnicodeSet* p;
 417 public:
 418     inline UnicodeSetPointer() : p(0) {}
 419     inline ~UnicodeSetPointer() { delete p; }
 420     inline UnicodeSet* pointer() { return p; }
 421     inline UBool allocate() {
 422         if (p == 0) {
 423             p = new UnicodeSet();
 424         }
 425         return p != 0;
 426     }
 427 };
 428
 429 /**
 430  * Parse the pattern from the given RuleCharacterIterator.  The
 431  * iterator is advanced over the parsed pattern.
 432  * @param chars iterator over the pattern characters.  Upon return
 433  * it will be advanced to the first character after the parsed
 434  * pattern, or the end of the iteration if all characters are
 435  * parsed.
 436  * @param symbols symbol table to use to parse and dereference
 437  * variables, or null if none.
 438  * @param rebuiltPat the pattern that was parsed, rebuilt or
 439  * copied from the input pattern, as appropriate.
 440  * @param options a bit mask of zero or more of the following:
 441  * IGNORE_SPACE, CASE.
 442  */
 443 void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
 444                               const SymbolTable* symbols,
 445                               UnicodeString& rebuiltPat,
 446                               uint32_t options,
 447                               UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
 448                               UErrorCode& ec) {
 449     if (U_FAILURE(ec)) return;
 450
 451     // Syntax characters: [ ] ^ - & { }
 452
 453     // Recognized special forms for chars, sets: c-c s-s s&s
 454
 455     int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
 456                    RuleCharacterIterator::PARSE_ESCAPES;
 457     if ((options & USET_IGNORE_SPACE) != 0) {
 458         opts |= RuleCharacterIterator::SKIP_WHITESPACE;
 459     }
 460
 461     UnicodeString patLocal, buf;
 462     UBool usePat = FALSE;
 463     UnicodeSetPointer scratch;
 464     RuleCharacterIterator::Pos backup;
 465
 466     // mode: 0=before [, 1=between [...], 2=after ]
 467     // lastItem: 0=none, 1=char, 2=set
 468     int8_t lastItem = 0, mode = 0;
 469     UChar32 lastChar = 0;
 470     UChar op = 0;
 471
 472     UBool invert = FALSE;
 473
 474     clear();
 475
 476     while (mode != 2 && !chars.atEnd()) {
 477         U_ASSERT((lastItem == 0 && op == 0) ||
 478                  (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) ||
 479                  (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ ||
 480                                     op == INTERSECTION /*'&'*/)));
 481
 482         UChar32 c = 0;
 483         UBool literal = FALSE;
 484         UnicodeSet* nested = 0; // alias - do not delete
 485
 486         // -------- Check for property pattern
 487
 488         // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
 489         int8_t setMode = 0;
 490         if (resemblesPropertyPattern(chars, opts)) {
 491             setMode = 2;
 492         }
 493
 494         // -------- Parse '[' of opening delimiter OR nested set.
 495         // If there is a nested set, use `setMode' to define how
 496         // the set should be parsed.  If the '[' is part of the
 497         // opening delimiter for this pattern, parse special
 498         // strings "[", "[^", "[-", and "[^-".  Check for stand-in
 499         // characters representing a nested set in the symbol
 500         // table.
 501
 502         else {
 503             // Prepare to backup if necessary
 504             chars.getPos(backup);
 505             c = chars.next(opts, literal, ec);
 506             if (U_FAILURE(ec)) return;
 507
 508             if (c == 0x5B /*'['*/ && !literal) {
 509                 if (mode == 1) {
 510                     chars.setPos(backup); // backup
 511                     setMode = 1;
 512                 } else {
 513                     // Handle opening '[' delimiter
 514                     mode = 1;
 515                     patLocal.append((UChar) 0x5B /*'['*/);
 516                     chars.getPos(backup); // prepare to backup
 517                     c = chars.next(opts, literal, ec);
 518                     if (U_FAILURE(ec)) return;
 519                     if (c == 0x5E /*'^'*/ && !literal) {
 520                         invert = TRUE;
 521                         patLocal.append((UChar) 0x5E /*'^'*/);
 522                         chars.getPos(backup); // prepare to backup
 523                         c = chars.next(opts, literal, ec);
 524                         if (U_FAILURE(ec)) return;
 525                     }
 526                     // Fall through to handle special leading '-';
 527                     // otherwise restart loop for nested [], \p{}, etc.
 528                     if (c == HYPHEN /*'-'*/) {
 529                         literal = TRUE;
 530                         // Fall through to handle literal '-' below
 531                     } else {
 532                         chars.setPos(backup); // backup
 533                         continue;
 534                     }
 535                 }
 536             } else if (symbols != 0) {
 537                 const UnicodeFunctor *m = symbols->lookupMatcher(c);
 538                 if (m != 0) {
 539                     const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);
 540                     if (ms == NULL) {
 541                         ec = U_MALFORMED_SET;
 542                         return;
 543                     }
 544                     // casting away const, but `nested' won't be modified
 545                     // (important not to modify stored set)
 546                     nested = const_cast<UnicodeSet*>(ms);
 547                     setMode = 3;
 548                 }
 549             }
 550         }
 551
 552         // -------- Handle a nested set.  This either is inline in
 553         // the pattern or represented by a stand-in that has
 554         // previously been parsed and was looked up in the symbol
 555         // table.
 556
 557         if (setMode != 0) {
 558             if (lastItem == 1) {
 559                 if (op != 0) {
 560                     // syntaxError(chars, "Char expected after operator");
 561                     ec = U_MALFORMED_SET;
 562                     return;
 563                 }
 564                 add(lastChar, lastChar);
 565                 _appendToPat(patLocal, lastChar, FALSE);
 566                 lastItem = 0;
 567                 op = 0;
 568             }
 569
 570             if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) {
 571                 patLocal.append(op);
 572             }
 573
 574             if (nested == 0) {
 575                 // lazy allocation
 576                 if (!scratch.allocate()) {
 577                     ec = U_MEMORY_ALLOCATION_ERROR;
 578                     return;
 579                 }
 580                 nested = scratch.pointer();
 581             }
 582             switch (setMode) {
 583             case 1:
 584                 nested->applyPattern(chars, symbols, patLocal, options, caseClosure, ec);
 585                 break;
 586             case 2:
 587                 chars.skipIgnored(opts);
 588                 nested->applyPropertyPattern(chars, patLocal, ec);
 589                 if (U_FAILURE(ec)) return;
 590                 break;
 591             case 3: // `nested' already parsed
 592                 nested->_toPattern(patLocal, FALSE);
 593                 break;
 594             }
 595
 596             usePat = TRUE;
 597
 598             if (mode == 0) {
 599                 // Entire pattern is a category; leave parse loop
 600                 *this = *nested;
 601                 mode = 2;
 602                 break;
 603             }
 604
 605             switch (op) {
 606             case HYPHEN: /*'-'*/
 607                 removeAll(*nested);
 608                 break;
 609             case INTERSECTION: /*'&'*/
 610                 retainAll(*nested);
 611                 break;
 612             case 0:
 613                 addAll(*nested);
 614                 break;
 615             }
 616
 617             op = 0;
 618             lastItem = 2;
 619
 620             continue;
 621         }
 622
 623         if (mode == 0) {
 624             // syntaxError(chars, "Missing '['");
 625             ec = U_MALFORMED_SET;
 626             return;
 627         }
 628
 629         // -------- Parse special (syntax) characters.  If the
 630         // current character is not special, or if it is escaped,
 631         // then fall through and handle it below.
 632
 633         if (!literal) {
 634             switch (c) {
 635             case 0x5D /*']'*/:
 636                 if (lastItem == 1) {
 637                     add(lastChar, lastChar);
 638                     _appendToPat(patLocal, lastChar, FALSE);
 639                 }
 640                 // Treat final trailing '-' as a literal
 641                 if (op == HYPHEN /*'-'*/) {
 642                     add(op, op);
 643                     patLocal.append(op);
 644                 } else if (op == INTERSECTION /*'&'*/) {
 645                     // syntaxError(chars, "Trailing '&'");
 646                     ec = U_MALFORMED_SET;
 647                     return;
 648                 }
 649                 patLocal.append((UChar) 0x5D /*']'*/);
 650                 mode = 2;
 651                 continue;
 652             case HYPHEN /*'-'*/:
 653                 if (op == 0) {
 654                     if (lastItem != 0) {
 655                         op = (UChar) c;
 656                         continue;
 657                     } else {
 658                         // Treat final trailing '-' as a literal
 659                         add(c, c);
 660                         c = chars.next(opts, literal, ec);
 661                         if (U_FAILURE(ec)) return;
 662                         if (c == 0x5D /*']'*/ && !literal) {
 663                             patLocal.append(HYPHEN_RIGHT_BRACE, 2);
 664                             mode = 2;
 665                             continue;
 666                         }
 667                     }
 668                 }
 669                 // syntaxError(chars, "'-' not after char or set");
 670                 ec = U_MALFORMED_SET;
 671                 return;
 672             case INTERSECTION /*'&'*/:
 673                 if (lastItem == 2 && op == 0) {
 674                     op = (UChar) c;
 675                     continue;
 676                 }
 677                 // syntaxError(chars, "'&' not after set");
 678                 ec = U_MALFORMED_SET;
 679                 return;
 680             case 0x5E /*'^'*/:
 681                 // syntaxError(chars, "'^' not after '['");
 682                 ec = U_MALFORMED_SET;
 683                 return;
 684             case 0x7B /*'{'*/:
 685                 if (op != 0) {
 686                     // syntaxError(chars, "Missing operand after operator");
 687                     ec = U_MALFORMED_SET;
 688                     return;
 689                 }
 690                 if (lastItem == 1) {
 691                     add(lastChar, lastChar);
 692                     _appendToPat(patLocal, lastChar, FALSE);
 693                 }
 694                 lastItem = 0;
 695                 buf.truncate(0);
 696                 {
 697                     UBool ok = FALSE;
 698                     while (!chars.atEnd()) {
 699                         c = chars.next(opts, literal, ec);
 700                         if (U_FAILURE(ec)) return;
 701                         if (c == 0x7D /*'}'*/ && !literal) {
 702                             ok = TRUE;
 703                             break;
 704                         }
 705                         buf.append(c);
 706                     }
 707                     if (buf.length() < 1 || !ok) {
 708                         // syntaxError(chars, "Invalid multicharacter string");
 709                         ec = U_MALFORMED_SET;
 710                         return;
 711                     }
 712                 }
 713                 // We have new string. Add it to set and continue;
 714                 // we don't need to drop through to the further
 715                 // processing
 716                 add(buf);
 717                 patLocal.append((UChar) 0x7B /*'{'*/);
 718                 _appendToPat(patLocal, buf, FALSE);
 719                 patLocal.append((UChar) 0x7D /*'}'*/);
 720                 continue;
 721             case SymbolTable::SYMBOL_REF:
 722                 //         symbols  nosymbols
 723                 // [a-$]   error    error (ambiguous)
 724                 // [a$]    anchor   anchor
 725                 // [a-$x]  var "x"* literal '$'
 726                 // [a-$.]  error    literal '$'
 727                 // *We won't get here in the case of var "x"
 728                 {
 729                     chars.getPos(backup);
 730                     c = chars.next(opts, literal, ec);
 731                     if (U_FAILURE(ec)) return;
 732                     UBool anchor = (c == 0x5D /*']'*/ && !literal);
 733                     if (symbols == 0 && !anchor) {
 734                         c = SymbolTable::SYMBOL_REF;
 735                         chars.setPos(backup);
 736                         break; // literal '$'
 737                     }
 738                     if (anchor && op == 0) {
 739                         if (lastItem == 1) {
 740                             add(lastChar, lastChar);
 741                             _appendToPat(patLocal, lastChar, FALSE);
 742                         }
 743                         add(U_ETHER);
 744                         usePat = TRUE;
 745                         patLocal.append((UChar) SymbolTable::SYMBOL_REF);
 746                         patLocal.append((UChar) 0x5D /*']'*/);
 747                         mode = 2;
 748                         continue;
 749                     }
 750                     // syntaxError(chars, "Unquoted '$'");
 751                     ec = U_MALFORMED_SET;
 752                     return;
 753                 }
 754             default:
 755                 break;
 756             }
 757         }
 758
 759         // -------- Parse literal characters.  This includes both
 760         // escaped chars ("\u4E01") and non-syntax characters
 761         // ("a").
 762
 763         switch (lastItem) {
 764         case 0:
 765             lastItem = 1;
 766             lastChar = c;
 767             break;
 768         case 1:
 769             if (op == HYPHEN /*'-'*/) {
 770                 if (lastChar >= c) {
 771                     // Don't allow redundant (a-a) or empty (b-a) ranges;
 772                     // these are most likely typos.
 773                     // syntaxError(chars, "Invalid range");
 774                     ec = U_MALFORMED_SET;
 775                     return;
 776                 }
 777                 add(lastChar, c);
 778                 _appendToPat(patLocal, lastChar, FALSE);
 779                 patLocal.append(op);
 780                 _appendToPat(patLocal, c, FALSE);
 781                 lastItem = 0;
 782                 op = 0;
 783             } else {
 784                 add(lastChar, lastChar);
 785                 _appendToPat(patLocal, lastChar, FALSE);
 786                 lastChar = c;
 787             }
 788             break;
 789         case 2:
 790             if (op != 0) {
 791                 // syntaxError(chars, "Set expected after operator");
 792                 ec = U_MALFORMED_SET;
 793                 return;
 794             }
 795             lastChar = c;
 796             lastItem = 1;
 797             break;
 798         }
 799     }
 800
 801     if (mode != 2) {
 802         // syntaxError(chars, "Missing ']'");
 803         ec = U_MALFORMED_SET;
 804         return;
 805     }
 806
 807     chars.skipIgnored(opts);
 808
 809     /**
 810      * Handle global flags (invert, case insensitivity).  If this
 811      * pattern should be compiled case-insensitive, then we need
 812      * to close over case BEFORE COMPLEMENTING.  This makes
 813      * patterns like /[^abc]/i work.
 814      */
 815     if ((options & USET_CASE_INSENSITIVE) != 0) {
 816         (this->*caseClosure)(USET_CASE_INSENSITIVE);
 817     }
 818     else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
 819         (this->*caseClosure)(USET_ADD_CASE_MAPPINGS);
 820     }
 821     if (invert) {
 822         complement();
 823     }
 824
 825     // Use the rebuilt pattern (patLocal) only if necessary.  Prefer the
 826     // generated pattern.
 827     if (usePat) {
 828         rebuiltPat.append(patLocal);
 829     } else {
 830         _generatePattern(rebuiltPat, FALSE);
 831     }
 832     if (isBogus() && U_SUCCESS(ec)) {
 833         // We likely ran out of memory. AHHH!
 834         ec = U_MEMORY_ALLOCATION_ERROR;
 835     }
 836 }
 837
 838 //----------------------------------------------------------------
 839 // Property set implementation
 840 //----------------------------------------------------------------
 841
 842 static UBool numericValueFilter(UChar32 ch, void* context) {
 843     return u_getNumericValue(ch) == *(double*)context;
 844 }
 845
 846 static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
 847     int32_t value = *(int32_t*)context;
 848     return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
 849 }
 850
 851 static UBool versionFilter(UChar32 ch, void* context) {
 852     static const UVersionInfo none = { 0, 0, 0, 0 };
 853     UVersionInfo v;
 854     u_charAge(ch, v);
 855     UVersionInfo* version = (UVersionInfo*)context;
 856     return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
 857 }
 858
 859 typedef struct {
 860     UProperty prop;
 861     int32_t value;
 862 } IntPropertyContext;
 863
 864 static UBool intPropertyFilter(UChar32 ch, void* context) {
 865     IntPropertyContext* c = (IntPropertyContext*)context;
 866     return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
 867 }
 868
 869 static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
 870     return uscript_hasScript(ch, *(UScriptCode*)context);
 871 }
 872
 873 /**
 874  * Generic filter-based scanning code for UCD property UnicodeSets.
 875  */
 876 void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
 877                              void* context,
 878                              int32_t src,
 879                              UErrorCode &status) {
 880     if (U_FAILURE(status)) return;
 881
 882     // Logically, walk through all Unicode characters, noting the start
 883     // and end of each range for which filter.contain(c) is
 884     // true.  Add each range to a set.
 885     //
 886     // To improve performance, use an inclusions set which
 887     // encodes information about character ranges that are known
 888     // to have identical properties.
 889     // getInclusions(src) contains exactly the first characters of
 890     // same-value ranges for the given properties "source".
 891     const UnicodeSet* inclusions = getInclusions(src, status);
 892     if (U_FAILURE(status)) {
 893         return;
 894     }
 895
 896     clear();
 897
 898     UChar32 startHasProperty = -1;
 899     int32_t limitRange = inclusions->getRangeCount();
 900
 901     for (int j=0; j<limitRange; ++j) {
 902         // get current range
 903         UChar32 start = inclusions->getRangeStart(j);
 904         UChar32 end = inclusions->getRangeEnd(j);
 905
 906         // for all the code points in the range, process
 907         for (UChar32 ch = start; ch <= end; ++ch) {
 908             // only add to this UnicodeSet on inflection points --
 909             // where the hasProperty value changes to false
 910             if ((*filter)(ch, context)) {
 911                 if (startHasProperty < 0) {
 912                     startHasProperty = ch;
 913                 }
 914             } else if (startHasProperty >= 0) {
 915                 add(startHasProperty, ch-1);
 916                 startHasProperty = -1;
 917             }
 918         }
 919     }
 920     if (startHasProperty >= 0) {
 921         add((UChar32)startHasProperty, (UChar32)0x10FFFF);
 922     }
 923     if (isBogus() && U_SUCCESS(status)) {
 924         // We likely ran out of memory. AHHH!
 925         status = U_MEMORY_ALLOCATION_ERROR;
 926     }
 927 }
 928
 929 static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
 930     /* Note: we use ' ' in compiler code page */
 931     int32_t j = 0;
 932     char ch;
 933     --dstCapacity; /* make room for term. zero */
 934     while ((ch = *src++) != 0) {
 935         if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
 936             continue;
 937         }
 938         if (j >= dstCapacity) return FALSE;
 939         dst[j++] = ch;
 940     }
 941     if (j > 0 && dst[j-1] == ' ') --j;
 942     dst[j] = 0;
 943     return TRUE;
 944 }
 945
 946 //----------------------------------------------------------------
 947 // Property set API
 948 //----------------------------------------------------------------
 949
 950 #define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;}
 951
 952 UnicodeSet&
 953 UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
 954     if (U_FAILURE(ec) || isFrozen()) return *this;
 955
 956     if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
 957         applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec);
 958     } else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
 959         UScriptCode script = (UScriptCode)value;
 960         applyFilter(scriptExtensionsFilter, &script, UPROPS_SRC_PROPSVEC, ec);
 961     } else {
 962         IntPropertyContext c = {prop, value};
 963         applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec);
 964     }
 965     return *this;
 966 }
 967
 968 UnicodeSet&
 969 UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
 970                                const UnicodeString& value,
 971                                UErrorCode& ec) {
 972     if (U_FAILURE(ec) || isFrozen()) return *this;
 973
 974     // prop and value used to be converted to char * using the default
 975     // converter instead of the invariant conversion.
 976     // This should not be necessary because all Unicode property and value
 977     // names use only invariant characters.
 978     // If there are any variant characters, then we won't find them anyway.
 979     // Checking first avoids assertion failures in the conversion.
 980     if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
 981         !uprv_isInvariantUString(value.getBuffer(), value.length())
 982     ) {
 983         FAIL(ec);
 984     }
 985     CharString pname, vname;
 986     pname.appendInvariantChars(prop, ec);
 987     vname.appendInvariantChars(value, ec);
 988     if (U_FAILURE(ec)) return *this;
 989
 990     UProperty p;
 991     int32_t v;
 992     UBool mustNotBeEmpty = FALSE, invert = FALSE;
 993
 994     if (value.length() > 0) {
 995         p = u_getPropertyEnum(pname.data());
 996         if (p == UCHAR_INVALID_CODE) FAIL(ec);
 997
 998         // Treat gc as gcm
 999         if (p == UCHAR_GENERAL_CATEGORY) {
1000             p = UCHAR_GENERAL_CATEGORY_MASK;
1001         }
1002
1003         if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
1004             (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
1005             (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
1006             v = u_getPropertyValueEnum(p, vname.data());
1007             if (v == UCHAR_INVALID_CODE) {
1008                 // Handle numeric CCC
1009                 if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
1010                     p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
1011                     p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
1012                     char* end;
1013                     double value = uprv_strtod(vname.data(), &end);
1014                     v = (int32_t) value;
1015                     if (v != value || v < 0 || *end != 0) {
1016                         // non-integral or negative value, or trailing junk
1017                         FAIL(ec);
1018                     }
1019                     // If the resultant set is empty then the numeric value
1020                     // was invalid.
1021                     mustNotBeEmpty = TRUE;
1022                 } else {
1023                     FAIL(ec);
1024                 }
1025             }
1026         }
1027
1028         else {
1029
1030             switch (p) {
1031             case UCHAR_NUMERIC_VALUE:
1032                 {
1033                     char* end;
1034                     double value = uprv_strtod(vname.data(), &end);
1035                     if (*end != 0) {
1036                         FAIL(ec);
1037                     }
1038                     applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec);
1039                     return *this;
1040                 }
1041             case UCHAR_NAME:
1042                 {
1043                     // Must munge name, since u_charFromName() does not do
1044                     // 'loose' matching.
1045                     char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
1046                     if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
1047                     UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);
1048                     if (U_SUCCESS(ec)) {
1049                         clear();
1050                         add(ch);
1051                         return *this;
1052                     } else {
1053                         FAIL(ec);
1054                     }
1055                 }
1056             case UCHAR_UNICODE_1_NAME:
1057                 // ICU 49 deprecates the Unicode_1_Name property APIs.
1058                 FAIL(ec);
1059             case UCHAR_AGE:
1060                 {
1061                     // Must munge name, since u_versionFromString() does not do
1062                     // 'loose' matching.
1063                     char buf[128];
1064                     if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
1065                     UVersionInfo version;
1066                     u_versionFromString(version, buf);
1067                     applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec);
1068                     return *this;
1069                 }
1070             case UCHAR_SCRIPT_EXTENSIONS:
1071                 v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());
1072                 if (v == UCHAR_INVALID_CODE) {
1073                     FAIL(ec);
1074                 }
1075                 // fall through to calling applyIntPropertyValue()
1076                 break;
1077             default:
1078                 // p is a non-binary, non-enumerated property that we
1079                 // don't support (yet).
1080                 FAIL(ec);
1081             }
1082         }
1083     }
1084
1085     else {
1086         // value is empty.  Interpret as General Category, Script, or
1087         // Binary property.
1088         p = UCHAR_GENERAL_CATEGORY_MASK;
1089         v = u_getPropertyValueEnum(p, pname.data());
1090         if (v == UCHAR_INVALID_CODE) {
1091             p = UCHAR_SCRIPT;
1092             v = u_getPropertyValueEnum(p, pname.data());
1093             if (v == UCHAR_INVALID_CODE) {
1094                 p = u_getPropertyEnum(pname.data());
1095                 if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
1096                     v = 1;
1097                 } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) {
1098                     set(MIN_VALUE, MAX_VALUE);
1099                     return *this;
1100                 } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) {
1101                     set(0, 0x7F);
1102                     return *this;
1103                 } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) {
1104                     // [:Assigned:]=[:^Cn:]
1105                     p = UCHAR_GENERAL_CATEGORY_MASK;
1106                     v = U_GC_CN_MASK;
1107                     invert = TRUE;
1108                 } else {
1109                     FAIL(ec);
1110                 }
1111             }
1112         }
1113     }
1114
1115     applyIntPropertyValue(p, v, ec);
1116     if(invert) {
1117         complement();
1118     }
1119
1120     if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) {
1121         // mustNotBeEmpty is set to true if an empty set indicates
1122         // invalid input.
1123         ec = U_ILLEGAL_ARGUMENT_ERROR;
1124     }
1125
1126     if (isBogus() && U_SUCCESS(ec)) {
1127         // We likely ran out of memory. AHHH!
1128         ec = U_MEMORY_ALLOCATION_ERROR;
1129     }
1130     return *this;
1131 }
1132
1133 //----------------------------------------------------------------
1134 // Property set patterns
1135 //----------------------------------------------------------------
1136
1137 /**
1138  * Return true if the given position, in the given pattern, appears
1139  * to be the start of a property set pattern.
1140  */
1141 UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
1142                                            int32_t pos) {
1143     // Patterns are at least 5 characters long
1144     if ((pos+5) > pattern.length()) {
1145         return FALSE;
1146     }
1147
1148     // Look for an opening [:, [:^, \p, or \P
1149     return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
1150 }
1151
1152 /**
1153  * Return true if the given iterator appears to point at a
1154  * property pattern.  Regardless of the result, return with the
1155  * iterator unchanged.
1156  * @param chars iterator over the pattern characters.  Upon return
1157  * it will be unchanged.
1158  * @param iterOpts RuleCharacterIterator options
1159  */
1160 UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
1161                                            int32_t iterOpts) {
1162     // NOTE: literal will always be FALSE, because we don't parse escapes.
1163     UBool result = FALSE, literal;
1164     UErrorCode ec = U_ZERO_ERROR;
1165     iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
1166     RuleCharacterIterator::Pos pos;
1167     chars.getPos(pos);
1168     UChar32 c = chars.next(iterOpts, literal, ec);
1169     if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) {
1170         UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
1171                                literal, ec);
1172         result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) :
1173                  (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/);
1174     }
1175     chars.setPos(pos);
1176     return result && U_SUCCESS(ec);
1177 }
1178
1179 /**
1180  * Parse the given property pattern at the given parse position.
1181  */
1182 UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
1183                                              ParsePosition& ppos,
1184                                              UErrorCode &ec) {
1185     int32_t pos = ppos.getIndex();
1186
1187     UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
1188     UBool isName = FALSE; // true for \N{pat}, o/w false
1189     UBool invert = FALSE;
1190
1191     if (U_FAILURE(ec)) return *this;
1192
1193     // Minimum length is 5 characters, e.g. \p{L}
1194     if ((pos+5) > pattern.length()) {
1195         FAIL(ec);
1196     }
1197
1198     // On entry, ppos should point to one of the following locations:
1199     // Look for an opening [:, [:^, \p, or \P
1200     if (isPOSIXOpen(pattern, pos)) {
1201         posix = TRUE;
1202         pos += 2;
1203         pos = ICU_Utility::skipWhitespace(pattern, pos);
1204         if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) {
1205             ++pos;
1206             invert = TRUE;
1207         }
1208     } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {
1209         UChar c = pattern.charAt(pos+1);
1210         invert = (c == UPPER_P);
1211         isName = (c == UPPER_N);
1212         pos += 2;
1213         pos = ICU_Utility::skipWhitespace(pattern, pos);
1214         if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) {
1215             // Syntax error; "\p" or "\P" not followed by "{"
1216             FAIL(ec);
1217         }
1218     } else {
1219         // Open delimiter not seen
1220         FAIL(ec);
1221     }
1222
1223     // Look for the matching close delimiter, either :] or }
1224     int32_t close;
1225     if (posix) {
1226       close = pattern.indexOf(POSIX_CLOSE, 2, pos);
1227     } else {
1228       close = pattern.indexOf(CLOSE_BRACE, pos);
1229     }
1230     if (close < 0) {
1231         // Syntax error; close delimiter missing
1232         FAIL(ec);
1233     }
1234
1235     // Look for an '=' sign.  If this is present, we will parse a
1236     // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
1237     // pattern.
1238     int32_t equals = pattern.indexOf(EQUALS, pos);
1239     UnicodeString propName, valueName;
1240     if (equals >= 0 && equals < close && !isName) {
1241         // Equals seen; parse medium/long pattern
1242         pattern.extractBetween(pos, equals, propName);
1243         pattern.extractBetween(equals+1, close, valueName);
1244     }
1245
1246     else {
1247         // Handle case where no '=' is seen, and \N{}
1248         pattern.extractBetween(pos, close, propName);
1249
1250         // Handle \N{name}
1251         if (isName) {
1252             // This is a little inefficient since it means we have to
1253             // parse NAME_PROP back to UCHAR_NAME even though we already
1254             // know it's UCHAR_NAME.  If we refactor the API to
1255             // support args of (UProperty, char*) then we can remove
1256             // NAME_PROP and make this a little more efficient.
1257             valueName = propName;
1258             propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);
1259         }
1260     }
1261
1262     applyPropertyAlias(propName, valueName, ec);
1263
1264     if (U_SUCCESS(ec)) {
1265         if (invert) {
1266             complement();
1267         }
1268
1269         // Move to the limit position after the close delimiter if the
1270         // parse succeeded.
1271         ppos.setIndex(close + (posix ? 2 : 1));
1272     }
1273
1274     return *this;
1275 }
1276
1277 /**
1278  * Parse a property pattern.
1279  * @param chars iterator over the pattern characters.  Upon return
1280  * it will be advanced to the first character after the parsed
1281  * pattern, or the end of the iteration if all characters are
1282  * parsed.
1283  * @param rebuiltPat the pattern that was parsed, rebuilt or
1284  * copied from the input pattern, as appropriate.
1285  */
1286 void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
1287                                       UnicodeString& rebuiltPat,
1288                                       UErrorCode& ec) {
1289     if (U_FAILURE(ec)) return;
1290     UnicodeString pattern;
1291     chars.lookahead(pattern);
1292     ParsePosition pos(0);
1293     applyPropertyPattern(pattern, pos, ec);
1294     if (U_FAILURE(ec)) return;
1295     if (pos.getIndex() == 0) {
1296         // syntaxError(chars, "Invalid property pattern");
1297         ec = U_MALFORMED_SET;
1298         return;
1299     }
1300     chars.jumpahead(pos.getIndex());
1301     rebuiltPat.append(pattern, 0, pos.getIndex());
1302 }
1303
1304 U_NAMESPACE_END