icuSources/common/uniset_props.cpp

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 1999-2006, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  uniset_props.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2004aug25
  14 *   created by: Markus W. Scherer
  15 *
  16 *   Character property dependent functions moved here from uniset.cpp
  17 */
  18
  19 #include "unicode/utypes.h"
  20 #include "unicode/uniset.h"
  21 #include "unicode/parsepos.h"
  22 #include "unicode/uchar.h"
  23 #include "unicode/uscript.h"
  24 #include "unicode/symtable.h"
  25 #include "unicode/uset.h"
  26 #include "unicode/locid.h"
  27 #include "unicode/brkiter.h"
  28 #include "uset_imp.h"
  29 #include "ruleiter.h"
  30 #include "cmemory.h"
  31 #include "ucln_cmn.h"
  32 #include "util.h"
  33 #include "uvector.h"
  34 #include "uprops.h"
  35 #include "propname.h"
  36 #include "unormimp.h"
  37 #include "ucase.h"
  38 #include "ubidi_props.h"
  39 #include "uinvchar.h"
  40 #include "charstr.h"
  41 #include "cstring.h"
  42 #include "mutex.h"
  43 #include "uassert.h"
  44 #include "hash.h"
  45
  46 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
  47 // LENGTHOF(array) above: element count of a statically sized array, cast to int32_t.
  48 // initial storage. Must be >= 0
  49 // *** same as in uniset.cpp ! ***
     // The constructors in this file use it as the initial `capacity`, i.e.
     // the number of UChar32 slots allocated for `list`.
  50 #define START_EXTRA 16
  51
  52 // Define UChar constants using hex for EBCDIC compatibility
  53 // Used #define to reduce private static exports and memory access time.
     // Every macro is a UChar given by numeric value, never as a character
     // literal.  OPEN_BRACE, CLOSE_BRACE and UPPER_N are written in decimal
     // rather than hex; they are numeric values all the same.
  54 #define SET_OPEN        ((UChar)0x005B) /*[*/
  55 #define SET_CLOSE       ((UChar)0x005D) /*]*/
  56 #define HYPHEN          ((UChar)0x002D) /*-*/
  57 #define COMPLEMENT      ((UChar)0x005E) /*^*/
  58 #define COLON           ((UChar)0x003A) /*:*/
  59 #define BACKSLASH       ((UChar)0x005C) /*\*/
  60 #define INTERSECTION    ((UChar)0x0026) /*&*/
  61 #define UPPER_U         ((UChar)0x0055) /*U*/
  62 #define LOWER_U         ((UChar)0x0075) /*u*/
  63 #define OPEN_BRACE      ((UChar)123)    /*{*/
  64 #define CLOSE_BRACE     ((UChar)125)    /*}*/
  65 #define UPPER_P         ((UChar)0x0050) /*P*/
  66 #define LOWER_P         ((UChar)0x0070) /*p*/
  67 #define UPPER_N         ((UChar)78)     /*N*/
  68 #define EQUALS          ((UChar)0x003D) /*=*/
  69
     // Zero-terminated UChar strings for multi-character delimiters.  The
     // commented-out *_OPEN strings have matching helper functions further
     // down (isPOSIXOpen(), isPerlOpen(), isNameOpen()).
  70 //static const UChar POSIX_OPEN[]  = { SET_OPEN,COLON,0 };  // "[:"
  71 static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 };  // ":]"
  72 //static const UChar PERL_OPEN[]   = { BACKSLASH,LOWER_P,0 }; // "\\p"
  73 static const UChar PERL_CLOSE[]  = { CLOSE_BRACE,0 };    // "}"
  74 //static const UChar NAME_OPEN[]   = { BACKSLASH,UPPER_N,0 };  // "\\N"
     // Despite its name, HYPHEN_RIGHT_BRACE ends in SET_CLOSE (']'), not '}'.
  75 static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/
  76
  77 // Special property set IDs
  78 static const char ANY[]   = "ANY";   // [\u0000-\U0010FFFF]
  79 static const char ASCII[] = "ASCII"; // [\u0000-\u007F]
  80 static const char ASSIGNED[] = "Assigned"; // [:^Cn:]
  81
  82 // Unicode name property alias
     // NAME_PROP_LENGTH is the length of NAME_PROP, not counting the
     // terminating NUL; keep the two in step.
  83 #define NAME_PROP "na"
  84 #define NAME_PROP_LENGTH 2
  85
  86 /**
  87  * Delimiter string used in patterns to close a category reference:
  88  * ":]".  Example: "[:Lu:]".
      * The definition below is commented out; POSIX_CLOSE, defined earlier
      * in this file, holds the same three code units.
  89  */
  90 //static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */
  91
  92 U_NAMESPACE_BEGIN
  93
  94 static UnicodeSet *INCLUSIONS[UPROPS_SRC_COUNT] = { NULL }; // cached getInclusions()
     // The array has UPROPS_SRC_COUNT slots, all initially NULL.
     // NOTE(review): presumably each slot is filled lazily by getInclusions()
     // for one property data source; that function is not visible in this
     // part of the file -- confirm.
  95
  96 // helper functions for matching of pattern syntax pieces ------------------ ***
  97 // these functions are parallel to the PERL_OPEN etc. strings above
  98
  99 // using these functions is not only faster than UnicodeString::compare() and
 100 // caseCompare(), but they also make UnicodeSet work for simple patterns when
 101 // no Unicode properties data is available - when caseCompare() fails
 102
 103 static inline UBool
 104 isPerlOpen(const UnicodeString &pattern, int32_t pos) {
 105     UChar c;
 106     return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P);
 107 }
 108
 109 /*static inline UBool
 110 isPerlClose(const UnicodeString &pattern, int32_t pos) {
 111     return pattern.charAt(pos)==CLOSE_BRACE;
 112 }*/
 113
 114 static inline UBool
 115 isNameOpen(const UnicodeString &pattern, int32_t pos) {
 116     return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N;
 117 }
 118
 119 static inline UBool
 120 isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
 121     return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON;
 122 }
 123
 124 /*static inline UBool
 125 isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
 126     return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE;
 127 }*/
 128
 129 // TODO memory debugging provided inside uniset.cpp
 130 // could be made available here but probably obsolete with use of modern
 131 // memory leak checker tools
     // In this file _dbgct() expands to nothing, so the _dbgct(this) calls in
     // the constructors below are empty statements.
 132 #define _dbgct(me)
 133
 134 //----------------------------------------------------------------
 135 // Constructors &c
 136 //----------------------------------------------------------------
 137
 138 /**
 139  * Constructs a set from the given pattern, optionally ignoring
 140  * white space.  See the class description for the syntax of the
 141  * pattern language.
 142  * @param pattern a string specifying what characters are in the set
 143  */
 144 UnicodeSet::UnicodeSet(const UnicodeString& pattern,
 145                        UErrorCode& status) :
 146     len(0), capacity(START_EXTRA), bufferCapacity(0),
 147     list(0), buffer(0), strings(0)
 148 {
 149     if(U_SUCCESS(status)){
 150         list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 151         /* test for NULL */
 152         if(list == NULL) {
 153             status = U_MEMORY_ALLOCATION_ERROR;
 154         }else{
 155             allocateStrings();
 156             applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
 157         }
 158     }
 159     _dbgct(this);
 160 }
 161
 162 /**
 163  * Constructs a set from the given pattern, optionally ignoring
 164  * white space.  See the class description for the syntax of the
 165  * pattern language.
 166  * @param pattern a string specifying what characters are in the set
 167  * @param options bitmask for options to apply to the pattern.
 168  * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
 169  */
 170 UnicodeSet::UnicodeSet(const UnicodeString& pattern,
 171                        uint32_t options,
 172                        const SymbolTable* symbols,
 173                        UErrorCode& status) :
 174     len(0), capacity(START_EXTRA), bufferCapacity(0),
 175     list(0), buffer(0), strings(0)
 176 {
 177     if(U_SUCCESS(status)){
 178         list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 179         /* test for NULL */
 180         if(list == NULL) {
 181             status = U_MEMORY_ALLOCATION_ERROR;
 182         }else{
 183             allocateStrings();
 184             applyPattern(pattern, options, symbols, status);
 185         }
 186     }
 187     _dbgct(this);
 188 }
 189
 190 UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
 191                        uint32_t options,
 192                        const SymbolTable* symbols,
 193                        UErrorCode& status) :
 194     len(0), capacity(START_EXTRA), bufferCapacity(0),
 195     list(0), buffer(0), strings(0)
 196 {
 197     if(U_SUCCESS(status)){
 198         list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
 199         /* test for NULL */
 200         if(list == NULL) {
 201             status = U_MEMORY_ALLOCATION_ERROR;
 202         }else{
 203             allocateStrings();
 204             applyPattern(pattern, pos, options, symbols, status);
 205         }
 206     }
 207     _dbgct(this);
 208 }
 209
 210 //----------------------------------------------------------------
 211 // Public API
 212 //----------------------------------------------------------------
 213
 214 /**
 215  * Modifies this set to represent the set specified by the given
 216  * pattern, optionally ignoring white space.  See the class
 217  * description for the syntax of the pattern language.
 218  * @param pattern a string specifying what characters are in the set
 219  * @param ignoreSpaces if <code>true</code>, all spaces in the
 220  * pattern are ignored.  Spaces are those characters for which
 221  * <code>uprv_isRuleWhiteSpace()</code> is <code>true</code>.
 222  * Characters preceded by '\\' are escaped, losing any special
 223  * meaning they otherwise have.  Spaces may be included by
 224  * escaping them.
 225  * @exception <code>IllegalArgumentException</code> if the pattern
 226  * contains a syntax error.
 227  */
 228 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
 229                                      UErrorCode& status) {
 230     return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
 231 }
 232
 233
 234 /**
 235  * Modifies this set to represent the set specified by the given
 236  * pattern, optionally ignoring white space.  See the class
 237  * description for the syntax of the pattern language.
 238  * @param pattern a string specifying what characters are in the set
 239  * @param options bitmask for options to apply to the pattern.
 240  * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
 241  */
 242 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
 243                                      uint32_t options,
 244                                      const SymbolTable* symbols,
 245                                      UErrorCode& status) {
 246     if (U_FAILURE(status)) {
 247         return *this;
 248     }
 249
 250     ParsePosition pos(0);
 251     applyPattern(pattern, pos, options, symbols, status);
 252     if (U_FAILURE(status)) return *this;
 253
 254     int32_t i = pos.getIndex();
 255
 256     if (options & USET_IGNORE_SPACE) {
 257         // Skip over trailing whitespace
 258         ICU_Utility::skipWhitespace(pattern, i, TRUE);
 259     }
 260
 261     if (i != pattern.length()) {
 262         status = U_ILLEGAL_ARGUMENT_ERROR;
 263     }
 264     return *this;
 265 }
 266
 267 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
 268                               ParsePosition& pos,
 269                               uint32_t options,
 270                               const SymbolTable* symbols,
 271                               UErrorCode& status) {
 272     if (U_FAILURE(status)) {
 273         return *this;
 274     }
 275     // Need to build the pattern in a temporary string because
 276     // _applyPattern calls add() etc., which set pat to empty.
 277     UnicodeString rebuiltPat;
 278     RuleCharacterIterator chars(pattern, symbols, pos);
 279     applyPattern(chars, symbols, rebuiltPat, options, status);
 280     if (U_FAILURE(status)) return *this;
 281     if (chars.inVariable()) {
 282         // syntaxError(chars, "Extra chars in variable value");
 283         status = U_MALFORMED_SET;
 284         return *this;
 285     }
 286     pat = rebuiltPat;
 287     return *this;
 288 }
 289
 290 /**
 291  * Return true if the given position, in the given pattern, appears
 292  * to be the start of a UnicodeSet pattern.
 293  */
 294 UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
 295     return ((pos+1) < pattern.length() &&
 296             pattern.charAt(pos) == (UChar)91/*[*/) ||
 297         resemblesPropertyPattern(pattern, pos);
 298 }
 299
 300 //----------------------------------------------------------------
 301 // Implementation: Pattern parsing
 302 //----------------------------------------------------------------
 303
 /**
  * A small all-inline class to manage a UnicodeSet pointer.  Add
  * operator->() etc. as needed.
  *
  * The object owns the set it allocates and deletes it in its
  * destructor.  Copying is therefore disabled: two copies would both
  * delete the same set.
  */
 class UnicodeSetPointer {
     UnicodeSet* p;

     // Not copyable (declared private and intentionally left undefined).
     UnicodeSetPointer(const UnicodeSetPointer&);
     UnicodeSetPointer& operator=(const UnicodeSetPointer&);
 public:
     inline UnicodeSetPointer() : p(0) {}
     inline ~UnicodeSetPointer() { delete p; }

     /** Returns the managed set, or 0 if allocate() has not succeeded yet. */
     inline UnicodeSet* pointer() { return p; }

     /**
      * Allocates the set on first use; later calls keep the existing set.
      * @return TRUE if a set is available after the call
      */
     inline UBool allocate() {
         if (p == 0) {
             p = new UnicodeSet();
         }
         return p != 0;
     }
 };
 321
 322 /**
 323  * Parse the pattern from the given RuleCharacterIterator.  The
 324  * iterator is advanced over the parsed pattern.
      *
      * The set is cleared before parsing begins.  The error paths in this
      * function return immediately, so on failure the set keeps whatever
      * had been added up to that point.
 325  * @param chars iterator over the pattern characters.  Upon return
 326  * it will be advanced to the first character after the parsed
 327  * pattern, or the end of the iteration if all characters are
 328  * parsed.
 329  * @param symbols symbol table to use to parse and dereference
 330  * variables, or null if none.
 331  * @param rebuiltPat the pattern that was parsed, rebuilt or
 332  * copied from the input pattern, as appropriate.
 333  * @param options a bit mask of zero or more of the following:
 334  * USET_IGNORE_SPACE, USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS.
      * @param ec in/out error code.  Nothing is done if it already
      * indicates a failure.  Set to U_MALFORMED_SET for syntax errors and
      * to U_MEMORY_ALLOCATION_ERROR if the scratch set for a nested
      * pattern cannot be allocated; errors reported by the iterator or by
      * nested parsing are passed through.
 335  */
 336 void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
 337                               const SymbolTable* symbols,
 338                               UnicodeString& rebuiltPat,
 339                               uint32_t options,
 340                               UErrorCode& ec) {
 341     if (U_FAILURE(ec)) return;
 342
 343     // Syntax characters: [ ] ^ - & { }
 344
 345     // Recognized special forms for chars, sets: c-c s-s s&s
 346
 347     int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
 348                    RuleCharacterIterator::PARSE_ESCAPES;
 349     if ((options & USET_IGNORE_SPACE) != 0) {
 350         opts |= RuleCharacterIterator::SKIP_WHITESPACE;
 351     }
 352
         // patLocal collects the pattern text as it is parsed; buf holds the
         // contents of a {multi-character string}.  usePat is switched on
         // when a nested set or a '$' anchor is seen.  scratch owns a lazily
         // allocated set for nested patterns, and backup is a saved iterator
         // position used for look-ahead.
 353     UnicodeString patLocal, buf;
 354     UBool usePat = FALSE;
 355     UnicodeSetPointer scratch;
 356     RuleCharacterIterator::Pos backup;
 357
 358     // mode: 0=before [, 1=between [...], 2=after ]
 359     // lastItem: 0=none, 1=char, 2=set
 360     int8_t lastItem = 0, mode = 0;
 361     UChar32 lastChar = 0;
 362     UChar op = 0;
 363
 364     UBool invert = FALSE;
 365
 366     clear();
 367
 368     while (mode != 2 && !chars.atEnd()) {
 369         U_ASSERT((lastItem == 0 && op == 0) ||
 370                  (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) ||
 371                  (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ ||
 372                                     op == INTERSECTION /*'&'*/)));
 373
 374         UChar32 c = 0;
 375         UBool literal = FALSE;
 376         UnicodeSet* nested = 0; // alias - do not delete
 377
 378         // -------- Check for property pattern
 379
 380         // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
 381         int8_t setMode = 0;
 382         if (resemblesPropertyPattern(chars, opts)) {
 383             setMode = 2;
 384         }
 385
 386         // -------- Parse '[' of opening delimiter OR nested set.
 387         // If there is a nested set, use `setMode' to define how
 388         // the set should be parsed.  If the '[' is part of the
 389         // opening delimiter for this pattern, parse special
 390         // strings "[", "[^", "[-", and "[^-".  Check for stand-in
 391         // characters representing a nested set in the symbol
 392         // table.
 393
 394         else {
 395             // Prepare to backup if necessary
 396             chars.getPos(backup);
 397             c = chars.next(opts, literal, ec);
 398             if (U_FAILURE(ec)) return;
 399
 400             if (c == 0x5B /*'['*/ && !literal) {
 401                 if (mode == 1) {
 402                     chars.setPos(backup); // backup
 403                     setMode = 1;
 404                 } else {
 405                     // Handle opening '[' delimiter
 406                     mode = 1;
 407                     patLocal.append((UChar) 0x5B /*'['*/);
 408                     chars.getPos(backup); // prepare to backup
 409                     c = chars.next(opts, literal, ec);
 410                     if (U_FAILURE(ec)) return;
 411                     if (c == 0x5E /*'^'*/ && !literal) {
 412                         invert = TRUE;
 413                         patLocal.append((UChar) 0x5E /*'^'*/);
 414                         chars.getPos(backup); // prepare to backup
 415                         c = chars.next(opts, literal, ec);
 416                         if (U_FAILURE(ec)) return;
 417                     }
 418                     // Fall through to handle special leading '-';
 419                     // otherwise restart loop for nested [], \p{}, etc.
 420                     if (c == HYPHEN /*'-'*/) {
 421                         literal = TRUE;
 422                         // Fall through to handle literal '-' below
 423                     } else {
 424                         chars.setPos(backup); // backup
 425                         continue;
 426                     }
 427                 }
 428             } else if (symbols != 0) {
 429                 const UnicodeFunctor *m = symbols->lookupMatcher(c);
 430                 if (m != 0) {
 431                     if (m->getDynamicClassID() != UnicodeSet::getStaticClassID()) {
 432                         ec = U_MALFORMED_SET;
 433                         return;
 434                     }
 435                     // casting away const, but `nested' won't be modified
 436                     // (important not to modify stored set)
 437                     nested = (UnicodeSet*) m;
 438                     setMode = 3;
 439                 }
 440             }
 441         }
 442
 443         // -------- Handle a nested set.  This either is inline in
 444         // the pattern or represented by a stand-in that has
 445         // previously been parsed and was looked up in the symbol
 446         // table.
 447
 448         if (setMode != 0) {
 449             if (lastItem == 1) {
 450                 if (op != 0) {
 451                     // syntaxError(chars, "Char expected after operator");
 452                     ec = U_MALFORMED_SET;
 453                     return;
 454                 }
 455                 add(lastChar, lastChar);
 456                 _appendToPat(patLocal, lastChar, FALSE);
 457                 lastItem = 0;
 458                 op = 0;
 459             }
 460
 461             if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) {
 462                 patLocal.append(op);
 463             }
 464
 465             if (nested == 0) {
 466                 // lazy allocation
 467                 if (!scratch.allocate()) {
 468                     ec = U_MEMORY_ALLOCATION_ERROR;
 469                     return;
 470                 }
 471                 nested = scratch.pointer();
 472             }
 473             switch (setMode) {
 474             case 1:
                     // NOTE(review): unlike case 2, ec is not checked right
                     // after this recursive call.
 475                 nested->applyPattern(chars, symbols, patLocal, options, ec);
 476                 break;
 477             case 2:
 478                 chars.skipIgnored(opts);
 479                 nested->applyPropertyPattern(chars, patLocal, ec);
 480                 if (U_FAILURE(ec)) return;
 481                 break;
 482             case 3: // `nested' already parsed
 483                 nested->_toPattern(patLocal, FALSE);
 484                 break;
 485             }
 486
 487             usePat = TRUE;
 488
 489             if (mode == 0) {
 490                 // Entire pattern is a category; leave parse loop
 491                 *this = *nested;
 492                 mode = 2;
 493                 break;
 494             }
 495
                 // Combine the nested set with this set according to the
                 // operator that preceded it (none means union).
 496             switch (op) {
 497             case HYPHEN: /*'-'*/
 498                 removeAll(*nested);
 499                 break;
 500             case INTERSECTION: /*'&'*/
 501                 retainAll(*nested);
 502                 break;
 503             case 0:
 504                 addAll(*nested);
 505                 break;
 506             }
 507
 508             op = 0;
 509             lastItem = 2;
 510
 511             continue;
 512         }
 513
 514         if (mode == 0) {
 515             // syntaxError(chars, "Missing '['");
 516             ec = U_MALFORMED_SET;
 517             return;
 518         }
 519
 520         // -------- Parse special (syntax) characters.  If the
 521         // current character is not special, or if it is escaped,
 522         // then fall through and handle it below.
 523
 524         if (!literal) {
 525             switch (c) {
 526             case 0x5D /*']'*/:
 527                 if (lastItem == 1) {
 528                     add(lastChar, lastChar);
 529                     _appendToPat(patLocal, lastChar, FALSE);
 530                 }
 531                 // Treat final trailing '-' as a literal
 532                 if (op == HYPHEN /*'-'*/) {
 533                     add(op, op);
 534                     patLocal.append(op);
 535                 } else if (op == INTERSECTION /*'&'*/) {
 536                     // syntaxError(chars, "Trailing '&'");
 537                     ec = U_MALFORMED_SET;
 538                     return;
 539                 }
 540                 patLocal.append((UChar) 0x5D /*']'*/);
 541                 mode = 2;
 542                 continue;
 543             case HYPHEN /*'-'*/:
 544                 if (op == 0) {
 545                     if (lastItem != 0) {
 546                         op = (UChar) c;
 547                         continue;
 548                     } else {
 549                         // Treat final trailing '-' as a literal
 550                         add(c, c);
 551                         c = chars.next(opts, literal, ec);
 552                         if (U_FAILURE(ec)) return;
 553                         if (c == 0x5D /*']'*/ && !literal) {
 554                             patLocal.append(HYPHEN_RIGHT_BRACE);
 555                             mode = 2;
 556                             continue;
 557                         }
 558                     }
 559                 }
 560                 // syntaxError(chars, "'-' not after char or set");
 561                 ec = U_MALFORMED_SET;
 562                 return;
 563             case INTERSECTION /*'&'*/:
 564                 if (lastItem == 2 && op == 0) {
 565                     op = (UChar) c;
 566                     continue;
 567                 }
 568                 // syntaxError(chars, "'&' not after set");
 569                 ec = U_MALFORMED_SET;
 570                 return;
 571             case 0x5E /*'^'*/:
 572                 // syntaxError(chars, "'^' not after '['");
 573                 ec = U_MALFORMED_SET;
 574                 return;
 575             case 0x7B /*'{'*/:
 576                 if (op != 0) {
 577                     // syntaxError(chars, "Missing operand after operator");
 578                     ec = U_MALFORMED_SET;
 579                     return;
 580                 }
 581                 if (lastItem == 1) {
 582                     add(lastChar, lastChar);
 583                     _appendToPat(patLocal, lastChar, FALSE);
 584                 }
 585                 lastItem = 0;
 586                 buf.truncate(0);
 587                 {
                         // Collect everything up to the next unescaped '}'.
 588                     UBool ok = FALSE;
 589                     while (!chars.atEnd()) {
 590                         c = chars.next(opts, literal, ec);
 591                         if (U_FAILURE(ec)) return;
 592                         if (c == 0x7D /*'}'*/ && !literal) {
 593                             ok = TRUE;
 594                             break;
 595                         }
 596                         buf.append(c);
 597                     }
 598                     if (buf.length() < 1 || !ok) {
 599                         // syntaxError(chars, "Invalid multicharacter string");
 600                         ec = U_MALFORMED_SET;
 601                         return;
 602                     }
 603                 }
 604                 // We have new string. Add it to set and continue;
 605                 // we don't need to drop through to the further
 606                 // processing
 607                 add(buf);
 608                 patLocal.append((UChar) 0x7B /*'{'*/);
 609                 _appendToPat(patLocal, buf, FALSE);
 610                 patLocal.append((UChar) 0x7D /*'}'*/);
 611                 continue;
 612             case SymbolTable::SYMBOL_REF:
 613                 //         symbols  nosymbols
 614                 // [a-$]   error    error (ambiguous)
 615                 // [a$]    anchor   anchor
 616                 // [a-$x]  var "x"* literal '$'
 617                 // [a-$.]  error    literal '$'
 618                 // *We won't get here in the case of var "x"
 619                 {
 620                     chars.getPos(backup);
 621                     c = chars.next(opts, literal, ec);
 622                     if (U_FAILURE(ec)) return;
 623                     UBool anchor = (c == 0x5D /*']'*/ && !literal);
 624                     if (symbols == 0 && !anchor) {
 625                         c = SymbolTable::SYMBOL_REF;
 626                         chars.setPos(backup);
 627                         break; // literal '$'
 628                     }
 629                     if (anchor && op == 0) {
 630                         if (lastItem == 1) {
 631                             add(lastChar, lastChar);
 632                             _appendToPat(patLocal, lastChar, FALSE);
 633                         }
                             // An unescaped '$' directly before the closing ']'
                             // is treated as an anchor and recorded as U_ETHER.
 634                         add(U_ETHER);
 635                         usePat = TRUE;
 636                         patLocal.append((UChar) SymbolTable::SYMBOL_REF);
 637                         patLocal.append((UChar) 0x5D /*']'*/);
 638                         mode = 2;
 639                         continue;
 640                     }
 641                     // syntaxError(chars, "Unquoted '$'");
 642                     ec = U_MALFORMED_SET;
 643                     return;
 644                 }
 645             default:
 646                 break;
 647             }
 648         }
 649
 650         // -------- Parse literal characters.  This includes both
 651         // escaped chars ("\u4E01") and non-syntax characters
 652         // ("a").
 653
 654         switch (lastItem) {
 655         case 0:
 656             lastItem = 1;
 657             lastChar = c;
 658             break;
 659         case 1:
 660             if (op == HYPHEN /*'-'*/) {
 661                 if (lastChar >= c) {
 662                     // Don't allow redundant (a-a) or empty (b-a) ranges;
 663                     // these are most likely typos.
 664                     // syntaxError(chars, "Invalid range");
 665                     ec = U_MALFORMED_SET;
 666                     return;
 667                 }
 668                 add(lastChar, c);
 669                 _appendToPat(patLocal, lastChar, FALSE);
 670                 patLocal.append(op);
 671                 _appendToPat(patLocal, c, FALSE);
 672                 lastItem = 0;
 673                 op = 0;
 674             } else {
 675                 add(lastChar, lastChar);
 676                 _appendToPat(patLocal, lastChar, FALSE);
 677                 lastChar = c;
 678             }
 679             break;
 680         case 2:
 681             if (op != 0) {
 682                 // syntaxError(chars, "Set expected after operator");
 683                 ec = U_MALFORMED_SET;
 684                 return;
 685             }
 686             lastChar = c;
 687             lastItem = 1;
 688             break;
 689         }
 690     }
 691
 692     if (mode != 2) {
 693         // syntaxError(chars, "Missing ']'");
 694         ec = U_MALFORMED_SET;
 695         return;
 696     }
 697
 698     chars.skipIgnored(opts);
 699
 700     /**
 701      * Handle global flags (invert, case insensitivity).  If this
 702      * pattern should be compiled case-insensitive, then we need
 703      * to close over case BEFORE COMPLEMENTING.  This makes
 704      * patterns like /[^abc]/i work.
 705      */
 706     if ((options & USET_CASE_INSENSITIVE) != 0) {
 707         closeOver(USET_CASE_INSENSITIVE);
 708     }
 709     else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
 710         closeOver(USET_ADD_CASE_MAPPINGS);
 711     }
 712     if (invert) {
 713         complement();
 714     }
 715
 716     // Use the rebuilt pattern (patLocal) only if necessary.  Prefer the
 717     // generated pattern.
 718     if (usePat) {
 719         rebuiltPat.append(patLocal);
 720     } else {
 721         _generatePattern(rebuiltPat, FALSE);
 722     }
 723 }
 724
 725 //----------------------------------------------------------------
 726 // Property set implementation
 727 //----------------------------------------------------------------
 728
 729 static UBool numericValueFilter(UChar32 ch, void* context) {
 730     return u_getNumericValue(ch) == *(double*)context;
 731 }
 732
 733 static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
 734     int32_t value = *(int32_t*)context;
 735     return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
 736 }
 737
 738 static UBool versionFilter(UChar32 ch, void* context) {
 739     UVersionInfo v, none = { 0, 0, 0, 0};
 740     UVersionInfo* version = (UVersionInfo*)context;
 741     u_charAge(ch, v);
 742     return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
 743 }
 744
 745 typedef struct {
 746     UProperty prop;
 747     int32_t value;
 748 } IntPropertyContext;
 749
 750 static UBool intPropertyFilter(UChar32 ch, void* context) {
 751     IntPropertyContext* c = (IntPropertyContext*)context;
 752     return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
 753 }
 754
 755
 756 /**
 757  * Generic filter-based scanning code for UCD property UnicodeSets.
 758  */
 759 void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
 760                              void* context,
 761                              int32_t src,
 762                              UErrorCode &status) {
 763     // Walk through all Unicode characters, noting the start
 764     // and end of each range for which filter.contain(c) is
 765     // true.  Add each range to a set.
 766     //
 767     // To improve performance, use the INCLUSIONS set, which
 768     // encodes information about character ranges that are known
 769     // to have identical properties. INCLUSIONS contains
 770     // only the first characters of such ranges.
 771     //
 772     // TODO Where possible, instead of scanning over code points,
 773     // use internal property data to initialize UnicodeSets for
 774     // those properties.  Scanning code points is slow.
 775     if (U_FAILURE(status)) return;
 776
 777     const UnicodeSet* inclusions = getInclusions(src, status);
 778     if (U_FAILURE(status)) {
 779         return;
 780     }
 781
 782     clear();
 783
 784     UChar32 startHasProperty = -1;
 785     int limitRange = inclusions->getRangeCount();
 786
 787     for (int j=0; j<limitRange; ++j) {
 788         // get current range
 789         UChar32 start = inclusions->getRangeStart(j);
 790         UChar32 end = inclusions->getRangeEnd(j);
 791
 792         // for all the code points in the range, process
 793         for (UChar32 ch = start; ch <= end; ++ch) {
 794             // only add to this UnicodeSet on inflection points --
 795             // where the hasProperty value changes to false
 796             if ((*filter)(ch, context)) {
 797                 if (startHasProperty < 0) {
 798                     startHasProperty = ch;
 799                 }
 800             } else if (startHasProperty >= 0) {
 801                 add(startHasProperty, ch-1);
 802                 startHasProperty = -1;
 803             }
 804         }
 805     }
 806     if (startHasProperty >= 0) {
 807         add((UChar32)startHasProperty, (UChar32)0x10FFFF);
 808     }
 809 }
 810
 811 static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
 812     /* Note: we use ' ' in compiler code page */
 813     int32_t j = 0;
 814     char ch;
 815     --dstCapacity; /* make room for term. zero */
 816     while ((ch = *src++) != 0) {
 817         if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
 818             continue;
 819         }
 820         if (j >= dstCapacity) return FALSE;
 821         dst[j++] = ch;
 822     }
 823     if (j > 0 && dst[j-1] == ' ') --j;
 824     dst[j] = 0;
 825     return TRUE;
 826 }
 827
 828 //----------------------------------------------------------------
 829 // Property set API
 830 //----------------------------------------------------------------
 831
// Error exit for the UnicodeSet member functions below: stores
// U_ILLEGAL_ARGUMENT_ERROR in ec and returns *this from the enclosing
// function.  Because it expands to a return statement, it is only
// usable inside member functions that return a UnicodeSet reference.
#define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;}
 833
 834 UnicodeSet&
 835 UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
 836     if (U_FAILURE(ec)) return *this;
 837
 838     if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
 839         applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec);
 840     } else {
 841         IntPropertyContext c = {prop, value};
 842         applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec);
 843     }
 844     return *this;
 845 }
 846
 847 UnicodeSet&
 848 UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
 849                                const UnicodeString& value,
 850                                UErrorCode& ec) {
 851     if (U_FAILURE(ec)) return *this;
 852
 853     // prop and value used to be converted to char * using the default
 854     // converter instead of the invariant conversion.
 855     // This should not be necessary because all Unicode property and value
 856     // names use only invariant characters.
 857     // If there are any variant characters, then we won't find them anyway.
 858     // Checking first avoids assertion failures in the conversion.
 859     if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
 860         !uprv_isInvariantUString(value.getBuffer(), value.length())
 861     ) {
 862         FAIL(ec);
 863     }
 864     CharString pname(prop);
 865     CharString vname(value);
 866
 867     UProperty p;
 868     int32_t v;
 869     UBool mustNotBeEmpty = FALSE, invert = FALSE;
 870
 871     if (value.length() > 0) {
 872         p = u_getPropertyEnum(pname);
 873         if (p == UCHAR_INVALID_CODE) FAIL(ec);
 874
 875         // Treat gc as gcm
 876         if (p == UCHAR_GENERAL_CATEGORY) {
 877             p = UCHAR_GENERAL_CATEGORY_MASK;
 878         }
 879
 880         if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
 881             (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
 882             (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
 883             v = u_getPropertyValueEnum(p, vname);
 884             if (v == UCHAR_INVALID_CODE) {
 885                 // Handle numeric CCC
 886                 if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
 887                     p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
 888                     p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
 889                     char* end;
 890                     double value = uprv_strtod(vname, &end);
 891                     v = (int32_t) value;
 892                     if (v != value || v < 0 || *end != 0) {
 893                         // non-integral or negative value, or trailing junk
 894                         FAIL(ec);
 895                     }
 896                     // If the resultant set is empty then the numeric value
 897                     // was invalid.
 898                     mustNotBeEmpty = TRUE;
 899                 } else {
 900                     FAIL(ec);
 901                 }
 902             }
 903         }
 904
 905         else {
 906
 907             switch (p) {
 908             case UCHAR_NUMERIC_VALUE:
 909                 {
 910                     char* end;
 911                     double value = uprv_strtod(vname, &end);
 912                     if (*end != 0) {
 913                         FAIL(ec);
 914                     }
 915                     applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec);
 916                     return *this;
 917                 }
 918                 break;
 919             case UCHAR_NAME:
 920             case UCHAR_UNICODE_1_NAME:
 921                 {
 922                     // Must munge name, since u_charFromName() does not do
 923                     // 'loose' matching.
 924                     char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
 925                     if (!mungeCharName(buf, vname, sizeof(buf))) FAIL(ec);
 926                     UCharNameChoice choice = (p == UCHAR_NAME) ?
 927                         U_EXTENDED_CHAR_NAME : U_UNICODE_10_CHAR_NAME;
 928                     UChar32 ch = u_charFromName(choice, buf, &ec);
 929                     if (U_SUCCESS(ec)) {
 930                         clear();
 931                         add(ch);
 932                         return *this;
 933                     } else {
 934                         FAIL(ec);
 935                     }
 936                 }
 937                 break;
 938             case UCHAR_AGE:
 939                 {
 940                     // Must munge name, since u_versionFromString() does not do
 941                     // 'loose' matching.
 942                     char buf[128];
 943                     if (!mungeCharName(buf, vname, sizeof(buf))) FAIL(ec);
 944                     UVersionInfo version;
 945                     u_versionFromString(version, buf);
 946                     applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec);
 947                     return *this;
 948                 }
 949                 break;
 950             default:
 951                 // p is a non-binary, non-enumerated property that we
 952                 // don't support (yet).
 953                 FAIL(ec);
 954             }
 955         }
 956     }
 957
 958     else {
 959         // value is empty.  Interpret as General Category, Script, or
 960         // Binary property.
 961         p = UCHAR_GENERAL_CATEGORY_MASK;
 962         v = u_getPropertyValueEnum(p, pname);
 963         if (v == UCHAR_INVALID_CODE) {
 964             p = UCHAR_SCRIPT;
 965             v = u_getPropertyValueEnum(p, pname);
 966             if (v == UCHAR_INVALID_CODE) {
 967                 p = u_getPropertyEnum(pname);
 968                 if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
 969                     v = 1;
 970                 } else if (0 == uprv_comparePropertyNames(ANY, pname)) {
 971                     set(MIN_VALUE, MAX_VALUE);
 972                     return *this;
 973                 } else if (0 == uprv_comparePropertyNames(ASCII, pname)) {
 974                     set(0, 0x7F);
 975                     return *this;
 976                 } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname)) {
 977                     // [:Assigned:]=[:^Cn:]
 978                     p = UCHAR_GENERAL_CATEGORY_MASK;
 979                     v = U_GC_CN_MASK;
 980                     invert = TRUE;
 981                 } else {
 982                     FAIL(ec);
 983                 }
 984             }
 985         }
 986     }
 987
 988     applyIntPropertyValue(p, v, ec);
 989     if(invert) {
 990         complement();
 991     }
 992
 993     if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) {
 994         // mustNotBeEmpty is set to true if an empty set indicates
 995         // invalid input.
 996         ec = U_ILLEGAL_ARGUMENT_ERROR;
 997     }
 998
 999     return *this;
1000 }
1001
1002 //----------------------------------------------------------------
1003 // Property set patterns
1004 //----------------------------------------------------------------
1005
1006 /**
1007  * Return true if the given position, in the given pattern, appears
1008  * to be the start of a property set pattern.
1009  */
1010 UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
1011                                            int32_t pos) {
1012     // Patterns are at least 5 characters long
1013     if ((pos+5) > pattern.length()) {
1014         return FALSE;
1015     }
1016
1017     // Look for an opening [:, [:^, \p, or \P
1018     return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
1019 }
1020
1021 /**
1022  * Return true if the given iterator appears to point at a
1023  * property pattern.  Regardless of the result, return with the
1024  * iterator unchanged.
1025  * @param chars iterator over the pattern characters.  Upon return
1026  * it will be unchanged.
1027  * @param iterOpts RuleCharacterIterator options
1028  */
1029 UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
1030                                            int32_t iterOpts) {
1031     // NOTE: literal will always be FALSE, because we don't parse escapes.
1032     UBool result = FALSE, literal;
1033     UErrorCode ec = U_ZERO_ERROR;
1034     iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
1035     RuleCharacterIterator::Pos pos;
1036     chars.getPos(pos);
1037     UChar32 c = chars.next(iterOpts, literal, ec);
1038     if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) {
1039         UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
1040                                literal, ec);
1041         result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) :
1042                  (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/);
1043     }
1044     chars.setPos(pos);
1045     return result && U_SUCCESS(ec);
1046 }
1047
1048 /**
1049  * Parse the given property pattern at the given parse position.
1050  */
1051 UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
1052                                              ParsePosition& ppos,
1053                                              UErrorCode &ec) {
1054     int32_t pos = ppos.getIndex();
1055
1056     UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
1057     UBool isName = FALSE; // true for \N{pat}, o/w false
1058     UBool invert = FALSE;
1059
1060     if (U_FAILURE(ec)) return *this;
1061
1062     // Minimum length is 5 characters, e.g. \p{L}
1063     if ((pos+5) > pattern.length()) {
1064         FAIL(ec);
1065     }
1066
1067     // On entry, ppos should point to one of the following locations:
1068     // Look for an opening [:, [:^, \p, or \P
1069     if (isPOSIXOpen(pattern, pos)) {
1070         posix = TRUE;
1071         pos += 2;
1072         pos = ICU_Utility::skipWhitespace(pattern, pos);
1073         if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) {
1074             ++pos;
1075             invert = TRUE;
1076         }
1077     } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {
1078         UChar c = pattern.charAt(pos+1);
1079         invert = (c == UPPER_P);
1080         isName = (c == UPPER_N);
1081         pos += 2;
1082         pos = ICU_Utility::skipWhitespace(pattern, pos);
1083         if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) {
1084             // Syntax error; "\p" or "\P" not followed by "{"
1085             FAIL(ec);
1086         }
1087     } else {
1088         // Open delimiter not seen
1089         FAIL(ec);
1090     }
1091
1092     // Look for the matching close delimiter, either :] or }
1093     int32_t close = pattern.indexOf(posix ? POSIX_CLOSE : PERL_CLOSE, pos);
1094     if (close < 0) {
1095         // Syntax error; close delimiter missing
1096         FAIL(ec);
1097     }
1098
1099     // Look for an '=' sign.  If this is present, we will parse a
1100     // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
1101     // pattern.
1102     int32_t equals = pattern.indexOf(EQUALS, pos);
1103     UnicodeString propName, valueName;
1104     if (equals >= 0 && equals < close && !isName) {
1105         // Equals seen; parse medium/long pattern
1106         pattern.extractBetween(pos, equals, propName);
1107         pattern.extractBetween(equals+1, close, valueName);
1108     }
1109
1110     else {
1111         // Handle case where no '=' is seen, and \N{}
1112         pattern.extractBetween(pos, close, propName);
1113
1114         // Handle \N{name}
1115         if (isName) {
1116             // This is a little inefficient since it means we have to
1117             // parse NAME_PROP back to UCHAR_NAME even though we already
1118             // know it's UCHAR_NAME.  If we refactor the API to
1119             // support args of (UProperty, char*) then we can remove
1120             // NAME_PROP and make this a little more efficient.
1121             valueName = propName;
1122             propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);
1123         }
1124     }
1125
1126     applyPropertyAlias(propName, valueName, ec);
1127
1128     if (U_SUCCESS(ec)) {
1129         if (invert) {
1130             complement();
1131         }
1132
1133         // Move to the limit position after the close delimiter if the
1134         // parse succeeded.
1135         ppos.setIndex(close + (posix ? 2 : 1));
1136     }
1137
1138     return *this;
1139 }
1140
1141 /**
1142  * Parse a property pattern.
1143  * @param chars iterator over the pattern characters.  Upon return
1144  * it will be advanced to the first character after the parsed
1145  * pattern, or the end of the iteration if all characters are
1146  * parsed.
1147  * @param rebuiltPat the pattern that was parsed, rebuilt or
1148  * copied from the input pattern, as appropriate.
1149  */
1150 void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
1151                                       UnicodeString& rebuiltPat,
1152                                       UErrorCode& ec) {
1153     if (U_FAILURE(ec)) return;
1154     UnicodeString pattern;
1155     chars.lookahead(pattern);
1156     ParsePosition pos(0);
1157     applyPropertyPattern(pattern, pos, ec);
1158     if (U_FAILURE(ec)) return;
1159     if (pos.getIndex() == 0) {
1160         // syntaxError(chars, "Invalid property pattern");
1161         ec = U_MALFORMED_SET;
1162         return;
1163     }
1164     chars.jumpahead(pos.getIndex());
1165     rebuiltPat.append(pattern, 0, pos.getIndex());
1166 }
1167
1168 //----------------------------------------------------------------
1169 // Inclusions list
1170 //----------------------------------------------------------------
1171
1172 U_CDECL_BEGIN
1173
1174 // USetAdder implementation
1175 // Does not use uset.h to reduce code dependencies
1176 static void U_CALLCONV
1177 _set_add(USet *set, UChar32 c) {
1178     ((UnicodeSet *)set)->add(c);
1179 }
1180
1181 static void U_CALLCONV
1182 _set_addRange(USet *set, UChar32 start, UChar32 end) {
1183     ((UnicodeSet *)set)->add(start, end);
1184 }
1185
1186 static void U_CALLCONV
1187 _set_addString(USet *set, const UChar *str, int32_t length) {
1188     ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length));
1189 }
1190
1191 /**
1192  * Cleanup function for UnicodeSet
1193  */
1194 static UBool U_CALLCONV uset_cleanup(void) {
1195     int32_t i;
1196
1197     for(i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) {
1198         if (INCLUSIONS[i] != NULL) {
1199             delete INCLUSIONS[i];
1200             INCLUSIONS[i] = NULL;
1201         }
1202     }
1203
1204     return TRUE;
1205 }
1206
1207 U_CDECL_END
1208
1209 const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) {
1210     umtx_lock(NULL);
1211     UBool f = (INCLUSIONS[src] == NULL);
1212     umtx_unlock(NULL);
1213     if (f) {
1214         UnicodeSet* incl = new UnicodeSet();
1215         USetAdder sa = {
1216             (USet *)incl,
1217             _set_add,
1218             _set_addRange,
1219             _set_addString,
1220             NULL // don't need remove()
1221         };
1222
1223         if (incl != NULL) {
1224             switch(src) {
1225             case UPROPS_SRC_CHAR:
1226                 uchar_addPropertyStarts(&sa, &status);
1227                 break;
1228             case UPROPS_SRC_PROPSVEC:
1229                 upropsvec_addPropertyStarts(&sa, &status);
1230                 break;
1231             case UPROPS_SRC_CHAR_AND_PROPSVEC:
1232                 uchar_addPropertyStarts(&sa, &status);
1233                 upropsvec_addPropertyStarts(&sa, &status);
1234                 break;
1235             case UPROPS_SRC_HST:
1236                 uhst_addPropertyStarts(&sa, &status);
1237                 break;
1238 #if !UCONFIG_NO_NORMALIZATION
1239             case UPROPS_SRC_NORM:
1240                 unorm_addPropertyStarts(&sa, &status);
1241                 break;
1242 #endif
1243             case UPROPS_SRC_CASE:
1244                 ucase_addPropertyStarts(ucase_getSingleton(&status), &sa, &status);
1245                 break;
1246             case UPROPS_SRC_BIDI:
1247                 ubidi_addPropertyStarts(ubidi_getSingleton(&status), &sa, &status);
1248                 break;
1249             default:
1250                 status = U_INTERNAL_PROGRAM_ERROR;
1251                 break;
1252             }
1253             if (U_SUCCESS(status)) {
1254                 umtx_lock(NULL);
1255                 if (INCLUSIONS[src] == NULL) {
1256                     INCLUSIONS[src] = incl;
1257                     incl = NULL;
1258                     ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
1259                 }
1260                 umtx_unlock(NULL);
1261             }
1262             delete incl;
1263         } else {
1264             status = U_MEMORY_ALLOCATION_ERROR;
1265         }
1266     }
1267     return INCLUSIONS[src];
1268 }
1269
1270 //----------------------------------------------------------------
1271 // Case folding API
1272 //----------------------------------------------------------------
1273
1274 // add the result of a full case mapping to the set
1275 // use str as a temporary string to avoid constructing one
1276 static inline void
1277 addCaseMapping(UnicodeSet &set, int32_t result, const UChar *full, UnicodeString &str) {
1278     if(result >= 0) {
1279         if(result > UCASE_MAX_STRING_LENGTH) {
1280             // add a single-code point case mapping
1281             set.add(result);
1282         } else {
1283             // add a string case mapping from full with length result
1284             str.setTo((UBool)FALSE, full, result);
1285             set.add(str);
1286         }
1287     }
1288     // result < 0: the code point mapped to itself, no need to add it
1289     // see ucase.h
1290 }
1291
1292 UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
1293     if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) {
1294         UErrorCode status = U_ZERO_ERROR;
1295         const UCaseProps *csp = ucase_getSingleton(&status);
1296         if (U_SUCCESS(status)) {
1297             UnicodeSet foldSet(*this);
1298             UnicodeString str;
1299             USetAdder sa = {
1300                 (USet *)&foldSet,
1301                 _set_add,
1302                 _set_addRange,
1303                 _set_addString,
1304                 NULL // don't need remove()
1305             };
1306
1307             // start with input set to guarantee inclusion
1308             // USET_CASE: remove strings because the strings will actually be reduced (folded);
1309             //            therefore, start with no strings and add only those needed
1310             if (attribute & USET_CASE_INSENSITIVE) {
1311                 foldSet.strings->removeAllElements();
1312             }
1313
1314             int32_t n = getRangeCount();
1315             UChar32 result;
1316             const UChar *full;
1317             int32_t locCache = 0;
1318
1319             for (int32_t i=0; i<n; ++i) {
1320                 UChar32 start = getRangeStart(i);
1321                 UChar32 end   = getRangeEnd(i);
1322
1323                 if (attribute & USET_CASE_INSENSITIVE) {
1324                     // full case closure
1325                     for (UChar32 cp=start; cp<=end; ++cp) {
1326                         ucase_addCaseClosure(csp, cp, &sa);
1327                     }
1328                 } else {
1329                     // add case mappings
1330                     // (does not add long s for regular s, or Kelvin for k, for example)
1331                     for (UChar32 cp=start; cp<=end; ++cp) {
1332                         result = ucase_toFullLower(csp, cp, NULL, NULL, &full, "", &locCache);
1333                         addCaseMapping(foldSet, result, full, str);
1334
1335                         result = ucase_toFullTitle(csp, cp, NULL, NULL, &full, "", &locCache);
1336                         addCaseMapping(foldSet, result, full, str);
1337
1338                         result = ucase_toFullUpper(csp, cp, NULL, NULL, &full, "", &locCache);
1339                         addCaseMapping(foldSet, result, full, str);
1340
1341                         result = ucase_toFullFolding(csp, cp, &full, 0);
1342                         addCaseMapping(foldSet, result, full, str);
1343                     }
1344                 }
1345             }
1346             if (strings != NULL && strings->size() > 0) {
1347                 if (attribute & USET_CASE_INSENSITIVE) {
1348                     for (int32_t j=0; j<strings->size(); ++j) {
1349                         str = *(const UnicodeString *) strings->elementAt(j);
1350                         str.foldCase();
1351                         if(!ucase_addStringCaseClosure(csp, str.getBuffer(), str.length(), &sa)) {
1352                             foldSet.add(str); // does not map to code points: add the folded string itself
1353                         }
1354                     }
1355                 } else {
1356                     Locale root("");
1357 #if !UCONFIG_NO_BREAK_ITERATION
1358                     BreakIterator *bi = BreakIterator::createWordInstance(root, status);
1359 #endif
1360                     if (U_SUCCESS(status)) {
1361                         const UnicodeString *pStr;
1362
1363                         for (int32_t j=0; j<strings->size(); ++j) {
1364                             pStr = (const UnicodeString *) strings->elementAt(j);
1365                             (str = *pStr).toLower(root);
1366                             foldSet.add(str);
1367 #if !UCONFIG_NO_BREAK_ITERATION
1368                             (str = *pStr).toTitle(bi, root);
1369                             foldSet.add(str);
1370 #endif
1371                             (str = *pStr).toUpper(root);
1372                             foldSet.add(str);
1373                             (str = *pStr).foldCase();
1374                             foldSet.add(str);
1375                         }
1376                     }
1377 #if !UCONFIG_NO_BREAK_ITERATION
1378                     delete bi;
1379 #endif
1380                 }
1381             }
1382             *this = foldSet;
1383         }
1384     }
1385     return *this;
1386 }
1387
1388 U_NAMESPACE_END