2 ******************************************************************************* 
   4 *   Copyright (C) 1999-2012, International Business Machines 
   5 *   Corporation and others.  All Rights Reserved. 
   7 ******************************************************************************* 
   8 *   file name:  uniset_props.cpp 
  10 *   tab size:   8 (not used) 
  13 *   created on: 2004aug25 
  14 *   created by: Markus W. Scherer 
  16 *   Character property dependent functions moved here from uniset.cpp 
  19 #include "unicode/utypes.h" 
  20 #include "unicode/uniset.h" 
  21 #include "unicode/parsepos.h" 
  22 #include "unicode/uchar.h" 
  23 #include "unicode/uscript.h" 
  24 #include "unicode/symtable.h" 
  25 #include "unicode/uset.h" 
  26 #include "unicode/locid.h" 
  27 #include "unicode/brkiter.h" 
  36 #include "normalizer2impl.h" 
  38 #include "ubidi_props.h" 
  50 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 
  52 // initial storage. Must be >= 0 
  53 // *** same as in uniset.cpp ! *** 
  54 #define START_EXTRA 16 
  56 // Define UChar constants using hex for EBCDIC compatibility 
  57 // Used #define to reduce private static exports and memory access time. 
  58 #define SET_OPEN        ((UChar)0x005B) /*[*/ 
  59 #define SET_CLOSE       ((UChar)0x005D) /*]*/ 
  60 #define HYPHEN          ((UChar)0x002D) /*-*/ 
  61 #define COMPLEMENT      ((UChar)0x005E) /*^*/ 
  62 #define COLON           ((UChar)0x003A) /*:*/ 
  63 #define BACKSLASH       ((UChar)0x005C) /*\*/ 
  64 #define INTERSECTION    ((UChar)0x0026) /*&*/ 
  65 #define UPPER_U         ((UChar)0x0055) /*U*/ 
  66 #define LOWER_U         ((UChar)0x0075) /*u*/ 
  67 #define OPEN_BRACE      ((UChar)123)    /*{*/ 
  68 #define CLOSE_BRACE     ((UChar)125)    /*}*/ 
  69 #define UPPER_P         ((UChar)0x0050) /*P*/ 
  70 #define LOWER_P         ((UChar)0x0070) /*p*/ 
  71 #define UPPER_N         ((UChar)78)     /*N*/ 
  72 #define EQUALS          ((UChar)0x003D) /*=*/ 
  74 //static const UChar POSIX_OPEN[]  = { SET_OPEN,COLON,0 };  // "[:" 
  75 static const UChar POSIX_CLOSE
[] = { COLON
,SET_CLOSE
,0 };  // ":]" 
  76 //static const UChar PERL_OPEN[]   = { BACKSLASH,LOWER_P,0 }; // "\\p" 
  77 //static const UChar PERL_CLOSE[]  = { CLOSE_BRACE,0 };    // "}" 
  78 //static const UChar NAME_OPEN[]   = { BACKSLASH,UPPER_N,0 };  // "\\N" 
  79 static const UChar HYPHEN_RIGHT_BRACE
[] = {HYPHEN
,SET_CLOSE
,0}; /*-]*/ 
  81 // Special property set IDs 
  82 static const char ANY
[]   = "ANY";   // [\u0000-\U0010FFFF] 
  83 static const char ASCII
[] = "ASCII"; // [\u0000-\u007F] 
  84 static const char ASSIGNED
[] = "Assigned"; // [:^Cn:] 
  86 // Unicode name property alias 
  87 #define NAME_PROP "na" 
  88 #define NAME_PROP_LENGTH 2 
/**
 * Delimiter string used in patterns to close a category reference:
 * ":]".  Example: "[:Lu:]".
 */
  94 //static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */ 
  96 // Cached sets ------------------------------------------------------------- *** 
  99 static UBool U_CALLCONV 
uset_cleanup(); 
 102 // Not a TriStateSingletonWrapper because we think the UnicodeSet constructor 
 103 // can only fail with an out-of-memory error 
 104 // if we have a correct pattern and the properties data is hardcoded and always available. 
 // Pairs a SimpleSingleton with a pattern string.  getInstance() hands
 // createInstance and the stored pattern to
 // SimpleSingletonWrapper<UnicodeSet>::getInstance().
 105 class UnicodeSetSingleton 
: public SimpleSingletonWrapper
<UnicodeSet
> { 
 // s is passed on to the wrapper base class; pattern is only stored, not copied.
 107     UnicodeSetSingleton(SimpleSingleton 
&s
, const char *pattern
) : 
 108             SimpleSingletonWrapper
<UnicodeSet
>(s
), fPattern(pattern
) {} 
 // Forwards to the base class with createInstance as the instantiator and
 // fPattern as its context argument.
 109     UnicodeSet 
*getInstance(UErrorCode 
&errorCode
) { 
 110         return SimpleSingletonWrapper
<UnicodeSet
>::getInstance(createInstance
, fPattern
, errorCode
); 
 // Instantiator: context is treated as a char * pattern, converted to a
 // UnicodeString with US_INV, and parsed into a newly allocated UnicodeSet.
 113     static void *createInstance(const void *context
, UErrorCode 
&errorCode
) { 
 114         UnicodeString 
pattern((const char *)context
, -1, US_INV
); 
 115         UnicodeSet 
*set
=new UnicodeSet(pattern
, errorCode
); 
 // NOTE(review): presumably the branch for a failed allocation; the guarding
 // condition is not visible in this listing.
 117             errorCode
=U_MEMORY_ALLOCATION_ERROR
; 
 // Registers uset_cleanup under UCLN_COMMON_USET.
 121         ucln_common_registerCleanup(UCLN_COMMON_USET
, uset_cleanup
); 
 // Pattern string given to the constructor; used as the createInstance context.
 125     const char *fPattern
; 
 130 static UnicodeSet 
*INCLUSIONS
[UPROPS_SRC_COUNT
] = { NULL 
}; // cached getInclusions() 
 132 STATIC_SIMPLE_SINGLETON(uni32Singleton
); 
 134 //---------------------------------------------------------------- 
 136 //---------------------------------------------------------------- 
 138 // USetAdder implementation 
 139 // Does not use uset.h to reduce code dependencies 
 140 static void U_CALLCONV
 
 141 _set_add(USet 
*set
, UChar32 c
) { 
 142     ((UnicodeSet 
*)set
)->add(c
); 
 145 static void U_CALLCONV
 
 146 _set_addRange(USet 
*set
, UChar32 start
, UChar32 end
) { 
 147     ((UnicodeSet 
*)set
)->add(start
, end
); 
 150 static void U_CALLCONV
 
 151 _set_addString(USet 
*set
, const UChar 
*str
, int32_t length
) { 
 152     ((UnicodeSet 
*)set
)->add(UnicodeString((UBool
)(length
<0), str
, length
)); 
 156  * Cleanup function for UnicodeSet 
 158 static UBool U_CALLCONV 
uset_cleanup(void) { 
 161     for(i 
= UPROPS_SRC_NONE
; i 
< UPROPS_SRC_COUNT
; ++i
) { 
 162         if (INCLUSIONS
[i
] != NULL
) { 
 163             delete INCLUSIONS
[i
]; 
 164             INCLUSIONS
[i
] = NULL
; 
 167     UnicodeSetSingleton(uni32Singleton
, NULL
).deleteInstance(); 
 176 Reduce excessive reallocation, and make it easier to detect initialization 
 178 Usually you don't see smaller sets than this for Unicode 5.0. 
 180 #define DEFAULT_INCLUSION_CAPACITY 3072 
 182 const UnicodeSet
* UnicodeSet::getInclusions(int32_t src
, UErrorCode 
&status
) { 
 184     UMTX_CHECK(NULL
, (INCLUSIONS
[src
] == NULL
), needInit
); 
 186         UnicodeSet
* incl 
= new UnicodeSet(); 
 192             NULL
, // don't need remove() 
 193             NULL 
// don't need removeRange() 
 196             incl
->ensureCapacity(DEFAULT_INCLUSION_CAPACITY
, status
); 
 198             case UPROPS_SRC_CHAR
: 
 199                 uchar_addPropertyStarts(&sa
, &status
); 
 201             case UPROPS_SRC_PROPSVEC
: 
 202                 upropsvec_addPropertyStarts(&sa
, &status
); 
 204             case UPROPS_SRC_CHAR_AND_PROPSVEC
: 
 205                 uchar_addPropertyStarts(&sa
, &status
); 
 206                 upropsvec_addPropertyStarts(&sa
, &status
); 
 208 #if !UCONFIG_NO_NORMALIZATION 
 209             case UPROPS_SRC_CASE_AND_NORM
: { 
 210                 const Normalizer2Impl 
*impl
=Normalizer2Factory::getNFCImpl(status
); 
 211                 if(U_SUCCESS(status
)) { 
 212                     impl
->addPropertyStarts(&sa
, status
); 
 214                 ucase_addPropertyStarts(ucase_getSingleton(), &sa
, &status
); 
 217             case UPROPS_SRC_NFC
: { 
 218                 const Normalizer2Impl 
*impl
=Normalizer2Factory::getNFCImpl(status
); 
 219                 if(U_SUCCESS(status
)) { 
 220                     impl
->addPropertyStarts(&sa
, status
); 
 224             case UPROPS_SRC_NFKC
: { 
 225                 const Normalizer2Impl 
*impl
=Normalizer2Factory::getNFKCImpl(status
); 
 226                 if(U_SUCCESS(status
)) { 
 227                     impl
->addPropertyStarts(&sa
, status
); 
 231             case UPROPS_SRC_NFKC_CF
: { 
 232                 const Normalizer2Impl 
*impl
=Normalizer2Factory::getNFKC_CFImpl(status
); 
 233                 if(U_SUCCESS(status
)) { 
 234                     impl
->addPropertyStarts(&sa
, status
); 
 238             case UPROPS_SRC_NFC_CANON_ITER
: { 
 239                 const Normalizer2Impl 
*impl
=Normalizer2Factory::getNFCImpl(status
); 
 240                 if(U_SUCCESS(status
)) { 
 241                     impl
->addCanonIterPropertyStarts(&sa
, status
); 
 246             case UPROPS_SRC_CASE
: 
 247                 ucase_addPropertyStarts(ucase_getSingleton(), &sa
, &status
); 
 249             case UPROPS_SRC_BIDI
: 
 250                 ubidi_addPropertyStarts(ubidi_getSingleton(), &sa
, &status
); 
 253                 status 
= U_INTERNAL_PROGRAM_ERROR
; 
 256             if (U_SUCCESS(status
)) { 
 257                 // Compact for caching 
 260                 if (INCLUSIONS
[src
] == NULL
) { 
 261                     INCLUSIONS
[src
] = incl
; 
 263                     ucln_common_registerCleanup(UCLN_COMMON_USET
, uset_cleanup
); 
 269             status 
= U_MEMORY_ALLOCATION_ERROR
; 
 272     return INCLUSIONS
[src
]; 
 275 // Cache some sets for other services -------------------------------------- *** 
 // Returns the set obtained through uni32Singleton for the pattern
 // "[:age=3.2:]"; errors are reported through errorCode.
 278 uniset_getUnicode32Instance(UErrorCode 
&errorCode
) { 
 279     return UnicodeSetSingleton(uni32Singleton
, "[:age=3.2:]").getInstance(errorCode
); 
 282 // helper functions for matching of pattern syntax pieces ------------------ *** 
 283 // these functions are parallel to the PERL_OPEN etc. strings above 
 285 // using these functions is not only faster than UnicodeString::compare() and 
 286 // caseCompare(), but they also make UnicodeSet work for simple patterns when 
 287 // no Unicode properties data is available - when caseCompare() fails 
 290 isPerlOpen(const UnicodeString 
&pattern
, int32_t pos
) { 
 292     return pattern
.charAt(pos
)==BACKSLASH 
&& ((c
=pattern
.charAt(pos
+1))==LOWER_P 
|| c
==UPPER_P
); 
/*static inline UBool
isPerlClose(const UnicodeString &pattern, int32_t pos) {
    return pattern.charAt(pos)==CLOSE_BRACE;
}*/
 301 isNameOpen(const UnicodeString 
&pattern
, int32_t pos
) { 
 302     return pattern
.charAt(pos
)==BACKSLASH 
&& pattern
.charAt(pos
+1)==UPPER_N
; 
 306 isPOSIXOpen(const UnicodeString 
&pattern
, int32_t pos
) { 
 307     return pattern
.charAt(pos
)==SET_OPEN 
&& pattern
.charAt(pos
+1)==COLON
; 
/*static inline UBool
isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
    return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE;
}*/
 315 // TODO memory debugging provided inside uniset.cpp 
 316 // could be made available here but probably obsolete with use of modern 
 317 // memory leak checker tools 
 320 //---------------------------------------------------------------- 
 322 //---------------------------------------------------------------- 
 325  * Constructs a set from the given pattern, optionally ignoring 
 326  * white space.  See the class description for the syntax of the 
 328  * @param pattern a string specifying what characters are in the set 
 // Constructs a set from pattern.  When status is a success on entry, a list
 // of START_EXTRA UChar32 entries is allocated with uprv_malloc(), then
 // allocateStrings() and applyPattern(pattern, status) are called.
 330 UnicodeSet::UnicodeSet(const UnicodeString
& pattern
, 
 331                        UErrorCode
& status
) : 
 332     len(0), capacity(START_EXTRA
), list(0), bmpSet(0), buffer(0), 
 333     bufferCapacity(0), patLen(0), pat(NULL
), strings(NULL
), stringSpan(NULL
), 
 // Nothing is allocated or parsed when the caller already has an error.
 336     if(U_SUCCESS(status
)){ 
 337         list 
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * capacity
); 
 // NOTE(review): presumably the branch for a failed list allocation; the
 // guarding condition is not visible in this listing.
 340             status 
= U_MEMORY_ALLOCATION_ERROR
;   
 342             allocateStrings(status
); 
 343             applyPattern(pattern
, status
); 
 349 //---------------------------------------------------------------- 
 351 //---------------------------------------------------------------- 
 353 UnicodeSet
& UnicodeSet::applyPattern(const UnicodeString
& pattern
, 
 354                                      UErrorCode
& status
) { 
 356     //   return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status); 
 357     // but without dependency on closeOver(). 
 358     ParsePosition 
pos(0); 
 359     applyPatternIgnoreSpace(pattern
, pos
, NULL
, status
); 
 360     if (U_FAILURE(status
)) return *this; 
 362     int32_t i 
= pos
.getIndex(); 
 363     // Skip over trailing whitespace 
 364     ICU_Utility::skipWhitespace(pattern
, i
, TRUE
); 
 365     if (i 
!= pattern
.length()) { 
 366         status 
= U_ILLEGAL_ARGUMENT_ERROR
; 
 372 UnicodeSet::applyPatternIgnoreSpace(const UnicodeString
& pattern
, 
 374                                     const SymbolTable
* symbols
, 
 375                                     UErrorCode
& status
) { 
 376     if (U_FAILURE(status
)) { 
 380         status 
= U_NO_WRITE_PERMISSION
; 
 383     // Need to build the pattern in a temporary string because 
 384     // _applyPattern calls add() etc., which set pat to empty. 
 385     UnicodeString rebuiltPat
; 
 386     RuleCharacterIterator 
chars(pattern
, symbols
, pos
); 
 387     applyPattern(chars
, symbols
, rebuiltPat
, USET_IGNORE_SPACE
, NULL
, status
); 
 388     if (U_FAILURE(status
)) return; 
 389     if (chars
.inVariable()) { 
 390         // syntaxError(chars, "Extra chars in variable value"); 
 391         status 
= U_MALFORMED_SET
; 
 394     setPattern(rebuiltPat
); 
 398  * Return true if the given position, in the given pattern, appears 
 399  * to be the start of a UnicodeSet pattern. 
 401 UBool 
UnicodeSet::resemblesPattern(const UnicodeString
& pattern
, int32_t pos
) { 
 402     return ((pos
+1) < pattern
.length() && 
 403             pattern
.charAt(pos
) == (UChar
)91/*[*/) || 
 404         resemblesPropertyPattern(pattern
, pos
); 
 407 //---------------------------------------------------------------- 
 408 // Implementation: Pattern parsing 
 409 //---------------------------------------------------------------- 
 412  * A small all-inline class to manage a UnicodeSet pointer.  Add 
 413  * operator->() etc. as needed. 
 415 class UnicodeSetPointer 
{ 
 418     inline UnicodeSetPointer() : p(0) {} 
 419     inline ~UnicodeSetPointer() { delete p
; } 
 420     inline UnicodeSet
* pointer() { return p
; } 
 421     inline UBool 
allocate() { 
 423             p 
= new UnicodeSet(); 
 430  * Parse the pattern from the given RuleCharacterIterator.  The 
 431  * iterator is advanced over the parsed pattern. 
 432  * @param chars iterator over the pattern characters.  Upon return 
 433  * it will be advanced to the first character after the parsed 
 434  * pattern, or the end of the iteration if all characters are 
 436  * @param symbols symbol table to use to parse and dereference 
 437  * variables, or null if none. 
 438  * @param rebuiltPat the pattern that was parsed, rebuilt or 
 439  * copied from the input pattern, as appropriate. 
 440  * @param options a bit mask of zero or more of the following: 
 441  * IGNORE_SPACE, CASE. 
 443 void UnicodeSet::applyPattern(RuleCharacterIterator
& chars
, 
 444                               const SymbolTable
* symbols
, 
 445                               UnicodeString
& rebuiltPat
, 
 447                               UnicodeSet
& (UnicodeSet::*caseClosure
)(int32_t attribute
), 
 449     if (U_FAILURE(ec
)) return; 
 451     // Syntax characters: [ ] ^ - & { } 
 453     // Recognized special forms for chars, sets: c-c s-s s&s 
 455     int32_t opts 
= RuleCharacterIterator::PARSE_VARIABLES 
| 
 456                    RuleCharacterIterator::PARSE_ESCAPES
; 
 457     if ((options 
& USET_IGNORE_SPACE
) != 0) { 
 458         opts 
|= RuleCharacterIterator::SKIP_WHITESPACE
; 
 461     UnicodeString patLocal
, buf
; 
 462     UBool usePat 
= FALSE
; 
 463     UnicodeSetPointer scratch
; 
 464     RuleCharacterIterator::Pos backup
; 
 466     // mode: 0=before [, 1=between [...], 2=after ] 
 467     // lastItem: 0=none, 1=char, 2=set 
 468     int8_t lastItem 
= 0, mode 
= 0; 
 469     UChar32 lastChar 
= 0; 
 472     UBool invert 
= FALSE
; 
 476     while (mode 
!= 2 && !chars
.atEnd()) { 
 477         U_ASSERT((lastItem 
== 0 && op 
== 0) || 
 478                  (lastItem 
== 1 && (op 
== 0 || op 
== HYPHEN 
/*'-'*/)) || 
 479                  (lastItem 
== 2 && (op 
== 0 || op 
== HYPHEN 
/*'-'*/ || 
 480                                     op 
== INTERSECTION 
/*'&'*/))); 
 483         UBool literal 
= FALSE
; 
 484         UnicodeSet
* nested 
= 0; // alias - do not delete 
 486         // -------- Check for property pattern 
 488         // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed 
 490         if (resemblesPropertyPattern(chars
, opts
)) { 
 494         // -------- Parse '[' of opening delimiter OR nested set. 
 495         // If there is a nested set, use `setMode' to define how 
 496         // the set should be parsed.  If the '[' is part of the 
 497         // opening delimiter for this pattern, parse special 
 498         // strings "[", "[^", "[-", and "[^-".  Check for stand-in 
 499         // characters representing a nested set in the symbol 
 503             // Prepare to backup if necessary 
 504             chars
.getPos(backup
); 
 505             c 
= chars
.next(opts
, literal
, ec
); 
 506             if (U_FAILURE(ec
)) return; 
 508             if (c 
== 0x5B /*'['*/ && !literal
) { 
 510                     chars
.setPos(backup
); // backup 
 513                     // Handle opening '[' delimiter 
 515                     patLocal
.append((UChar
) 0x5B /*'['*/); 
 516                     chars
.getPos(backup
); // prepare to backup 
 517                     c 
= chars
.next(opts
, literal
, ec
);  
 518                     if (U_FAILURE(ec
)) return; 
 519                     if (c 
== 0x5E /*'^'*/ && !literal
) { 
 521                         patLocal
.append((UChar
) 0x5E /*'^'*/); 
 522                         chars
.getPos(backup
); // prepare to backup 
 523                         c 
= chars
.next(opts
, literal
, ec
); 
 524                         if (U_FAILURE(ec
)) return; 
 526                     // Fall through to handle special leading '-'; 
 527                     // otherwise restart loop for nested [], \p{}, etc. 
 528                     if (c 
== HYPHEN 
/*'-'*/) { 
 530                         // Fall through to handle literal '-' below 
 532                         chars
.setPos(backup
); // backup 
 536             } else if (symbols 
!= 0) { 
 537                 const UnicodeFunctor 
*m 
= symbols
->lookupMatcher(c
); 
 539                     const UnicodeSet 
*ms 
= dynamic_cast<const UnicodeSet 
*>(m
); 
 541                         ec 
= U_MALFORMED_SET
; 
 544                     // casting away const, but `nested' won't be modified 
 545                     // (important not to modify stored set) 
 546                     nested 
= const_cast<UnicodeSet
*>(ms
); 
 552         // -------- Handle a nested set.  This either is inline in 
 553         // the pattern or represented by a stand-in that has 
 554         // previously been parsed and was looked up in the symbol 
 560                     // syntaxError(chars, "Char expected after operator"); 
 561                     ec 
= U_MALFORMED_SET
; 
 564                 add(lastChar
, lastChar
); 
 565                 _appendToPat(patLocal
, lastChar
, FALSE
); 
 570             if (op 
== HYPHEN 
/*'-'*/ || op 
== INTERSECTION 
/*'&'*/) { 
 576                 if (!scratch
.allocate()) { 
 577                     ec 
= U_MEMORY_ALLOCATION_ERROR
; 
 580                 nested 
= scratch
.pointer(); 
 584                 nested
->applyPattern(chars
, symbols
, patLocal
, options
, caseClosure
, ec
); 
 587                 chars
.skipIgnored(opts
); 
 588                 nested
->applyPropertyPattern(chars
, patLocal
, ec
); 
 589                 if (U_FAILURE(ec
)) return; 
 591             case 3: // `nested' already parsed 
 592                 nested
->_toPattern(patLocal
, FALSE
); 
 599                 // Entire pattern is a category; leave parse loop 
 609             case INTERSECTION
: /*'&'*/ 
 624             // syntaxError(chars, "Missing '['"); 
 625             ec 
= U_MALFORMED_SET
; 
 629         // -------- Parse special (syntax) characters.  If the 
 630         // current character is not special, or if it is escaped, 
 631         // then fall through and handle it below. 
 637                     add(lastChar
, lastChar
); 
 638                     _appendToPat(patLocal
, lastChar
, FALSE
); 
 640                 // Treat final trailing '-' as a literal 
 641                 if (op 
== HYPHEN 
/*'-'*/) { 
 644                 } else if (op 
== INTERSECTION 
/*'&'*/) { 
 645                     // syntaxError(chars, "Trailing '&'"); 
 646                     ec 
= U_MALFORMED_SET
; 
 649                 patLocal
.append((UChar
) 0x5D /*']'*/); 
 658                         // Treat final trailing '-' as a literal 
 660                         c 
= chars
.next(opts
, literal
, ec
); 
 661                         if (U_FAILURE(ec
)) return; 
 662                         if (c 
== 0x5D /*']'*/ && !literal
) { 
 663                             patLocal
.append(HYPHEN_RIGHT_BRACE
, 2); 
 669                 // syntaxError(chars, "'-' not after char or set"); 
 670                 ec 
= U_MALFORMED_SET
; 
 672             case INTERSECTION 
/*'&'*/: 
 673                 if (lastItem 
== 2 && op 
== 0) { 
 677                 // syntaxError(chars, "'&' not after set"); 
 678                 ec 
= U_MALFORMED_SET
; 
 681                 // syntaxError(chars, "'^' not after '['"); 
 682                 ec 
= U_MALFORMED_SET
; 
 686                     // syntaxError(chars, "Missing operand after operator"); 
 687                     ec 
= U_MALFORMED_SET
; 
 691                     add(lastChar
, lastChar
); 
 692                     _appendToPat(patLocal
, lastChar
, FALSE
); 
 698                     while (!chars
.atEnd()) { 
 699                         c 
= chars
.next(opts
, literal
, ec
); 
 700                         if (U_FAILURE(ec
)) return; 
 701                         if (c 
== 0x7D /*'}'*/ && !literal
) { 
 707                     if (buf
.length() < 1 || !ok
) { 
 708                         // syntaxError(chars, "Invalid multicharacter string"); 
 709                         ec 
= U_MALFORMED_SET
; 
 713                 // We have new string. Add it to set and continue; 
 714                 // we don't need to drop through to the further 
 717                 patLocal
.append((UChar
) 0x7B /*'{'*/); 
 718                 _appendToPat(patLocal
, buf
, FALSE
); 
 719                 patLocal
.append((UChar
) 0x7D /*'}'*/); 
 721             case SymbolTable::SYMBOL_REF
: 
 723                 // [a-$]   error    error (ambiguous) 
 724                 // [a$]    anchor   anchor 
 725                 // [a-$x]  var "x"* literal '$' 
 726                 // [a-$.]  error    literal '$' 
 727                 // *We won't get here in the case of var "x" 
 729                     chars
.getPos(backup
); 
 730                     c 
= chars
.next(opts
, literal
, ec
); 
 731                     if (U_FAILURE(ec
)) return; 
 732                     UBool anchor 
= (c 
== 0x5D /*']'*/ && !literal
); 
 733                     if (symbols 
== 0 && !anchor
) { 
 734                         c 
= SymbolTable::SYMBOL_REF
; 
 735                         chars
.setPos(backup
); 
 736                         break; // literal '$' 
 738                     if (anchor 
&& op 
== 0) { 
 740                             add(lastChar
, lastChar
); 
 741                             _appendToPat(patLocal
, lastChar
, FALSE
); 
 745                         patLocal
.append((UChar
) SymbolTable::SYMBOL_REF
); 
 746                         patLocal
.append((UChar
) 0x5D /*']'*/); 
 750                     // syntaxError(chars, "Unquoted '$'"); 
 751                     ec 
= U_MALFORMED_SET
; 
 759         // -------- Parse literal characters.  This includes both 
 760         // escaped chars ("\u4E01") and non-syntax characters 
 769             if (op 
== HYPHEN 
/*'-'*/) { 
 771                     // Don't allow redundant (a-a) or empty (b-a) ranges; 
 772                     // these are most likely typos. 
 773                     // syntaxError(chars, "Invalid range"); 
 774                     ec 
= U_MALFORMED_SET
; 
 778                 _appendToPat(patLocal
, lastChar
, FALSE
); 
 780                 _appendToPat(patLocal
, c
, FALSE
); 
 784                 add(lastChar
, lastChar
); 
 785                 _appendToPat(patLocal
, lastChar
, FALSE
); 
 791                 // syntaxError(chars, "Set expected after operator"); 
 792                 ec 
= U_MALFORMED_SET
; 
 802         // syntaxError(chars, "Missing ']'"); 
 803         ec 
= U_MALFORMED_SET
; 
 807     chars
.skipIgnored(opts
); 
 810      * Handle global flags (invert, case insensitivity).  If this 
 811      * pattern should be compiled case-insensitive, then we need 
 812      * to close over case BEFORE COMPLEMENTING.  This makes 
 813      * patterns like /[^abc]/i work. 
 815     if ((options 
& USET_CASE_INSENSITIVE
) != 0) { 
 816         (this->*caseClosure
)(USET_CASE_INSENSITIVE
); 
 818     else if ((options 
& USET_ADD_CASE_MAPPINGS
) != 0) { 
 819         (this->*caseClosure
)(USET_ADD_CASE_MAPPINGS
); 
 825     // Use the rebuilt pattern (patLocal) only if necessary.  Prefer the 
 826     // generated pattern. 
 828         rebuiltPat
.append(patLocal
); 
 830         _generatePattern(rebuiltPat
, FALSE
); 
 832     if (isBogus() && U_SUCCESS(ec
)) { 
 833         // We likely ran out of memory. AHHH! 
 834         ec 
= U_MEMORY_ALLOCATION_ERROR
; 
 838 //---------------------------------------------------------------- 
 839 // Property set implementation 
 840 //---------------------------------------------------------------- 
 842 static UBool 
numericValueFilter(UChar32 ch
, void* context
) { 
 843     return u_getNumericValue(ch
) == *(double*)context
; 
 846 static UBool 
generalCategoryMaskFilter(UChar32 ch
, void* context
) { 
 847     int32_t value 
= *(int32_t*)context
; 
 848     return (U_GET_GC_MASK((UChar32
) ch
) & value
) != 0; 
 851 static UBool 
versionFilter(UChar32 ch
, void* context
) { 
 852     static const UVersionInfo none 
= { 0, 0, 0, 0 }; 
 855     UVersionInfo
* version 
= (UVersionInfo
*)context
; 
 856     return uprv_memcmp(&v
, &none
, sizeof(v
)) > 0 && uprv_memcmp(&v
, version
, sizeof(v
)) <= 0; 
 862 } IntPropertyContext
; 
 864 static UBool 
intPropertyFilter(UChar32 ch
, void* context
) { 
 865     IntPropertyContext
* c 
= (IntPropertyContext
*)context
; 
 866     return u_getIntPropertyValue((UChar32
) ch
, c
->prop
) == c
->value
; 
 869 static UBool 
scriptExtensionsFilter(UChar32 ch
, void* context
) { 
 870     return uscript_hasScript(ch
, *(UScriptCode
*)context
); 
 874  * Generic filter-based scanning code for UCD property UnicodeSets. 
 // Calls (*filter)(ch, context) for the code points in the ranges of
 // getInclusions(src).  Each run that starts at a code point where the filter
 // returns true is added to this set; it ends just before the next tested
 // code point where the filter returns false, or at U+10FFFF if there is none.
 // status is set to U_MEMORY_ALLOCATION_ERROR if the set ends up bogus.
 876 void UnicodeSet::applyFilter(UnicodeSet::Filter filter
, 
 879                              UErrorCode 
&status
) { 
 880     if (U_FAILURE(status
)) return; 
 882     // Logically, walk through all Unicode characters, noting the start 
 883     // and end of each range for which filter.contain(c) is 
 884     // true.  Add each range to a set. 
 886     // To improve performance, use an inclusions set which 
 887     // encodes information about character ranges that are known 
 888     // to have identical properties. 
 889     // getInclusions(src) contains exactly the first characters of 
 890     // same-value ranges for the given properties "source". 
 891     const UnicodeSet
* inclusions 
= getInclusions(src
, status
); 
 892     if (U_FAILURE(status
)) { 
 // startHasProperty is the first code point of the currently open run, or -1
 // when no run is open.
 898     UChar32 startHasProperty 
= -1; 
 899     int32_t limitRange 
= inclusions
->getRangeCount(); 
 901     for (int j
=0; j
<limitRange
; ++j
) { 
 903         UChar32 start 
= inclusions
->getRangeStart(j
); 
 904         UChar32 end 
= inclusions
->getRangeEnd(j
); 
 906         // for all the code points in the range, process 
 907         for (UChar32 ch 
= start
; ch 
<= end
; ++ch
) { 
 908             // only add to this UnicodeSet on inflection points -- 
 909             // where the hasProperty value changes to false 
 910             if ((*filter
)(ch
, context
)) { 
 911                 if (startHasProperty 
< 0) { 
 912                     startHasProperty 
= ch
; 
 // The filter is false here and a run is open: close it just before ch.
 914             } else if (startHasProperty 
>= 0) { 
 915                 add(startHasProperty
, ch
-1); 
 916                 startHasProperty 
= -1; 
 // A run that is still open after the last tested code point extends to U+10FFFF.
 920     if (startHasProperty 
>= 0) { 
 921         add((UChar32
)startHasProperty
, (UChar32
)0x10FFFF); 
 923     if (isBogus() && U_SUCCESS(status
)) { 
 924         // The set is bogus although no error was reported; most likely an allocation failed. 
 925         status 
= U_MEMORY_ALLOCATION_ERROR
; 
 929 static UBool 
mungeCharName(char* dst
, const char* src
, int32_t dstCapacity
) { 
 930     /* Note: we use ' ' in compiler code page */ 
 933     --dstCapacity
; /* make room for term. zero */ 
 934     while ((ch 
= *src
++) != 0) { 
 935         if (ch 
== ' ' && (j
==0 || (j
>0 && dst
[j
-1]==' '))) { 
 938         if (j 
>= dstCapacity
) return FALSE
; 
 941     if (j 
> 0 && dst
[j
-1] == ' ') --j
; 
 946 //---------------------------------------------------------------- 
 948 //---------------------------------------------------------------- 
 950 #define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;} 
 953 UnicodeSet::applyIntPropertyValue(UProperty prop
, int32_t value
, UErrorCode
& ec
) { 
 954     if (U_FAILURE(ec
) || isFrozen()) return *this; 
 956     if (prop 
== UCHAR_GENERAL_CATEGORY_MASK
) { 
 957         applyFilter(generalCategoryMaskFilter
, &value
, UPROPS_SRC_CHAR
, ec
); 
 958     } else if (prop 
== UCHAR_SCRIPT_EXTENSIONS
) { 
 959         UScriptCode script 
= (UScriptCode
)value
; 
 960         applyFilter(scriptExtensionsFilter
, &script
, UPROPS_SRC_PROPSVEC
, ec
); 
 962         IntPropertyContext c 
= {prop
, value
}; 
 963         applyFilter(intPropertyFilter
, &c
, uprops_getSource(prop
), ec
); 
 969 UnicodeSet::applyPropertyAlias(const UnicodeString
& prop
, 
 970                                const UnicodeString
& value
, 
 972     if (U_FAILURE(ec
) || isFrozen()) return *this; 
 974     // prop and value used to be converted to char * using the default 
 975     // converter instead of the invariant conversion. 
 976     // This should not be necessary because all Unicode property and value 
 977     // names use only invariant characters. 
 978     // If there are any variant characters, then we won't find them anyway. 
 979     // Checking first avoids assertion failures in the conversion. 
 980     if( !uprv_isInvariantUString(prop
.getBuffer(), prop
.length()) || 
 981         !uprv_isInvariantUString(value
.getBuffer(), value
.length()) 
 985     CharString pname
, vname
; 
 986     pname
.appendInvariantChars(prop
, ec
); 
 987     vname
.appendInvariantChars(value
, ec
); 
 988     if (U_FAILURE(ec
)) return *this; 
 992     UBool mustNotBeEmpty 
= FALSE
, invert 
= FALSE
; 
 994     if (value
.length() > 0) { 
 995         p 
= u_getPropertyEnum(pname
.data()); 
 996         if (p 
== UCHAR_INVALID_CODE
) FAIL(ec
); 
 999         if (p 
== UCHAR_GENERAL_CATEGORY
) { 
1000             p 
= UCHAR_GENERAL_CATEGORY_MASK
; 
1003         if ((p 
>= UCHAR_BINARY_START 
&& p 
< UCHAR_BINARY_LIMIT
) || 
1004             (p 
>= UCHAR_INT_START 
&& p 
< UCHAR_INT_LIMIT
) || 
1005             (p 
>= UCHAR_MASK_START 
&& p 
< UCHAR_MASK_LIMIT
)) { 
1006             v 
= u_getPropertyValueEnum(p
, vname
.data()); 
1007             if (v 
== UCHAR_INVALID_CODE
) { 
1008                 // Handle numeric CCC 
1009                 if (p 
== UCHAR_CANONICAL_COMBINING_CLASS 
|| 
1010                     p 
== UCHAR_TRAIL_CANONICAL_COMBINING_CLASS 
|| 
1011                     p 
== UCHAR_LEAD_CANONICAL_COMBINING_CLASS
) { 
1013                     double value 
= uprv_strtod(vname
.data(), &end
); 
1014                     v 
= (int32_t) value
; 
1015                     if (v 
!= value 
|| v 
< 0 || *end 
!= 0) { 
1016                         // non-integral or negative value, or trailing junk 
1019                     // If the resultant set is empty then the numeric value 
1021                     mustNotBeEmpty 
= TRUE
; 
1031             case UCHAR_NUMERIC_VALUE
: 
1034                     double value 
= uprv_strtod(vname
.data(), &end
); 
1038                     applyFilter(numericValueFilter
, &value
, UPROPS_SRC_CHAR
, ec
); 
1043                     // Must munge name, since u_charFromName() does not do 
1044                     // 'loose' matching. 
1045                     char buf
[128]; // it suffices that this be > uprv_getMaxCharNameLength 
1046                     if (!mungeCharName(buf
, vname
.data(), sizeof(buf
))) FAIL(ec
); 
1047                     UChar32 ch 
= u_charFromName(U_EXTENDED_CHAR_NAME
, buf
, &ec
); 
1048                     if (U_SUCCESS(ec
)) { 
1056             case UCHAR_UNICODE_1_NAME
: 
1057                 // ICU 49 deprecates the Unicode_1_Name property APIs. 
1061                     // Must munge name, since u_versionFromString() does not do 
1062                     // 'loose' matching. 
1064                     if (!mungeCharName(buf
, vname
.data(), sizeof(buf
))) FAIL(ec
); 
1065                     UVersionInfo version
; 
1066                     u_versionFromString(version
, buf
); 
1067                     applyFilter(versionFilter
, &version
, UPROPS_SRC_PROPSVEC
, ec
); 
1070             case UCHAR_SCRIPT_EXTENSIONS
: 
1071                 v 
= u_getPropertyValueEnum(UCHAR_SCRIPT
, vname
.data()); 
1072                 if (v 
== UCHAR_INVALID_CODE
) { 
1075                 // fall through to calling applyIntPropertyValue() 
1078                 // p is a non-binary, non-enumerated property that we 
1079                 // don't support (yet). 
1086         // value is empty.  Interpret as General Category, Script, or 
1088         p 
= UCHAR_GENERAL_CATEGORY_MASK
; 
1089         v 
= u_getPropertyValueEnum(p
, pname
.data()); 
1090         if (v 
== UCHAR_INVALID_CODE
) { 
1092             v 
= u_getPropertyValueEnum(p
, pname
.data()); 
1093             if (v 
== UCHAR_INVALID_CODE
) { 
1094                 p 
= u_getPropertyEnum(pname
.data()); 
1095                 if (p 
>= UCHAR_BINARY_START 
&& p 
< UCHAR_BINARY_LIMIT
) { 
1097                 } else if (0 == uprv_comparePropertyNames(ANY
, pname
.data())) { 
1098                     set(MIN_VALUE
, MAX_VALUE
); 
1100                 } else if (0 == uprv_comparePropertyNames(ASCII
, pname
.data())) { 
1103                 } else if (0 == uprv_comparePropertyNames(ASSIGNED
, pname
.data())) { 
1104                     // [:Assigned:]=[:^Cn:] 
1105                     p 
= UCHAR_GENERAL_CATEGORY_MASK
; 
1115     applyIntPropertyValue(p
, v
, ec
); 
1120     if (U_SUCCESS(ec
) && (mustNotBeEmpty 
&& isEmpty())) { 
1121         // mustNotBeEmpty is set to true if an empty set indicates 
1123         ec 
= U_ILLEGAL_ARGUMENT_ERROR
; 
1126     if (isBogus() && U_SUCCESS(ec
)) { 
1127         // We likely ran out of memory. AHHH! 
1128         ec 
= U_MEMORY_ALLOCATION_ERROR
; 
1133 //---------------------------------------------------------------- 
1134 // Property set patterns 
1135 //---------------------------------------------------------------- 
1138  * Return true if the given position, in the given pattern, appears 
1139  * to be the start of a property set pattern. 
1141 UBool 
UnicodeSet::resemblesPropertyPattern(const UnicodeString
& pattern
, 
1143     // Patterns are at least 5 characters long 
1144     if ((pos
+5) > pattern
.length()) { 
1148     // Look for an opening [:, [:^, \p, or \P 
1149     return isPOSIXOpen(pattern
, pos
) || isPerlOpen(pattern
, pos
) || isNameOpen(pattern
, pos
); 
1153  * Return true if the given iterator appears to point at a 
1154  * property pattern.  Regardless of the result, return with the 
1155  * iterator unchanged. 
1156  * @param chars iterator over the pattern characters.  Upon return 
1157  * it will be unchanged. 
1158  * @param iterOpts RuleCharacterIterator options 
1160 UBool 
UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator
& chars
, 
1162     // NOTE: literal will always be FALSE, because we don't parse escapes. 
1163     UBool result 
= FALSE
, literal
; 
1164     UErrorCode ec 
= U_ZERO_ERROR
; 
1165     iterOpts 
&= ~RuleCharacterIterator::PARSE_ESCAPES
; 
1166     RuleCharacterIterator::Pos pos
; 
1168     UChar32 c 
= chars
.next(iterOpts
, literal
, ec
); 
1169     if (c 
== 0x5B /*'['*/ || c 
== 0x5C /*'\\'*/) { 
1170         UChar32 d 
= chars
.next(iterOpts 
& ~RuleCharacterIterator::SKIP_WHITESPACE
, 
1172         result 
= (c 
== 0x5B /*'['*/) ? (d 
== 0x3A /*':'*/) : 
1173                  (d 
== 0x4E /*'N'*/ || d 
== 0x70 /*'p'*/ || d 
== 0x50 /*'P'*/); 
1176     return result 
&& U_SUCCESS(ec
); 
1180  * Parse the given property pattern at the given parse position. 
1182 UnicodeSet
& UnicodeSet::applyPropertyPattern(const UnicodeString
& pattern
, 
1183                                              ParsePosition
& ppos
, 
1185     int32_t pos 
= ppos
.getIndex(); 
1187     UBool posix 
= FALSE
; // true for [:pat:], false for \p{pat} \P{pat} \N{pat} 
1188     UBool isName 
= FALSE
; // true for \N{pat}, o/w false 
1189     UBool invert 
= FALSE
; 
1191     if (U_FAILURE(ec
)) return *this; 
1193     // Minimum length is 5 characters, e.g. \p{L} 
1194     if ((pos
+5) > pattern
.length()) { 
1198     // On entry, ppos should point to one of the following locations: 
1199     // Look for an opening [:, [:^, \p, or \P 
1200     if (isPOSIXOpen(pattern
, pos
)) { 
1203         pos 
= ICU_Utility::skipWhitespace(pattern
, pos
); 
1204         if (pos 
< pattern
.length() && pattern
.charAt(pos
) == COMPLEMENT
) { 
1208     } else if (isPerlOpen(pattern
, pos
) || isNameOpen(pattern
, pos
)) { 
1209         UChar c 
= pattern
.charAt(pos
+1); 
1210         invert 
= (c 
== UPPER_P
); 
1211         isName 
= (c 
== UPPER_N
); 
1213         pos 
= ICU_Utility::skipWhitespace(pattern
, pos
); 
1214         if (pos 
== pattern
.length() || pattern
.charAt(pos
++) != OPEN_BRACE
) { 
1215             // Syntax error; "\p" or "\P" not followed by "{" 
1219         // Open delimiter not seen 
1223     // Look for the matching close delimiter, either :] or } 
1226       close 
= pattern
.indexOf(POSIX_CLOSE
, 2, pos
); 
1228       close 
= pattern
.indexOf(CLOSE_BRACE
, pos
); 
1231         // Syntax error; close delimiter missing 
1235     // Look for an '=' sign.  If this is present, we will parse a 
1236     // medium \p{gc=Cf} or long \p{GeneralCategory=Format} 
1238     int32_t equals 
= pattern
.indexOf(EQUALS
, pos
); 
1239     UnicodeString propName
, valueName
; 
1240     if (equals 
>= 0 && equals 
< close 
&& !isName
) { 
1241         // Equals seen; parse medium/long pattern 
1242         pattern
.extractBetween(pos
, equals
, propName
); 
1243         pattern
.extractBetween(equals
+1, close
, valueName
); 
1247         // Handle case where no '=' is seen, and \N{} 
1248         pattern
.extractBetween(pos
, close
, propName
); 
1252             // This is a little inefficient since it means we have to 
1253             // parse NAME_PROP back to UCHAR_NAME even though we already 
1254             // know it's UCHAR_NAME.  If we refactor the API to 
1255             // support args of (UProperty, char*) then we can remove 
1256             // NAME_PROP and make this a little more efficient. 
1257             valueName 
= propName
; 
1258             propName 
= UnicodeString(NAME_PROP
, NAME_PROP_LENGTH
, US_INV
); 
1262     applyPropertyAlias(propName
, valueName
, ec
); 
1264     if (U_SUCCESS(ec
)) { 
1269         // Move to the limit position after the close delimiter if the 
1271         ppos
.setIndex(close 
+ (posix 
? 2 : 1)); 
1278  * Parse a property pattern. 
1279  * @param chars iterator over the pattern characters.  Upon return 
1280  * it will be advanced to the first character after the parsed 
1281  * pattern, or the end of the iteration if all characters are 
1283  * @param rebuiltPat the pattern that was parsed, rebuilt or 
1284  * copied from the input pattern, as appropriate. 
1286 void UnicodeSet::applyPropertyPattern(RuleCharacterIterator
& chars
, 
1287                                       UnicodeString
& rebuiltPat
, 
1289     if (U_FAILURE(ec
)) return; 
1290     UnicodeString pattern
; 
1291     chars
.lookahead(pattern
); 
1292     ParsePosition 
pos(0); 
1293     applyPropertyPattern(pattern
, pos
, ec
); 
1294     if (U_FAILURE(ec
)) return; 
1295     if (pos
.getIndex() == 0) { 
1296         // syntaxError(chars, "Invalid property pattern"); 
1297         ec 
= U_MALFORMED_SET
; 
1300     chars
.jumpahead(pos
.getIndex()); 
1301     rebuiltPat
.append(pattern
, 0, pos
.getIndex());