2 ******************************************************************************* 
   3 * Copyright (C) 2013-2015, International Business Machines 
   4 * Corporation and others.  All Rights Reserved. 
   5 ******************************************************************************* 
   6 * collationruleparser.cpp 
   8 * (replaced the former ucol_tok.cpp) 
  10 * created on: 2013apr10 
  11 * created by: Markus W. Scherer 
  14 #include "unicode/utypes.h" 
  16 #if !UCONFIG_NO_COLLATION 
  18 #include "unicode/normalizer2.h" 
  19 #include "unicode/parseerr.h" 
  20 #include "unicode/uchar.h" 
  21 #include "unicode/ucol.h" 
  22 #include "unicode/uloc.h" 
  23 #include "unicode/unistr.h" 
  24 #include "unicode/utf16.h" 
  27 #include "collation.h" 
  28 #include "collationdata.h" 
  29 #include "collationruleparser.h" 
  30 #include "collationsettings.h" 
  31 #include "collationtailoring.h" 
  33 #include "patternprops.h" 
  // UTF-16 code units spelling the literal "[before", followed by a terminating 0.
  // BEFORE_LENGTH is the number of code units without that terminator.
  // parseResetAndPosition() compares the rule text against these to detect "&[before n]".
  41 static const UChar BEFORE
[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 };  // "[before" 
  42 const int32_t BEFORE_LENGTH 
= 7; 
  // Out-of-line destructor of the nested Sink class; the body is empty.
  46 CollationRuleParser::Sink::~Sink() {} 
  // Sink::suppressContractions as defined here is a no-op: all three parameters
  // (the set, the error-reason out-pointer and the error code) are unnamed and unused.
  // parseSetting() calls this through the sink for [suppressContractions [set]].
  49 CollationRuleParser::Sink::suppressContractions(const UnicodeSet 
&, const char *&, UErrorCode 
&) {} 
  // Sink::optimize as defined here is a no-op: all three parameters
  // (the set, the error-reason out-pointer and the error code) are unnamed and unused.
  // parseSetting() calls this through the sink for [optimize [set]].
  52 CollationRuleParser::Sink::optimize(const UnicodeSet 
&, const char *&, UErrorCode 
&) {} 
  // Out-of-line destructor of the nested Importer class; the body is empty.
  54 CollationRuleParser::Importer::~Importer() {} 
  // Constructor.
  //   base      - collation data stored in baseData.
  //   errorCode - passed to Normalizer2::getNFDInstance() and getNFCInstance().
  // The nfd and nfc members are bound to the dereferenced normalizer instances;
  // rules, settings, parseError, errorReason, sink and importer start out as NULL.
  // NOTE(review): the instances returned by getNFDInstance()/getNFCInstance() are
  // dereferenced without a NULL check — presumably callers must test errorCode before
  // using the parser; confirm.
  // NOTE(review): this listing stops inside the initializer list; any remaining
  // initializers and the constructor body are not visible here.
  56 CollationRuleParser::CollationRuleParser(const CollationData 
*base
, UErrorCode 
&errorCode
) 
  57         : nfd(*Normalizer2::getNFDInstance(errorCode
)), 
  58           nfc(*Normalizer2::getNFCInstance(errorCode
)), 
  59           rules(NULL
), baseData(base
), settings(NULL
), 
  60           parseError(NULL
), errorReason(NULL
), 
  61           sink(NULL
), importer(NULL
), 
  // Destructor. NOTE(review): only the opening line is visible in this listing.
  65 CollationRuleParser::~CollationRuleParser() { 
  // Public entry point: parses ruleString and applies settings to outSettings.
  //   ruleString    - the collation rule text.
  //   outSettings   - its address is stored in the settings member.
  //   outParseError - optional (may be NULL); stored in parseError. When non-NULL its
  //                   offset is set to -1 and both context strings are emptied up front.
  //   errorCode     - in/out; the function returns immediately if it already indicates failure.
  // The actual work is delegated to the two-argument parse() overload.
  // NOTE(review): the embedded numbering skips lines here (77, 81-82), so some
  // initialization statements may be missing from this listing.
  69 CollationRuleParser::parse(const UnicodeString 
&ruleString
, 
  70                            CollationSettings 
&outSettings
, 
  71                            UParseError 
*outParseError
, 
  72                            UErrorCode 
&errorCode
) { 
  73     if(U_FAILURE(errorCode
)) { return; } 
  74     settings 
= &outSettings
; 
  75     parseError 
= outParseError
; 
  76     if(parseError 
!= NULL
) { 
  78         parseError
->offset 
= -1; 
  79         parseError
->preContext
[0] = 0; 
  80         parseError
->postContext
[0] = 0; 
  83     parse(ruleString
, errorCode
); 
  // Main parsing loop over a rule string (also used by parseSetting() for imported rules).
  // Runs while ruleIndex is inside *rules and looks at the code unit at ruleIndex:
  //   - Pattern white space is tested with PatternProps::isWhiteSpace().
  //   - parseRuleChain() and parseSetting() are dispatched to (presumably for '&' and '[';
  //     those case labels are not visible in this listing).
  //   - '#' (0x23): the comment is skipped via skipComment().
  //   - '@' (0x40): turns on CollationSettings::BACKWARD_SECONDARY.
  //   - '!' (0x21): accepted and ignored.
  //   - Otherwise: setParseError("expected a reset or setting or comment").
  // The function returns as soon as errorCode indicates a failure.
  // NOTE(review): the switch header, some case labels, breaks and index increments,
  // and the assignments that precede the loop are not visible here.
  87 CollationRuleParser::parse(const UnicodeString 
&ruleString
, UErrorCode 
&errorCode
) { 
  88     if(U_FAILURE(errorCode
)) { return; } 
  92     while(ruleIndex 
< rules
->length()) { 
  93         UChar c 
= rules
->charAt(ruleIndex
); 
  94         if(PatternProps::isWhiteSpace(c
)) { 
 100             parseRuleChain(errorCode
); 
 103             parseSetting(errorCode
); 
 105         case 0x23:  // '#' starts a comment, until the end of the line 
 106             ruleIndex 
= skipComment(ruleIndex 
+ 1); 
 108         case 0x40:  // '@' is equivalent to [backwards 2] 
 109             settings
->setFlag(CollationSettings::BACKWARD_SECONDARY
, 
 110                               UCOL_ON
, 0, errorCode
); 
 113         case 0x21:  // '!' used to turn on Thai/Lao character reversal 
 114             // Accept but ignore. The root collator has contractions 
 115             // that are equivalent to the character reversal, where appropriate. 
 119             setParseError("expected a reset or setting or comment", errorCode
); 
 122         if(U_FAILURE(errorCode
)) { return; } 
 // Parses one rule chain: a reset (via parseResetAndPosition()) followed by relations.
 // For each relation, parseRelationOperator() returns a packed int:
 //   result & STRENGTH_MASK  -> relation strength
 //   result & STARRED_FLAG   -> non-zero for a starred relation
 //   result >> OFFSET_SHIFT  -> length of the operator, added to ruleIndex to skip it
 // When the operator result signals "no relation" and the next character is '#',
 // the comment is skipped; a reset with no relation at all is reported as
 // "reset not followed by a relation".
 // For a reset-before chain (resetStrength < UCOL_IDENTICAL) the first relation must
 // have exactly the reset strength, and a later relation with a numerically smaller
 // (stronger) strength is an error.
 // Non-starred relations go to parseRelationStrings(), starred ones to
 // parseStarredCharacters(). Returns early whenever errorCode indicates failure.
 // NOTE(review): the loop header and several branch/brace lines are missing from this listing.
 127 CollationRuleParser::parseRuleChain(UErrorCode 
&errorCode
) { 
 128     int32_t resetStrength 
= parseResetAndPosition(errorCode
); 
 129     UBool isFirstRelation 
= TRUE
; 
 131         int32_t result 
= parseRelationOperator(errorCode
); 
 132         if(U_FAILURE(errorCode
)) { return; } 
 134             if(ruleIndex 
< rules
->length() && rules
->charAt(ruleIndex
) == 0x23) { 
 135                 // '#' starts a comment, until the end of the line 
 136                 ruleIndex 
= skipComment(ruleIndex 
+ 1); 
 139             if(isFirstRelation
) { 
 140                 setParseError("reset not followed by a relation", errorCode
); 
 144         int32_t strength 
= result 
& STRENGTH_MASK
; 
 145         if(resetStrength 
< UCOL_IDENTICAL
) { 
 146             // reset-before rule chain 
 147             if(isFirstRelation
) { 
 148                 if(strength 
!= resetStrength
) { 
 149                     setParseError("reset-before strength differs from its first relation", errorCode
); 
 153                 if(strength 
< resetStrength
) { 
 154                     setParseError("reset-before strength followed by a stronger relation", errorCode
); 
 159         int32_t i 
= ruleIndex 
+ (result 
>> OFFSET_SHIFT
);  // skip over the relation operator 
 160         if((result 
& STARRED_FLAG
) == 0) { 
 161             parseRelationStrings(strength
, i
, errorCode
); 
 163             parseStarredCharacters(strength
, i
, errorCode
); 
 165         if(U_FAILURE(errorCode
)) { return; } 
 166         isFirstRelation 
= FALSE
; 
 // Parses a reset: the optional "[before n]" prefix and the reset position.
 // Returns the reset strength:
 //   - UCOL_DEFAULT if errorCode already indicates failure on entry;
 //   - UCOL_PRIMARY + (n - 1) for "[before n]" with n = '1'..'3';
 //   - UCOL_IDENTICAL when there is no valid [before n] prefix.
 // Parsing starts after the character at ruleIndex (ruleIndex + 1), skipping white space.
 // The [before n] test requires, in order: the "[before" literal, at least one white
 // space character, a digit 0x31..0x33, and an immediately following ']' (0x5d).
 // The position is then read by parseSpecialPosition() if it starts with '[' (0x5b),
 // otherwise by parseTailoringString(), and handed to sink->addReset().
 // If the sink fails, setErrorContext() records where.
 // NOTE(review): the declarations of j, c and str, the else branches, and what happens
 // with i after the position has been parsed are not visible in this listing.
 171 CollationRuleParser::parseResetAndPosition(UErrorCode 
&errorCode
) { 
 172     if(U_FAILURE(errorCode
)) { return UCOL_DEFAULT
; } 
 173     int32_t i 
= skipWhiteSpace(ruleIndex 
+ 1); 
 176     int32_t resetStrength
; 
 177     if(rules
->compare(i
, BEFORE_LENGTH
, BEFORE
, 0, BEFORE_LENGTH
) == 0 && 
 178             (j 
= i 
+ BEFORE_LENGTH
) < rules
->length() && 
 179             PatternProps::isWhiteSpace(rules
->charAt(j
)) && 
 180             ((j 
= skipWhiteSpace(j 
+ 1)) + 1) < rules
->length() && 
 181             0x31 <= (c 
= rules
->charAt(j
)) && c 
<= 0x33 && 
 182             rules
->charAt(j 
+ 1) == 0x5d) { 
 183         // &[before n] with n=1 or 2 or 3 
 184         resetStrength 
= UCOL_PRIMARY 
+ (c 
- 0x31); 
 185         i 
= skipWhiteSpace(j 
+ 2); 
 187         resetStrength 
= UCOL_IDENTICAL
; 
 189     if(i 
>= rules
->length()) { 
 190         setParseError("reset without position", errorCode
); 
 194     if(rules
->charAt(i
) == 0x5b) {  // '[' 
 195         i 
= parseSpecialPosition(i
, str
, errorCode
); 
 197         i 
= parseTailoringString(i
, str
, errorCode
); 
 199     sink
->addReset(resetStrength
, str
, errorReason
, errorCode
); 
 200     if(U_FAILURE(errorCode
)) { setErrorContext(); } 
 202     return resetStrength
; 
 // Recognizes a relation operator at ruleIndex (after skipping white space, which
 // updates ruleIndex) without consuming it.
 // Returns UCOL_DEFAULT if errorCode already indicates failure or the rule string has ended.
 // Otherwise returns a packed value: ((operator length) << OFFSET_SHIFT) | strength,
 // where strength may additionally carry STARRED_FLAG when the operator is followed by '*'.
 // Visible operators:
 //   one to four consecutive '<' (0x3c) -> UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY,
 //                                         UCOL_QUATERNARY
 //   ';' (0x3b)                         -> UCOL_SECONDARY
 //   ',' (0x2c)                         -> UCOL_TERTIARY
 //   a further case                     -> UCOL_IDENTICAL (its case label is not visible here)
 // NOTE(review): the switch header, some case labels, the ++i steps, the default branch and
 // the declaration of strength are missing from this listing.
 206 CollationRuleParser::parseRelationOperator(UErrorCode 
&errorCode
) { 
 207     if(U_FAILURE(errorCode
)) { return UCOL_DEFAULT
; } 
 208     ruleIndex 
= skipWhiteSpace(ruleIndex
); 
 209     if(ruleIndex 
>= rules
->length()) { return UCOL_DEFAULT
; } 
 211     int32_t i 
= ruleIndex
; 
 212     UChar c 
= rules
->charAt(i
++); 
 215         if(i 
< rules
->length() && rules
->charAt(i
) == 0x3c) {  // << 
 217             if(i 
< rules
->length() && rules
->charAt(i
) == 0x3c) {  // <<< 
 219                 if(i 
< rules
->length() && rules
->charAt(i
) == 0x3c) {  // <<<< 
 221                     strength 
= UCOL_QUATERNARY
; 
 223                     strength 
= UCOL_TERTIARY
; 
 226                 strength 
= UCOL_SECONDARY
; 
 229             strength 
= UCOL_PRIMARY
; 
 231         if(i 
< rules
->length() && rules
->charAt(i
) == 0x2a) {  // '*' 
 233             strength 
|= STARRED_FLAG
; 
 236     case 0x3b:  // ';' same as << 
 237         strength 
= UCOL_SECONDARY
; 
 239     case 0x2c:  // ',' same as <<< 
 240         strength 
= UCOL_TERTIARY
; 
 243         strength 
= UCOL_IDENTICAL
; 
 244         if(i 
< rules
->length() && rules
->charAt(i
) == 0x2a) {  // '*' 
 246             strength 
|= STARRED_FLAG
; 
 252     return ((i 
- ruleIndex
) << OFFSET_SHIFT
) | strength
; 
 // Parses the operands of a non-starred relation and reports them to the sink.
 //   strength  - relation strength, forwarded to sink->addRelation().
 //   i         - index in *rules just past the relation operator.
 //   errorCode - in/out error code; parsing stops on failure.
 // Syntax (see the comment below): prefix | str / extension, with optional prefix and extension.
 // A '|' (0x7c) after the first string triggers parsing of a second string; a '/' (0x2f)
 // triggers parsing of the extension. At the end of the rule string "next" is 0.
 // When a prefix is present, both its first code point and that of str must have an
 // NFC boundary before them, otherwise a parse error is set.
 // If the sink fails, setErrorContext() records where.
 // NOTE(review): statements between the visible lines are missing from this listing
 // (for example how the first string becomes the prefix, and the update of ruleIndex).
 256 CollationRuleParser::parseRelationStrings(int32_t strength
, int32_t i
, UErrorCode 
&errorCode
) { 
 258     //     prefix | str / extension 
 259     // where prefix and extension are optional. 
 260     UnicodeString prefix
, str
, extension
; 
 261     i 
= parseTailoringString(i
, str
, errorCode
); 
 262     if(U_FAILURE(errorCode
)) { return; } 
 263     UChar next 
= (i 
< rules
->length()) ? rules
->charAt(i
) : 0; 
 264     if(next 
== 0x7c) {  // '|' separates the context prefix from the string. 
 266         i 
= parseTailoringString(i 
+ 1, str
, errorCode
); 
 267         if(U_FAILURE(errorCode
)) { return; } 
 268         next 
= (i 
< rules
->length()) ? rules
->charAt(i
) : 0; 
 270     if(next 
== 0x2f) {  // '/' separates the string from the extension. 
 271         i 
= parseTailoringString(i 
+ 1, extension
, errorCode
); 
 273     if(!prefix
.isEmpty()) { 
 274         UChar32 prefix0 
= prefix
.char32At(0); 
 275         UChar32 c 
= str
.char32At(0); 
 276         if(!nfc
.hasBoundaryBefore(prefix0
) || !nfc
.hasBoundaryBefore(c
)) { 
 277             setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary", 
 282     sink
->addRelation(strength
, prefix
, str
, extension
, errorReason
, errorCode
); 
 283     if(U_FAILURE(errorCode
)) { setErrorContext(); } 
 // Parses the operand of a starred relation: a list of characters, optionally with
 // '-' (0x2d) ranges, and adds one relation per code point through sink->addRelation()
 // (with empty prefix and extension).
 //   strength  - relation strength, forwarded to the sink.
 //   i         - index in *rules just past the starred operator.
 //   errorCode - in/out error code.
 // Checks visible here:
 //   - every listed code point must be NFD-inert (nfd.isInert());
 //   - a range needs a start and an end, and the start must not be greater than the end;
 //   - code points produced by a range must be NFD-inert, must not be surrogates and
 //     must not be in U+FFFD..U+FFFF.
 // On exit ruleIndex is set to the index after any trailing white space.
 // NOTE(review): the loop headers, the declarations of j, prev and s, and the
 // break/return lines are missing from this listing.
 288 CollationRuleParser::parseStarredCharacters(int32_t strength
, int32_t i
, UErrorCode 
&errorCode
) { 
 289     UnicodeString empty
, raw
; 
 290     i 
= parseString(skipWhiteSpace(i
), raw
, errorCode
); 
 291     if(U_FAILURE(errorCode
)) { return; } 
 293         setParseError("missing starred-relation string", errorCode
); 
 299         while(j 
< raw
.length()) { 
 300             UChar32 c 
= raw
.char32At(j
); 
 301             if(!nfd
.isInert(c
)) { 
 302                 setParseError("starred-relation string is not all NFD-inert", errorCode
); 
 305             sink
->addRelation(strength
, empty
, UnicodeString(c
), empty
, errorReason
, errorCode
); 
 306             if(U_FAILURE(errorCode
)) { 
 313         if(i 
>= rules
->length() || rules
->charAt(i
) != 0x2d) {  // '-' 
 317             setParseError("range without start in starred-relation string", errorCode
); 
 320         i 
= parseString(i 
+ 1, raw
, errorCode
); 
 321         if(U_FAILURE(errorCode
)) { return; } 
 323             setParseError("range without end in starred-relation string", errorCode
); 
 326         UChar32 c 
= raw
.char32At(0); 
 328             setParseError("range start greater than end in starred-relation string", errorCode
); 
 334             if(!nfd
.isInert(prev
)) { 
 335                 setParseError("starred-relation string range is not all NFD-inert", errorCode
); 
 338             if(U_IS_SURROGATE(prev
)) { 
 339                 setParseError("starred-relation string range contains a surrogate", errorCode
); 
 342             if(0xfffd <= prev 
&& prev 
<= 0xffff) { 
 343                 setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode
); 
 347             sink
->addRelation(strength
, empty
, s
, empty
, errorReason
, errorCode
); 
 348             if(U_FAILURE(errorCode
)) { 
 356     ruleIndex 
= skipWhiteSpace(i
); 
 360 CollationRuleParser::parseTailoringString(int32_t i
, UnicodeString 
&raw
, UErrorCode 
&errorCode
) { 
 361     i 
= parseString(skipWhiteSpace(i
), raw
, errorCode
); 
 362     if(U_SUCCESS(errorCode
) && raw
.isEmpty()) { 
 363         setParseError("missing relation string", errorCode
); 
 365     return skipWhiteSpace(i
); 
 // Reads one string from *rules starting at index i and appends its characters to raw.
 //   i         - start index; returned unchanged if errorCode already indicates failure.
 //   raw       - receives the unquoted/unescaped characters.
 //   errorCode - in/out error code.
 // Handling visible here:
 //   - apostrophe (0x27): a doubled apostrophe encodes a single one; otherwise literal
 //     text is quoted until the next single apostrophe, and a missing terminating
 //     apostrophe is an error. A doubled apostrophe inside quoted text also encodes one.
 //   - backslash (0x5c): escapes the following code point (read with char32At());
 //     a backslash at the very end of the rule string is an error.
 //   - any other syntax character, or unquoted white space, terminates the string.
 //   - other characters are appended as-is.
 // Afterwards raw is scanned by code point: unpaired surrogates and U+FFFD..U+FFFF
 // are rejected with a parse error.
 // NOTE(review): several loop/branch lines and the return statements are missing from
 // this listing, so the exact index returned cannot be confirmed from here.
 369 CollationRuleParser::parseString(int32_t i
, UnicodeString 
&raw
, UErrorCode 
&errorCode
) { 
 370     if(U_FAILURE(errorCode
)) { return i
; } 
 372     while(i 
< rules
->length()) { 
 373         UChar32 c 
= rules
->charAt(i
++); 
 374         if(isSyntaxChar(c
)) { 
 375             if(c 
== 0x27) {  // apostrophe 
 376                 if(i 
< rules
->length() && rules
->charAt(i
) == 0x27) { 
 377                     // Double apostrophe, encodes a single one. 
 378                     raw
.append((UChar
)0x27); 
 382                 // Quote literal text until the next single apostrophe. 
 384                     if(i 
== rules
->length()) { 
 385                         setParseError("quoted literal text missing terminating apostrophe", errorCode
); 
 388                     c 
= rules
->charAt(i
++); 
 390                         if(i 
< rules
->length() && rules
->charAt(i
) == 0x27) { 
 391                             // Double apostrophe inside quoted literal text, 
 392                             // still encodes a single apostrophe. 
 398                     raw
.append((UChar
)c
); 
 400             } else if(c 
== 0x5c) {  // backslash 
 401                 if(i 
== rules
->length()) { 
 402                     setParseError("backslash escape at the end of the rule string", errorCode
); 
 405                 c 
= rules
->char32At(i
); 
 409                 // Any other syntax character terminates a string. 
 413         } else if(PatternProps::isWhiteSpace(c
)) { 
 414             // Unquoted white space terminates a string. 
 418             raw
.append((UChar
)c
); 
 421     for(int32_t j 
= 0; j 
< raw
.length();) { 
 422         UChar32 c 
= raw
.char32At(j
); 
 423         if(U_IS_SURROGATE(c
)) { 
 424             setParseError("string contains an unpaired surrogate", errorCode
); 
 427         if(0xfffd <= c 
&& c 
<= 0xffff) { 
 428             setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode
); 
 // Names of special reset positions, as written inside "[...]" in a rule.
 // parseSpecialPosition() matches the words against this table and encodes a hit as
 // POS_LEAD followed by (POS_BASE + array index), so the order of the entries matters.
 // NOTE(review): this listing shows only the first six entries; the rest of the
 // initializer and the closing "};" are not visible.
 438 static const char *const positions
[] = { 
 439     "first tertiary ignorable", 
 440     "last tertiary ignorable", 
 441     "first secondary ignorable", 
 442     "last secondary ignorable", 
 443     "first primary ignorable", 
 444     "last primary ignorable", 
 // Parses a special reset position such as "[first tertiary ignorable]".
 //   i         - index of the opening '[' in *rules; words are read from i + 1.
 //   str       - on success set to the two code units POS_LEAD, POS_BASE + position index.
 //   errorCode - in/out error code; the function returns 0 if it already indicates failure.
 // The words (read by readWords()) must be non-empty and be followed by ']' (0x5d).
 // They are matched against the positions[] table; in addition "top" maps to
 // LAST_REGULAR and "variable top" maps to LAST_VARIABLE.
 // Anything else yields the parse error "not a valid special reset position".
 // NOTE(review): the declaration of raw and all return statements after the first one
 // are missing from this listing, so the returned index cannot be confirmed from here.
 458 CollationRuleParser::parseSpecialPosition(int32_t i
, UnicodeString 
&str
, UErrorCode 
&errorCode
) { 
 459     if(U_FAILURE(errorCode
)) { return 0; } 
 461     int32_t j 
= readWords(i 
+ 1, raw
); 
 462     if(j 
> i 
&& rules
->charAt(j
) == 0x5d && !raw
.isEmpty()) {  // words end with ] 
 464         for(int32_t pos 
= 0; pos 
< UPRV_LENGTHOF(positions
); ++pos
) { 
 465             if(raw 
== UnicodeString(positions
[pos
], -1, US_INV
)) { 
 466                 str
.setTo((UChar
)POS_LEAD
).append((UChar
)(POS_BASE 
+ pos
)); 
 470         if(raw 
== UNICODE_STRING_SIMPLE("top")) { 
 471             str
.setTo((UChar
)POS_LEAD
).append((UChar
)(POS_BASE 
+ LAST_REGULAR
)); 
 474         if(raw 
== UNICODE_STRING_SIMPLE("variable top")) { 
 475             str
.setTo((UChar
)POS_LEAD
).append((UChar
)(POS_BASE 
+ LAST_VARIABLE
)); 
 479     setParseError("not a valid special reset position", errorCode
); 
 // Parses one bracketed setting/option starting at the '[' at ruleIndex.
 // The words after '[' are collected by readWords(); what follows them decides the form:
 //
 // 1. Words followed by ']' (0x5d):
 //    - "reorder" (alone or followed by a space and codes) -> parseReordering().
 //    - "backwards 2" -> turns on CollationSettings::BACKWARD_SECONDARY.
 //    - Otherwise the text after the last space is split off as the value v and the
 //      remainder is the setting name:
 //        strength 1..4 or I    -> settings->setStrength()
 //        alternate             -> non-ignorable | shifted
 //        maxVariable           -> space | punct | symbol | currency; also updates
 //                                 settings->variableTop from baseData
 //        caseFirst             -> off | lower | upper
 //        caseLevel, normalization, numericOrdering -> on | off (via getOnOffValue())
 //        hiraganaQ             -> "on" is rejected as unsupported
 //        import                -> v is a BCP 47 language tag; it is converted to an ICU
 //                                 locale ID, split into base name and collation type
 //                                 ("standard" when the keyword is absent), the rules are
 //                                 fetched through importer->getRules() and parsed with the
 //                                 two-argument parse(). Without an importer this is an error.
 //
 // 2. Words followed by '[' (0x5b): a UnicodeSet is parsed with parseUnicodeSet() and
 //    passed to sink->optimize() or sink->suppressContractions().
 //
 // Anything that is not recognized ends in the parse error "not a valid setting/option".
 // NOTE(review): many lines (declarations of raw, v, lang and set, the ruleIndex updates,
 // return statements and closing braces) are missing from this listing, so the exact
 // control flow after each successful branch cannot be confirmed from here.
 484 CollationRuleParser::parseSetting(UErrorCode 
&errorCode
) { 
 485     if(U_FAILURE(errorCode
)) { return; } 
 487     int32_t i 
= ruleIndex 
+ 1; 
 488     int32_t j 
= readWords(i
, raw
); 
 489     if(j 
<= i 
|| raw
.isEmpty()) { 
 490         setParseError("expected a setting/option at '['", errorCode
); 
 492     if(rules
->charAt(j
) == 0x5d) {  // words end with ] 
 494         if(raw
.startsWith(UNICODE_STRING_SIMPLE("reorder")) && 
 495                 (raw
.length() == 7 || raw
.charAt(7) == 0x20)) { 
 496             parseReordering(raw
, errorCode
); 
 500         if(raw 
== UNICODE_STRING_SIMPLE("backwards 2")) { 
 501             settings
->setFlag(CollationSettings::BACKWARD_SECONDARY
, 
 502                               UCOL_ON
, 0, errorCode
); 
 507         int32_t valueIndex 
= raw
.lastIndexOf((UChar
)0x20); 
 508         if(valueIndex 
>= 0) { 
 509             v
.setTo(raw
, valueIndex 
+ 1); 
 510             raw
.truncate(valueIndex
); 
 512         if(raw 
== UNICODE_STRING_SIMPLE("strength") && v
.length() == 1) { 
 513             int32_t value 
= UCOL_DEFAULT
; 
 514             UChar c 
= v
.charAt(0); 
 515             if(0x31 <= c 
&& c 
<= 0x34) {  // 1..4 
 516                 value 
= UCOL_PRIMARY 
+ (c 
- 0x31); 
 517             } else if(c 
== 0x49) {  // 'I' 
 518                 value 
= UCOL_IDENTICAL
; 
 520             if(value 
!= UCOL_DEFAULT
) { 
 521                 settings
->setStrength(value
, 0, errorCode
); 
 525         } else if(raw 
== UNICODE_STRING_SIMPLE("alternate")) { 
 526             UColAttributeValue value 
= UCOL_DEFAULT
; 
 527             if(v 
== UNICODE_STRING_SIMPLE("non-ignorable")) { 
 528                 value 
= UCOL_NON_IGNORABLE
; 
 529             } else if(v 
== UNICODE_STRING_SIMPLE("shifted")) { 
 530                 value 
= UCOL_SHIFTED
; 
 532             if(value 
!= UCOL_DEFAULT
) { 
 533                 settings
->setAlternateHandling(value
, 0, errorCode
); 
 537         } else if(raw 
== UNICODE_STRING_SIMPLE("maxVariable")) { 
 538             int32_t value 
= UCOL_DEFAULT
; 
 539             if(v 
== UNICODE_STRING_SIMPLE("space")) { 
 540                 value 
= CollationSettings::MAX_VAR_SPACE
; 
 541             } else if(v 
== UNICODE_STRING_SIMPLE("punct")) { 
 542                 value 
= CollationSettings::MAX_VAR_PUNCT
; 
 543             } else if(v 
== UNICODE_STRING_SIMPLE("symbol")) { 
 544                 value 
= CollationSettings::MAX_VAR_SYMBOL
; 
 545             } else if(v 
== UNICODE_STRING_SIMPLE("currency")) { 
 546                 value 
= CollationSettings::MAX_VAR_CURRENCY
; 
 548             if(value 
!= UCOL_DEFAULT
) { 
 549                 settings
->setMaxVariable(value
, 0, errorCode
); 
 550                 settings
->variableTop 
= baseData
->getLastPrimaryForGroup( 
 551                     UCOL_REORDER_CODE_FIRST 
+ value
); 
 552                 U_ASSERT(settings
->variableTop 
!= 0); 
 556         } else if(raw 
== UNICODE_STRING_SIMPLE("caseFirst")) { 
 557             UColAttributeValue value 
= UCOL_DEFAULT
; 
 558             if(v 
== UNICODE_STRING_SIMPLE("off")) { 
 560             } else if(v 
== UNICODE_STRING_SIMPLE("lower")) { 
 561                 value 
= UCOL_LOWER_FIRST
; 
 562             } else if(v 
== UNICODE_STRING_SIMPLE("upper")) { 
 563                 value 
= UCOL_UPPER_FIRST
; 
 565             if(value 
!= UCOL_DEFAULT
) { 
 566                 settings
->setCaseFirst(value
, 0, errorCode
); 
 570         } else if(raw 
== UNICODE_STRING_SIMPLE("caseLevel")) { 
 571             UColAttributeValue value 
= getOnOffValue(v
); 
 572             if(value 
!= UCOL_DEFAULT
) { 
 573                 settings
->setFlag(CollationSettings::CASE_LEVEL
, value
, 0, errorCode
); 
 577         } else if(raw 
== UNICODE_STRING_SIMPLE("normalization")) { 
 578             UColAttributeValue value 
= getOnOffValue(v
); 
 579             if(value 
!= UCOL_DEFAULT
) { 
 580                 settings
->setFlag(CollationSettings::CHECK_FCD
, value
, 0, errorCode
); 
 584         } else if(raw 
== UNICODE_STRING_SIMPLE("numericOrdering")) { 
 585             UColAttributeValue value 
= getOnOffValue(v
); 
 586             if(value 
!= UCOL_DEFAULT
) { 
 587                 settings
->setFlag(CollationSettings::NUMERIC
, value
, 0, errorCode
); 
 591         } else if(raw 
== UNICODE_STRING_SIMPLE("hiraganaQ")) { 
 592             UColAttributeValue value 
= getOnOffValue(v
); 
 593             if(value 
!= UCOL_DEFAULT
) { 
 594                 if(value 
== UCOL_ON
) { 
 595                     setParseError("[hiraganaQ on] is not supported", errorCode
); 
 600         } else if(raw 
== UNICODE_STRING_SIMPLE("import")) { 
 602             lang
.appendInvariantChars(v
, errorCode
); 
 603             if(errorCode 
== U_MEMORY_ALLOCATION_ERROR
) { return; } 
 604             // BCP 47 language tag -> ICU locale ID 
 605             char localeID
[ULOC_FULLNAME_CAPACITY
]; 
 606             int32_t parsedLength
; 
 607             int32_t length 
= uloc_forLanguageTag(lang
.data(), localeID
, ULOC_FULLNAME_CAPACITY
, 
 608                                                  &parsedLength
, &errorCode
); 
 609             if(U_FAILURE(errorCode
) || 
 610                     parsedLength 
!= lang
.length() || length 
>= ULOC_FULLNAME_CAPACITY
) { 
 611                 errorCode 
= U_ZERO_ERROR
; 
 612                 setParseError("expected language tag in [import langTag]", errorCode
); 
 615             // localeID minus all keywords 
 616             char baseID
[ULOC_FULLNAME_CAPACITY
]; 
 617             length 
= uloc_getBaseName(localeID
, baseID
, ULOC_FULLNAME_CAPACITY
, &errorCode
); 
 // NOTE(review): baseID is sized ULOC_FULLNAME_CAPACITY and that capacity is passed to
 // uloc_getBaseName(), yet the length is checked against ULOC_KEYWORDS_CAPACITY below.
 // This looks inconsistent with the buffer size — confirm which limit is intended.
 618             if(U_FAILURE(errorCode
) || length 
>= ULOC_KEYWORDS_CAPACITY
) { 
 619                 errorCode 
= U_ZERO_ERROR
; 
 620                 setParseError("expected language tag in [import langTag]", errorCode
); 
 623             if(length 
== 3 && uprv_memcmp(baseID
, "und", 3) == 0) { 
 624                 uprv_strcpy(baseID
, "root"); 
 626             // @collation=type, or length=0 if not specified 
 627             char collationType
[ULOC_KEYWORDS_CAPACITY
]; 
 628             length 
= uloc_getKeywordValue(localeID
, "collation", 
 629                                           collationType
, ULOC_KEYWORDS_CAPACITY
, 
 631             if(U_FAILURE(errorCode
) || length 
>= ULOC_KEYWORDS_CAPACITY
) { 
 632                 errorCode 
= U_ZERO_ERROR
; 
 633                 setParseError("expected language tag in [import langTag]", errorCode
); 
 636             if(importer 
== NULL
) { 
 637                 setParseError("[import langTag] is not supported", errorCode
); 
 639                 UnicodeString importedRules
; 
 640                 importer
->getRules(baseID
, length 
> 0 ? collationType 
: "standard", 
 641                                    importedRules
, errorReason
, errorCode
); 
 642                 if(U_FAILURE(errorCode
)) { 
 643                     if(errorReason 
== NULL
) { 
 644                         errorReason 
= "[import langTag] failed"; 
 // The outer rule string and index are saved before the nested parse() of the imported
 // rules; on failure the reported offset is the position in the outer rules.
 649                 const UnicodeString 
*outerRules 
= rules
; 
 650                 int32_t outerRuleIndex 
= ruleIndex
; 
 651                 parse(importedRules
, errorCode
); 
 652                 if(U_FAILURE(errorCode
)) { 
 653                     if(parseError 
!= NULL
) { 
 654                         parseError
->offset 
= outerRuleIndex
; 
 662     } else if(rules
->charAt(j
) == 0x5b) {  // words end with [ 
 664         j 
= parseUnicodeSet(j
, set
, errorCode
); 
 665         if(U_FAILURE(errorCode
)) { return; } 
 666         if(raw 
== UNICODE_STRING_SIMPLE("optimize")) { 
 667             sink
->optimize(set
, errorReason
, errorCode
); 
 668             if(U_FAILURE(errorCode
)) { setErrorContext(); } 
 671         } else if(raw 
== UNICODE_STRING_SIMPLE("suppressContractions")) { 
 672             sink
->suppressContractions(set
, errorReason
, errorCode
); 
 673             if(U_FAILURE(errorCode
)) { setErrorContext(); } 
 678     setParseError("not a valid setting/option", errorCode
); 
 // Handles a [reorder ...] setting.
 //   raw       - the words of the setting, starting with "reorder" and with the codes
 //               separated by single spaces (0x20).
 //   errorCode - in/out error code; the function returns immediately on prior failure.
 // An empty [reorder] (nothing after the 7-character keyword) resets the reordering.
 // Otherwise each word is converted to invariant chars and mapped by getReorderCode();
 // the resulting codes are collected in a UVector32 and applied with
 // settings->setReordering(). An unrecognized word yields the parse error
 // "unknown script or reorder code".
 // NOTE(review): the declaration of word, the test on code, and some return/brace lines
 // are missing from this listing.
 682 CollationRuleParser::parseReordering(const UnicodeString 
&raw
, UErrorCode 
&errorCode
) { 
 683     if(U_FAILURE(errorCode
)) { return; } 
 684     int32_t i 
= 7;  // after "reorder" 
 685     if(i 
== raw
.length()) { 
 686         // empty [reorder] with no codes 
 687         settings
->resetReordering(); 
 690     // Parse the codes in [reorder aa bb cc]. 
 691     UVector32 
reorderCodes(errorCode
); 
 692     if(U_FAILURE(errorCode
)) { return; } 
 694     while(i 
< raw
.length()) { 
 695         ++i
;  // skip the word-separating space 
 696         int32_t limit 
= raw
.indexOf((UChar
)0x20, i
); 
 697         if(limit 
< 0) { limit 
= raw
.length(); } 
 698         word
.clear().appendInvariantChars(raw
.tempSubStringBetween(i
, limit
), errorCode
); 
 699         if(U_FAILURE(errorCode
)) { return; } 
 700         int32_t code 
= getReorderCode(word
.data()); 
 702             setParseError("unknown script or reorder code", errorCode
); 
 705         reorderCodes
.addElement(code
, errorCode
); 
 706         if(U_FAILURE(errorCode
)) { return; } 
 709     settings
->setReordering(*baseData
, reorderCodes
.getBuffer(), reorderCodes
.size(), errorCode
); 
 // Names of the special (non-script) reorder groups. getReorderCode() returns
 // UCOL_REORDER_CODE_FIRST + array index for a case-insensitive match, so the order
 // of the entries matters.
 712 static const char *const gSpecialReorderCodes
[] = { 
 713     "space", "punct", "symbol", "currency", "digit" 
 // Maps a reorder-code word to its numeric code.
 //   word - NUL-terminated name in invariant characters.
 // Lookup order visible here:
 //   1. gSpecialReorderCodes (case-insensitive) -> UCOL_REORDER_CODE_FIRST + index;
 //   2. a script property value name via u_getPropertyValueEnum(UCHAR_SCRIPT, word);
 //   3. "others" (case-insensitive) -> UCOL_REORDER_CODE_OTHERS.
 // NOTE(review): the return for a script match and the fallback return for an unknown
 // word are missing from this listing.
 717 CollationRuleParser::getReorderCode(const char *word
) { 
 718     for(int32_t i 
= 0; i 
< UPRV_LENGTHOF(gSpecialReorderCodes
); ++i
) { 
 719         if(uprv_stricmp(word
, gSpecialReorderCodes
[i
]) == 0) { 
 720             return UCOL_REORDER_CODE_FIRST 
+ i
; 
 723     int32_t script 
= u_getPropertyValueEnum(UCHAR_SCRIPT
, word
); 
 727     if(uprv_stricmp(word
, "others") == 0) { 
 728         return UCOL_REORDER_CODE_OTHERS
;  // same as Zzzz = USCRIPT_UNKNOWN 
 // Distinguishes the setting values "on" and "off" (exact, case-sensitive comparison).
 // parseSetting() treats a result of UCOL_DEFAULT as "not a valid value".
 // NOTE(review): the return statements are missing from this listing; presumably
 // "on" -> UCOL_ON, "off" -> UCOL_OFF, anything else -> UCOL_DEFAULT — confirm.
 734 CollationRuleParser::getOnOffValue(const UnicodeString 
&s
) { 
 735     if(s 
== UNICODE_STRING_SIMPLE("on")) { 
 737     } else if(s 
== UNICODE_STRING_SIMPLE("off")) { 
 // Parses a UnicodeSet pattern that is embedded in a setting, e.g. [optimize [a-z]].
 //   i         - index in *rules where the set pattern starts.
 //   set       - receives the parsed set (via applyPattern()).
 //   errorCode - in/out error code.
 // The pattern extent is found by counting '[' (0x5b) and ']' (0x5d) until the nesting
 // level drops back to 0; reaching the end of the rules first is an error.
 // A pattern rejected by applyPattern() is reported as a parse error (the original
 // error code is reset first so that setParseError() can set its own).
 // After the pattern and optional white space, the setting's own closing ']' is required.
 // NOTE(review): the declarations of j and level, the loop header and the return
 // statements are missing from this listing.
 745 CollationRuleParser::parseUnicodeSet(int32_t i
, UnicodeSet 
&set
, UErrorCode 
&errorCode
) { 
 746     // Collect a UnicodeSet pattern between a balanced pair of [brackets]. 
 750         if(j 
== rules
->length()) { 
 751             setParseError("unbalanced UnicodeSet pattern brackets", errorCode
); 
 754         UChar c 
= rules
->charAt(j
++); 
 755         if(c 
== 0x5b) {  // '[' 
 757         } else if(c 
== 0x5d) {  // ']' 
 758             if(--level 
== 0) { break; } 
 761     set
.applyPattern(rules
->tempSubStringBetween(i
, j
), errorCode
); 
 762     if(U_FAILURE(errorCode
)) { 
 763         errorCode 
= U_ZERO_ERROR
; 
 764         setParseError("not a valid UnicodeSet pattern", errorCode
); 
 767     j 
= skipWhiteSpace(j
); 
 768     if(j 
== rules
->length() || rules
->charAt(j
) != 0x5d) { 
 769         setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode
); 
 // Reads a sequence of words from *rules starting at index i into raw.
 //   i   - start index; leading white space is skipped.
 //   raw - receives the words.
 // Returns 0 if the end of the rule string is reached before a terminator.
 // A syntax character other than '-' (0x2d) and '_' (0x5f) terminates the words:
 // if raw is still empty the current index is returned, otherwise one trailing
 // space (0x20) is removed from raw.
 // A run of white space is skipped as a whole.
 // NOTE(review): the loop header, the appends to raw and the remaining return
 // statements are missing from this listing — presumably each white-space run becomes
 // a single space in raw and the terminator's index is returned; confirm.
 776 CollationRuleParser::readWords(int32_t i
, UnicodeString 
&raw
) const { 
 777     static const UChar sp 
= 0x20; 
 779     i 
= skipWhiteSpace(i
); 
 781         if(i 
>= rules
->length()) { return 0; } 
 782         UChar c 
= rules
->charAt(i
); 
 783         if(isSyntaxChar(c
) && c 
!= 0x2d && c 
!= 0x5f) {  // syntax except -_ 
 784             if(raw
.isEmpty()) { return i
; } 
 785             if(raw
.endsWith(&sp
, 1)) {  // remove trailing space 
 786                 raw
.truncate(raw
.length() - 1); 
 790         if(PatternProps::isWhiteSpace(c
)) { 
 792             i 
= skipWhiteSpace(i 
+ 1); 
 // Scans forward from index i in *rules, consuming code units up to and including the
 // first line terminator: LF (0xa), FF (0xc), CR (0xd), NEL (0x85), LS (0x2028) or
 // PS (0x2029). Callers pass the index just after the '#' that starts the comment.
 // NOTE(review): the break/return lines are missing from this listing; presumably the
 // index after the terminator (or the end of the rules) is returned — confirm.
 801 CollationRuleParser::skipComment(int32_t i
) const { 
 802     // skip to past the newline 
 803     while(i 
< rules
->length()) { 
 804         UChar c 
= rules
->charAt(i
++); 
 805         // LF or FF or CR or NEL or LS or PS 
 806         if(c 
== 0xa || c 
== 0xc || c 
== 0xd || c 
== 0x85 || c 
== 0x2028 || c 
== 0x2029) { 
 807             // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS." 
 808             // NLF (new line function) = CR or LF or CR+LF or NEL. 
 809             // No need to collect all of CR+LF because a following LF will be ignored anyway. 
 817 CollationRuleParser::setParseError(const char *reason
, UErrorCode 
&errorCode
) { 
 818     if(U_FAILURE(errorCode
)) { return; } 
 819     // Error code consistent with the old parser (from ca. 2001), 
 820     // rather than U_PARSE_ERROR; 
 821     errorCode 
= U_INVALID_FORMAT_ERROR
; 
 822     errorReason 
= reason
; 
 823     if(parseError 
!= NULL
) { setErrorContext(); } 
 // Fills *parseError with the position and surrounding text of the current ruleIndex.
 // Does nothing when parseError is NULL.
 //   offset      - set to ruleIndex.
 //   line        - always 0.
 //   preContext  - up to U_PARSE_CONTEXT_LEN - 1 code units before ruleIndex, NUL-terminated.
 //   postContext - up to U_PARSE_CONTEXT_LEN - 1 code units from ruleIndex on, NUL-terminated.
 // Both ends are checked for surrogates (a trail surrogate at the start of preContext,
 // a lead surrogate at the end of postContext) — presumably so that a surrogate pair is
 // not cut in half; the adjusting statements themselves are not visible in this listing.
 827 CollationRuleParser::setErrorContext() { 
 828     if(parseError 
== NULL
) { return; } 
 830     // Note: This relies on the calling code maintaining the ruleIndex 
 831     // at a position that is useful for debugging. 
 832     // For example, at the beginning of a reset or relation etc. 
 833     parseError
->offset 
= ruleIndex
; 
 834     parseError
->line 
= 0;  // We are not counting line numbers. 
 837     int32_t start 
= ruleIndex 
- (U_PARSE_CONTEXT_LEN 
- 1); 
 840     } else if(start 
> 0 && U16_IS_TRAIL(rules
->charAt(start
))) { 
 843     int32_t length 
= ruleIndex 
- start
; 
 844     rules
->extract(start
, length
, parseError
->preContext
); 
 845     parseError
->preContext
[length
] = 0; 
 847     // starting from ruleIndex 
 848     length 
= rules
->length() - ruleIndex
; 
 849     if(length 
>= U_PARSE_CONTEXT_LEN
) { 
 850         length 
= U_PARSE_CONTEXT_LEN 
- 1; 
 851         if(U16_IS_LEAD(rules
->charAt(ruleIndex 
+ length 
- 1))) { 
 855     rules
->extract(ruleIndex
, length
, parseError
->postContext
); 
 856     parseError
->postContext
[length
] = 0; 
 860 CollationRuleParser::isSyntaxChar(UChar32 c
) { 
 861     return 0x21 <= c 
&& c 
<= 0x7e && 
 862             (c 
<= 0x2f || (0x3a <= c 
&& c 
<= 0x40) || 
 863             (0x5b <= c 
&& c 
<= 0x60) || (0x7b <= c
)); 
 // Loops while i is inside *rules and the code unit at i is Pattern white space
 // (PatternProps::isWhiteSpace()).
 // NOTE(review): the loop body and the return statement are missing from this listing;
 // presumably i is advanced and the first non-white-space index is returned — confirm.
 867 CollationRuleParser::skipWhiteSpace(int32_t i
) const { 
 868     while(i 
< rules
->length() && PatternProps::isWhiteSpace(rules
->charAt(i
))) { 
 876 #endif  // !UCONFIG_NO_COLLATION