1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 2013-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * collationruleparser.cpp
10 * (replaced the former ucol_tok.cpp)
12 * created on: 2013apr10
13 * created by: Markus W. Scherer
16 #include "unicode/utypes.h"
18 #if !UCONFIG_NO_COLLATION
20 #include "unicode/normalizer2.h"
21 #include "unicode/parseerr.h"
22 #include "unicode/uchar.h"
23 #include "unicode/ucol.h"
24 #include "unicode/uloc.h"
25 #include "unicode/unistr.h"
26 #include "unicode/utf16.h"
29 #include "collation.h"
30 #include "collationdata.h"
31 #include "collationruleparser.h"
32 #include "collationsettings.h"
33 #include "collationtailoring.h"
35 #include "patternprops.h"
// UTF-16 code units spelling "[before", terminated by a 0 code unit.
// Compared against the rule text when a reset position is parsed.
43 static const UChar BEFORE
[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 }; // "[before"
// Number of code units in BEFORE, not counting the terminating 0.
44 const int32_t BEFORE_LENGTH
= 7;
// Out-of-line destructor for CollationRuleParser::Sink; the body is empty.
48 CollationRuleParser::Sink::~Sink() {}
// Default implementation: all three parameters are unnamed and unused,
// and the body is empty, so this does nothing.
51 CollationRuleParser::Sink::suppressContractions(const UnicodeSet
&, const char *&, UErrorCode
&) {}
// Default implementation: all three parameters are unnamed and unused,
// and the body is empty, so this does nothing.
54 CollationRuleParser::Sink::optimize(const UnicodeSet
&, const char *&, UErrorCode
&) {}
// Out-of-line destructor for CollationRuleParser::Importer; the body is empty.
56 CollationRuleParser::Importer::~Importer() {}
// Constructs a parser on top of the given base collation data.
//
// base:      stored as baseData.
// errorCode: passed to the Normalizer2 instance getters.
//
// rules, settings, parseError, errorReason, sink and importer all start as NULL.
// NOTE(review): the end of the initializer list and the constructor body are
// not visible in this excerpt.
58 CollationRuleParser::CollationRuleParser(const CollationData
*base
, UErrorCode
&errorCode
)
// NOTE(review): the pointers returned by getNFDInstance()/getNFCInstance() are
// dereferenced unconditionally; presumably callers check errorCode before
// using the parser -- confirm.
59 : nfd(*Normalizer2::getNFDInstance(errorCode
)),
60 nfc(*Normalizer2::getNFCInstance(errorCode
)),
61 rules(NULL
), baseData(base
), settings(NULL
),
62 parseError(NULL
), errorReason(NULL
),
63 sink(NULL
), importer(NULL
),
// Destructor. NOTE(review): the closing line is not visible in this excerpt.
67 CollationRuleParser::~CollationRuleParser() {
// Parses a collation rule string and applies it to outSettings.
//
// ruleString:    the rule text to parse.
// outSettings:   remembered in `settings` for the duration of the parse.
// outParseError: optional (may be NULL); remembered in `parseError` and
//                cleared here before parsing starts.
// errorCode:     in/out ICU error code; nothing is done if it already
//                indicates a failure.
//
// The actual work is delegated to parse(ruleString, errorCode).
71 CollationRuleParser::parse(const UnicodeString
&ruleString
,
72 CollationSettings
&outSettings
,
73 UParseError
*outParseError
,
74 UErrorCode
&errorCode
) {
75 if(U_FAILURE(errorCode
)) { return; }
76 settings
= &outSettings
;
77 parseError
= outParseError
;
78 if(parseError
!= NULL
) {
// Offset -1 and empty context strings mean that no error location is recorded yet.
80 parseError
->offset
= -1;
81 parseError
->preContext
[0] = 0;
82 parseError
->postContext
[0] = 0;
85 parse(ruleString
, errorCode
);
// Main parse loop over the rule text; also re-entered for imported rules
// (parseSetting() calls it with the imported rule string).
//
// Walks the text from ruleIndex to the end and dispatches on the current
// code unit: white space is tested first, then the visible cases call
// parseRuleChain() or parseSetting(), skip a '#' comment, turn on backward
// secondary sorting for '@', accept '!', and otherwise report
// "expected a reset or setting or comment".
// Returns as soon as errorCode indicates a failure.
// NOTE(review): the switch header and some case labels are not visible in
// this excerpt.
89 CollationRuleParser::parse(const UnicodeString
&ruleString
, UErrorCode
&errorCode
) {
90 if(U_FAILURE(errorCode
)) { return; }
94 while(ruleIndex
< rules
->length()) {
95 UChar c
= rules
->charAt(ruleIndex
);
96 if(PatternProps::isWhiteSpace(c
)) {
102 parseRuleChain(errorCode
);
105 parseSetting(errorCode
);
107 case 0x23: // '#' starts a comment, until the end of the line
108 ruleIndex
= skipComment(ruleIndex
+ 1);
110 case 0x40: // '@' is equivalent to [backwards 2]
111 settings
->setFlag(CollationSettings::BACKWARD_SECONDARY
,
112 UCOL_ON
, 0, errorCode
);
115 case 0x21: // '!' used to turn on Thai/Lao character reversal
116 // Accept but ignore. The root collator has contractions
117 // that are equivalent to the character reversal, where appropriate.
121 setParseError("expected a reset or setting or comment", errorCode
);
// Stop at the first error reported by any of the handlers above.
124 if(U_FAILURE(errorCode
)) { return; }
// Parses one rule chain: a reset followed by its relations.
//
// The reset is parsed first via parseResetAndPosition(). Each relation
// operator is then read with parseRelationOperator(), whose result packs the
// strength (result & STRENGTH_MASK), the operator length
// (result >> OFFSET_SHIFT) and the STARRED_FLAG bit.
//
// Visible checks:
//  - a reset must be followed by at least one relation;
//  - when the reset strength is below UCOL_IDENTICAL (a reset-before chain),
//    the first relation must have exactly the reset strength and a later one
//    must not be stronger (numerically smaller) than it.
// NOTE(review): the loop header and several branches are not visible in this
// excerpt.
129 CollationRuleParser::parseRuleChain(UErrorCode
&errorCode
) {
130 int32_t resetStrength
= parseResetAndPosition(errorCode
);
131 UBool isFirstRelation
= TRUE
;
133 int32_t result
= parseRelationOperator(errorCode
);
134 if(U_FAILURE(errorCode
)) { return; }
136 if(ruleIndex
< rules
->length() && rules
->charAt(ruleIndex
) == 0x23) {
137 // '#' starts a comment, until the end of the line
138 ruleIndex
= skipComment(ruleIndex
+ 1);
141 if(isFirstRelation
) {
142 setParseError("reset not followed by a relation", errorCode
);
146 int32_t strength
= result
& STRENGTH_MASK
;
147 if(resetStrength
< UCOL_IDENTICAL
) {
148 // reset-before rule chain
149 if(isFirstRelation
) {
150 if(strength
!= resetStrength
) {
151 setParseError("reset-before strength differs from its first relation", errorCode
);
155 if(strength
< resetStrength
) {
156 setParseError("reset-before strength followed by a stronger relation", errorCode
);
161 int32_t i
= ruleIndex
+ (result
>> OFFSET_SHIFT
); // skip over the relation operator
// Without the starred flag: one prefix|str/extension relation.
// With it: a run of characters, each becoming its own relation.
162 if((result
& STARRED_FLAG
) == 0) {
163 parseRelationStrings(strength
, i
, errorCode
);
165 parseStarredCharacters(strength
, i
, errorCode
);
167 if(U_FAILURE(errorCode
)) { return; }
168 isFirstRelation
= FALSE
;
// Parses a reset, including an optional "[before n]" and the reset position,
// and reports it to the sink via addReset().
//
// Returns the reset strength: UCOL_PRIMARY + (n - 1) for "[before n]" with
// n = 1..3, otherwise UCOL_IDENTICAL. Returns UCOL_DEFAULT if errorCode
// already indicates a failure on entry.
// NOTE(review): some declarations (j, c, str) and the return paths after
// errors are not visible in this excerpt.
173 CollationRuleParser::parseResetAndPosition(UErrorCode
&errorCode
) {
174 if(U_FAILURE(errorCode
)) { return UCOL_DEFAULT
; }
// ruleIndex + 1 steps over the code unit at ruleIndex
// (presumably the '&' reset operator -- confirm against the caller).
175 int32_t i
= skipWhiteSpace(ruleIndex
+ 1);
178 int32_t resetStrength
;
// Match "[before", at least one white space, a digit '1'..'3' and a closing ']'.
179 if(rules
->compare(i
, BEFORE_LENGTH
, BEFORE
, 0, BEFORE_LENGTH
) == 0 &&
180 (j
= i
+ BEFORE_LENGTH
) < rules
->length() &&
181 PatternProps::isWhiteSpace(rules
->charAt(j
)) &&
182 ((j
= skipWhiteSpace(j
+ 1)) + 1) < rules
->length() &&
183 0x31 <= (c
= rules
->charAt(j
)) && c
<= 0x33 &&
184 rules
->charAt(j
+ 1) == 0x5d) {
185 // &[before n] with n=1 or 2 or 3
186 resetStrength
= UCOL_PRIMARY
+ (c
- 0x31);
187 i
= skipWhiteSpace(j
+ 2);
189 resetStrength
= UCOL_IDENTICAL
;
191 if(i
>= rules
->length()) {
192 setParseError("reset without position", errorCode
);
// The position is either a special position in [brackets] or a tailoring string.
196 if(rules
->charAt(i
) == 0x5b) { // '['
197 i
= parseSpecialPosition(i
, str
, errorCode
);
199 i
= parseTailoringString(i
, str
, errorCode
);
201 sink
->addReset(resetStrength
, str
, errorReason
, errorCode
);
// A failure reported by the sink has no context yet; record it here.
202 if(U_FAILURE(errorCode
)) { setErrorContext(); }
204 return resetStrength
;
// Recognizes the relation operator at ruleIndex, after skipping white space
// (ruleIndex itself is advanced past that white space).
//
// Returns ((operator length) << OFFSET_SHIFT) | strength, where strength may
// additionally have STARRED_FLAG set when the operator is followed by '*'.
// Returns UCOL_DEFAULT if errorCode already indicates a failure or the end of
// the rules has been reached.
//
// Visible operators: one to four consecutive 0x3c ('<') for
// UCOL_PRIMARY .. UCOL_QUATERNARY, ';' for UCOL_SECONDARY, ',' for
// UCOL_TERTIARY, and one more case yielding UCOL_IDENTICAL.
// NOTE(review): the switch header, some case labels, the `strength`
// declaration and the ++i steps are not visible in this excerpt.
208 CollationRuleParser::parseRelationOperator(UErrorCode
&errorCode
) {
209 if(U_FAILURE(errorCode
)) { return UCOL_DEFAULT
; }
210 ruleIndex
= skipWhiteSpace(ruleIndex
);
211 if(ruleIndex
>= rules
->length()) { return UCOL_DEFAULT
; }
213 int32_t i
= ruleIndex
;
214 UChar c
= rules
->charAt(i
++);
217 if(i
< rules
->length() && rules
->charAt(i
) == 0x3c) { // <<
219 if(i
< rules
->length() && rules
->charAt(i
) == 0x3c) { // <<<
221 if(i
< rules
->length() && rules
->charAt(i
) == 0x3c) { // <<<<
223 strength
= UCOL_QUATERNARY
;
225 strength
= UCOL_TERTIARY
;
228 strength
= UCOL_SECONDARY
;
231 strength
= UCOL_PRIMARY
;
233 if(i
< rules
->length() && rules
->charAt(i
) == 0x2a) { // '*'
235 strength
|= STARRED_FLAG
;
238 case 0x3b: // ';' same as <<
239 strength
= UCOL_SECONDARY
;
241 case 0x2c: // ',' same as <<<
242 strength
= UCOL_TERTIARY
;
245 strength
= UCOL_IDENTICAL
;
246 if(i
< rules
->length() && rules
->charAt(i
) == 0x2a) { // '*'
248 strength
|= STARRED_FLAG
;
// Pack the operator length above OFFSET_SHIFT and the strength (plus flag) below it.
254 return ((i
- ruleIndex
) << OFFSET_SHIFT
) | strength
;
// Parses the strings of one non-starred relation and reports the relation to
// the sink via addRelation().
//
// strength:  relation strength as decoded by the caller.
// i:         index into the rules just after the relation operator.
// errorCode: in/out ICU error code.
//
// When a prefix is present, both the prefix and the string must start with an
// NFC boundary (nfc.hasBoundaryBefore()); otherwise a parse error is set.
// NOTE(review): some lines (e.g. between the '|' test and the second
// parseTailoringString() call, and the final ruleIndex update) are not
// visible in this excerpt.
258 CollationRuleParser::parseRelationStrings(int32_t strength
, int32_t i
, UErrorCode
&errorCode
) {
260 // prefix | str / extension
261 // where prefix and extension are optional.
262 UnicodeString prefix
, str
, extension
;
263 i
= parseTailoringString(i
, str
, errorCode
);
264 if(U_FAILURE(errorCode
)) { return; }
// `next` is the code unit following the parsed string, or 0 at the end of the rules.
265 UChar next
= (i
< rules
->length()) ? rules
->charAt(i
) : 0;
266 if(next
== 0x7c) { // '|' separates the context prefix from the string.
268 i
= parseTailoringString(i
+ 1, str
, errorCode
);
269 if(U_FAILURE(errorCode
)) { return; }
270 next
= (i
< rules
->length()) ? rules
->charAt(i
) : 0;
272 if(next
== 0x2f) { // '/' separates the string from the extension.
273 i
= parseTailoringString(i
+ 1, extension
, errorCode
);
275 if(!prefix
.isEmpty()) {
276 UChar32 prefix0
= prefix
.char32At(0);
277 UChar32 c
= str
.char32At(0);
278 if(!nfc
.hasBoundaryBefore(prefix0
) || !nfc
.hasBoundaryBefore(c
)) {
279 setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary",
284 sink
->addRelation(strength
, prefix
, str
, extension
, errorReason
, errorCode
);
// A failure reported by the sink has no context yet; record it here.
285 if(U_FAILURE(errorCode
)) { setErrorContext(); }
// Parses the operand of a starred relation: a run of characters and
// '-'-separated ranges, each code point of which becomes its own relation
// with empty prefix and extension.
//
// strength:  relation strength as decoded by the caller.
// i:         index into the rules just after the starred operator.
// errorCode: in/out ICU error code.
//
// Visible validation: the string must not be missing; every code point must
// be NFD-inert; a range needs a start and an end, must not be descending, and
// must not contain surrogates or U+FFFD..U+FFFF.
// On completion ruleIndex is set to the next non-white-space position.
// NOTE(review): the loop headers, the range bookkeeping (prev, s, j) and the
// error-return statements are not visible in this excerpt.
290 CollationRuleParser::parseStarredCharacters(int32_t strength
, int32_t i
, UErrorCode
&errorCode
) {
291 UnicodeString empty
, raw
;
292 i
= parseString(skipWhiteSpace(i
), raw
, errorCode
);
293 if(U_FAILURE(errorCode
)) { return; }
295 setParseError("missing starred-relation string", errorCode
);
// Emit one relation per code point of the string just parsed.
301 while(j
< raw
.length()) {
302 UChar32 c
= raw
.char32At(j
);
303 if(!nfd
.isInert(c
)) {
304 setParseError("starred-relation string is not all NFD-inert", errorCode
);
307 sink
->addRelation(strength
, empty
, UnicodeString(c
), empty
, errorReason
, errorCode
);
308 if(U_FAILURE(errorCode
)) {
// Anything other than '-' at this point means there is no range to handle.
315 if(i
>= rules
->length() || rules
->charAt(i
) != 0x2d) { // '-'
319 setParseError("range without start in starred-relation string", errorCode
);
322 i
= parseString(i
+ 1, raw
, errorCode
);
323 if(U_FAILURE(errorCode
)) { return; }
325 setParseError("range without end in starred-relation string", errorCode
);
328 UChar32 c
= raw
.char32At(0);
330 setParseError("range start greater than end in starred-relation string", errorCode
);
336 if(!nfd
.isInert(prev
)) {
337 setParseError("starred-relation string range is not all NFD-inert", errorCode
);
340 if(U_IS_SURROGATE(prev
)) {
341 setParseError("starred-relation string range contains a surrogate", errorCode
);
344 if(0xfffd <= prev
&& prev
<= 0xffff) {
345 setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode
);
349 sink
->addRelation(strength
, empty
, s
, empty
, errorReason
, errorCode
);
350 if(U_FAILURE(errorCode
)) {
358 ruleIndex
= skipWhiteSpace(i
);
// Parses one non-empty string starting at the first non-white-space position
// at or after i.
//
// i:         index into the rules where to start.
// raw:       receives the parsed string (filled in by parseString()).
// errorCode: in/out ICU error code; set to a parse error
//            ("missing relation string") if parsing succeeded but produced an
//            empty string.
//
// Returns the index of the first non-white-space position after the string.
362 CollationRuleParser::parseTailoringString(int32_t i
, UnicodeString
&raw
, UErrorCode
&errorCode
) {
363 i
= parseString(skipWhiteSpace(i
), raw
, errorCode
);
364 if(U_SUCCESS(errorCode
) && raw
.isEmpty()) {
365 setParseError("missing relation string", errorCode
);
367 return skipWhiteSpace(i
);
// Reads one string from the rules starting at index i and appends its
// characters to raw.
//
// Visible handling per code unit:
//  - apostrophe: a doubled apostrophe encodes a single one; otherwise text is
//    quoted literally up to the next single apostrophe (a missing terminator
//    is a parse error);
//  - backslash: escapes the following code point (a backslash at the very end
//    of the rules is a parse error);
//  - any other syntax character, or unquoted white space, terminates the
//    string;
//  - everything else is appended as is.
// Afterwards the collected string is validated: unpaired surrogates and
// U+FFFD..U+FFFF are parse errors.
//
// Returns i unchanged if errorCode already indicates a failure on entry.
// NOTE(review): the loop-exit and return statements are not visible in this
// excerpt.
371 CollationRuleParser::parseString(int32_t i
, UnicodeString
&raw
, UErrorCode
&errorCode
) {
372 if(U_FAILURE(errorCode
)) { return i
; }
374 while(i
< rules
->length()) {
375 UChar32 c
= rules
->charAt(i
++);
376 if(isSyntaxChar(c
)) {
377 if(c
== 0x27) { // apostrophe
378 if(i
< rules
->length() && rules
->charAt(i
) == 0x27) {
379 // Double apostrophe, encodes a single one.
380 raw
.append((UChar
)0x27);
384 // Quote literal text until the next single apostrophe.
386 if(i
== rules
->length()) {
387 setParseError("quoted literal text missing terminating apostrophe", errorCode
);
390 c
= rules
->charAt(i
++);
392 if(i
< rules
->length() && rules
->charAt(i
) == 0x27) {
393 // Double apostrophe inside quoted literal text,
394 // still encodes a single apostrophe.
400 raw
.append((UChar
)c
);
402 } else if(c
== 0x5c) { // backslash
403 if(i
== rules
->length()) {
404 setParseError("backslash escape at the end of the rule string", errorCode
);
// char32At() reads a whole code point, so an escaped supplementary character stays intact.
407 c
= rules
->char32At(i
);
411 // Any other syntax character terminates a string.
415 } else if(PatternProps::isWhiteSpace(c
)) {
416 // Unquoted white space terminates a string.
420 raw
.append((UChar
)c
);
// Validate the collected string code point by code point.
423 for(int32_t j
= 0; j
< raw
.length();) {
424 UChar32 c
= raw
.char32At(j
);
425 if(U_IS_SURROGATE(c
)) {
426 setParseError("string contains an unpaired surrogate", errorCode
);
429 if(0xfffd <= c
&& c
<= 0xffff) {
430 setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode
);
// Names of the special reset positions accepted inside [brackets].
// parseSpecialPosition() maps the array index `pos` of a matching name to the
// code unit POS_BASE + pos, so the order of the entries matters.
// NOTE(review): the remaining entries and the closing brace are not visible
// in this excerpt.
440 static const char *const positions
[] = {
441 "first tertiary ignorable",
442 "last tertiary ignorable",
443 "first secondary ignorable",
444 "last secondary ignorable",
445 "first primary ignorable",
446 "last primary ignorable",
// Parses a special reset position such as "[first tertiary ignorable]".
//
// i:         index of the opening '[' in the rules.
// str:       receives the two-unit encoding POS_LEAD, POS_BASE + position.
// errorCode: in/out ICU error code; set to a parse error
//            ("not a valid special reset position") if nothing matches.
//
// The words inside the brackets are matched against the `positions` table;
// "top" and "variable top" are additionally accepted and map to LAST_REGULAR
// and LAST_VARIABLE.
// Returns 0 if errorCode already indicates a failure on entry.
// NOTE(review): the `raw` declaration and the return statements of the
// matching branches are not visible in this excerpt.
460 CollationRuleParser::parseSpecialPosition(int32_t i
, UnicodeString
&str
, UErrorCode
&errorCode
) {
461 if(U_FAILURE(errorCode
)) { return 0; }
463 int32_t j
= readWords(i
+ 1, raw
);
464 if(j
> i
&& rules
->charAt(j
) == 0x5d && !raw
.isEmpty()) { // words end with ]
466 for(int32_t pos
= 0; pos
< UPRV_LENGTHOF(positions
); ++pos
) {
467 if(raw
== UnicodeString(positions
[pos
], -1, US_INV
)) {
468 str
.setTo((UChar
)POS_LEAD
).append((UChar
)(POS_BASE
+ pos
));
472 if(raw
== UNICODE_STRING_SIMPLE("top")) {
473 str
.setTo((UChar
)POS_LEAD
).append((UChar
)(POS_BASE
+ LAST_REGULAR
));
476 if(raw
== UNICODE_STRING_SIMPLE("variable top")) {
477 str
.setTo((UChar
)POS_LEAD
).append((UChar
)(POS_BASE
+ LAST_VARIABLE
));
481 setParseError("not a valid special reset position", errorCode
);
// Parses one bracketed setting/option starting at ruleIndex
// (ruleIndex + 1 steps over the code unit at ruleIndex, presumably the
// opening '[' -- confirm against the caller).
//
// The words inside the brackets are collected by readWords(). Visible forms:
//  - words ending with ']': "reorder ...", "backwards 2", and
//    "<name> <value>" pairs for strength, alternate, maxVariable, caseFirst,
//    caseLevel, normalization, numericOrdering, hiraganaQ and import;
//  - words ending with '[': "optimize" and "suppressContractions", each
//    followed by a UnicodeSet pattern.
// Anything else ends in the parse error "not a valid setting/option".
// NOTE(review): the declarations of raw, v, lang and set, the ruleIndex
// updates and the return statements of the individual branches are not
// visible in this excerpt.
486 CollationRuleParser::parseSetting(UErrorCode
&errorCode
) {
487 if(U_FAILURE(errorCode
)) { return; }
489 int32_t i
= ruleIndex
+ 1;
490 int32_t j
= readWords(i
, raw
);
491 if(j
<= i
|| raw
.isEmpty()) {
492 setParseError("expected a setting/option at '['", errorCode
);
494 if(rules
->charAt(j
) == 0x5d) { // words end with ]
// "reorder" must be the whole word: either alone or followed by a space.
496 if(raw
.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&
497 (raw
.length() == 7 || raw
.charAt(7) == 0x20)) {
498 parseReordering(raw
, errorCode
);
502 if(raw
== UNICODE_STRING_SIMPLE("backwards 2")) {
503 settings
->setFlag(CollationSettings::BACKWARD_SECONDARY
,
504 UCOL_ON
, 0, errorCode
);
// Split "<name> <value>" at the last space: v gets the value, raw keeps the name.
509 int32_t valueIndex
= raw
.lastIndexOf((UChar
)0x20);
510 if(valueIndex
>= 0) {
511 v
.setTo(raw
, valueIndex
+ 1);
512 raw
.truncate(valueIndex
);
514 if(raw
== UNICODE_STRING_SIMPLE("strength") && v
.length() == 1) {
515 int32_t value
= UCOL_DEFAULT
;
516 UChar c
= v
.charAt(0);
517 if(0x31 <= c
&& c
<= 0x34) { // 1..4
518 value
= UCOL_PRIMARY
+ (c
- 0x31);
519 } else if(c
== 0x49) { // 'I'
520 value
= UCOL_IDENTICAL
;
// In each branch below, value == UCOL_DEFAULT means the value word was not recognized.
522 if(value
!= UCOL_DEFAULT
) {
523 settings
->setStrength(value
, 0, errorCode
);
527 } else if(raw
== UNICODE_STRING_SIMPLE("alternate")) {
528 UColAttributeValue value
= UCOL_DEFAULT
;
529 if(v
== UNICODE_STRING_SIMPLE("non-ignorable")) {
530 value
= UCOL_NON_IGNORABLE
;
531 } else if(v
== UNICODE_STRING_SIMPLE("shifted")) {
532 value
= UCOL_SHIFTED
;
534 if(value
!= UCOL_DEFAULT
) {
535 settings
->setAlternateHandling(value
, 0, errorCode
);
539 } else if(raw
== UNICODE_STRING_SIMPLE("maxVariable")) {
540 int32_t value
= UCOL_DEFAULT
;
541 if(v
== UNICODE_STRING_SIMPLE("space")) {
542 value
= CollationSettings::MAX_VAR_SPACE
;
543 } else if(v
== UNICODE_STRING_SIMPLE("punct")) {
544 value
= CollationSettings::MAX_VAR_PUNCT
;
545 } else if(v
== UNICODE_STRING_SIMPLE("symbol")) {
546 value
= CollationSettings::MAX_VAR_SYMBOL
;
547 } else if(v
== UNICODE_STRING_SIMPLE("currency")) {
548 value
= CollationSettings::MAX_VAR_CURRENCY
;
550 if(value
!= UCOL_DEFAULT
) {
551 settings
->setMaxVariable(value
, 0, errorCode
);
// variableTop is derived from the base data: the last primary of the
// reorder group UCOL_REORDER_CODE_FIRST + value.
552 settings
->variableTop
= baseData
->getLastPrimaryForGroup(
553 UCOL_REORDER_CODE_FIRST
+ value
);
554 U_ASSERT(settings
->variableTop
!= 0);
558 } else if(raw
== UNICODE_STRING_SIMPLE("caseFirst")) {
559 UColAttributeValue value
= UCOL_DEFAULT
;
560 if(v
== UNICODE_STRING_SIMPLE("off")) {
562 } else if(v
== UNICODE_STRING_SIMPLE("lower")) {
563 value
= UCOL_LOWER_FIRST
;
564 } else if(v
== UNICODE_STRING_SIMPLE("upper")) {
565 value
= UCOL_UPPER_FIRST
;
567 if(value
!= UCOL_DEFAULT
) {
568 settings
->setCaseFirst(value
, 0, errorCode
);
572 } else if(raw
== UNICODE_STRING_SIMPLE("caseLevel")) {
573 UColAttributeValue value
= getOnOffValue(v
);
574 if(value
!= UCOL_DEFAULT
) {
575 settings
->setFlag(CollationSettings::CASE_LEVEL
, value
, 0, errorCode
);
579 } else if(raw
== UNICODE_STRING_SIMPLE("normalization")) {
580 UColAttributeValue value
= getOnOffValue(v
);
581 if(value
!= UCOL_DEFAULT
) {
582 settings
->setFlag(CollationSettings::CHECK_FCD
, value
, 0, errorCode
);
586 } else if(raw
== UNICODE_STRING_SIMPLE("numericOrdering")) {
587 UColAttributeValue value
= getOnOffValue(v
);
588 if(value
!= UCOL_DEFAULT
) {
589 settings
->setFlag(CollationSettings::NUMERIC
, value
, 0, errorCode
);
593 } else if(raw
== UNICODE_STRING_SIMPLE("hiraganaQ")) {
594 UColAttributeValue value
= getOnOffValue(v
);
595 if(value
!= UCOL_DEFAULT
) {
596 if(value
== UCOL_ON
) {
597 setParseError("[hiraganaQ on] is not supported", errorCode
);
602 } else if(raw
== UNICODE_STRING_SIMPLE("import")) {
604 lang
.appendInvariantChars(v
, errorCode
);
605 if(errorCode
== U_MEMORY_ALLOCATION_ERROR
) { return; }
606 // BCP 47 language tag -> ICU locale ID
607 char localeID
[ULOC_FULLNAME_CAPACITY
];
608 int32_t parsedLength
;
609 int32_t length
= uloc_forLanguageTag(lang
.data(), localeID
, ULOC_FULLNAME_CAPACITY
,
610 &parsedLength
, &errorCode
);
// The whole tag must have been consumed and the result must fit the buffer.
// errorCode is reset first because setParseError() does nothing while it
// already indicates a failure.
611 if(U_FAILURE(errorCode
) ||
612 parsedLength
!= lang
.length() || length
>= ULOC_FULLNAME_CAPACITY
) {
613 errorCode
= U_ZERO_ERROR
;
614 setParseError("expected language tag in [import langTag]", errorCode
);
617 // localeID minus all keywords
618 char baseID
[ULOC_FULLNAME_CAPACITY
];
619 length
= uloc_getBaseName(localeID
, baseID
, ULOC_FULLNAME_CAPACITY
, &errorCode
);
// NOTE(review): the limit checked here is ULOC_KEYWORDS_CAPACITY although
// baseID has ULOC_FULLNAME_CAPACITY elements. The memmove below shifts the
// string by 3 bytes, so the check should not be loosened without
// re-checking that the shifted string still fits -- confirm the intent.
620 if(U_FAILURE(errorCode
) || length
>= ULOC_KEYWORDS_CAPACITY
) {
621 errorCode
= U_ZERO_ERROR
;
622 setParseError("expected language tag in [import langTag]", errorCode
);
// An empty base name or "und" becomes "root"; a base name starting with '_'
// gets "und" prepended (the memmove also moves the terminating NUL).
625 if(length
== 0 || (length
== 3 && uprv_memcmp(baseID
, "und", 3) == 0)) {
626 uprv_strcpy(baseID
, "root");
627 } else if(*baseID
== '_') {
628 uprv_memmove(baseID
+ 3, baseID
, length
+ 1);
629 uprv_memcpy(baseID
, "und", 3);
631 // @collation=type, or length=0 if not specified
632 char collationType
[ULOC_KEYWORDS_CAPACITY
];
633 length
= uloc_getKeywordValue(localeID
, "collation",
634 collationType
, ULOC_KEYWORDS_CAPACITY
,
636 if(U_FAILURE(errorCode
) || length
>= ULOC_KEYWORDS_CAPACITY
) {
637 errorCode
= U_ZERO_ERROR
;
638 setParseError("expected language tag in [import langTag]", errorCode
);
641 if(importer
== NULL
) {
642 setParseError("[import langTag] is not supported", errorCode
);
644 UnicodeString importedRules
;
// Without an explicit collation type, the "standard" type is requested.
645 importer
->getRules(baseID
, length
> 0 ? collationType
: "standard",
646 importedRules
, errorReason
, errorCode
);
647 if(U_FAILURE(errorCode
)) {
648 if(errorReason
== NULL
) {
649 errorReason
= "[import langTag] failed";
// Remember the outer parse state, then parse the imported rules with the
// same parser; on failure the error offset points back into the outer rules.
654 const UnicodeString
*outerRules
= rules
;
655 int32_t outerRuleIndex
= ruleIndex
;
656 parse(importedRules
, errorCode
);
657 if(U_FAILURE(errorCode
)) {
658 if(parseError
!= NULL
) {
659 parseError
->offset
= outerRuleIndex
;
667 } else if(rules
->charAt(j
) == 0x5b) { // words end with [
669 j
= parseUnicodeSet(j
, set
, errorCode
);
670 if(U_FAILURE(errorCode
)) { return; }
671 if(raw
== UNICODE_STRING_SIMPLE("optimize")) {
672 sink
->optimize(set
, errorReason
, errorCode
);
673 if(U_FAILURE(errorCode
)) { setErrorContext(); }
676 } else if(raw
== UNICODE_STRING_SIMPLE("suppressContractions")) {
677 sink
->suppressContractions(set
, errorReason
, errorCode
);
678 if(U_FAILURE(errorCode
)) { setErrorContext(); }
683 setParseError("not a valid setting/option", errorCode
);
// Handles a "reorder" setting whose words have already been collected.
//
// raw:       "reorder" optionally followed by space-separated codes.
// errorCode: in/out ICU error code.
//
// With no codes the reordering in `settings` is reset. Otherwise every word
// is converted with getReorderCode(), collected in a UVector32 and passed to
// settings->setReordering() together with the base data.
// NOTE(review): the `word` declaration, the check of `code`, the index
// advance at the end of the loop and the return after resetReordering() are
// not visible in this excerpt.
687 CollationRuleParser::parseReordering(const UnicodeString
&raw
, UErrorCode
&errorCode
) {
688 if(U_FAILURE(errorCode
)) { return; }
689 int32_t i
= 7; // after "reorder"
690 if(i
== raw
.length()) {
691 // empty [reorder] with no codes
692 settings
->resetReordering();
695 // Parse the codes in [reorder aa bb cc].
696 UVector32
reorderCodes(errorCode
);
697 if(U_FAILURE(errorCode
)) { return; }
699 while(i
< raw
.length()) {
700 ++i
; // skip the word-separating space
// The current word ends at the next space or at the end of raw.
701 int32_t limit
= raw
.indexOf((UChar
)0x20, i
);
702 if(limit
< 0) { limit
= raw
.length(); }
703 word
.clear().appendInvariantChars(raw
.tempSubStringBetween(i
, limit
), errorCode
);
704 if(U_FAILURE(errorCode
)) { return; }
705 int32_t code
= getReorderCode(word
.data());
707 setParseError("unknown script or reorder code", errorCode
);
710 reorderCodes
.addElement(code
, errorCode
);
711 if(U_FAILURE(errorCode
)) { return; }
714 settings
->setReordering(*baseData
, reorderCodes
.getBuffer(), reorderCodes
.size(), errorCode
);
// Names of the special (non-script) reorder groups.
// getReorderCode() returns UCOL_REORDER_CODE_FIRST + index for a match,
// so the order of the entries matters.
717 static const char *const gSpecialReorderCodes
[] = {
718 "space", "punct", "symbol", "currency", "digit"
// Maps a reorder-code word to its numeric code.
//
// word: NUL-terminated name as written in the rules.
//
// The special group names in gSpecialReorderCodes are tried first and yield
// UCOL_REORDER_CODE_FIRST + index (compared with uprv_stricmp(), presumably
// case-insensitively -- confirm). Otherwise the word is looked up as a script
// property value, and finally "others" maps to UCOL_REORDER_CODE_OTHERS.
// NOTE(review): how the script lookup result is returned, and the value
// returned for an unknown word, are not visible in this excerpt.
722 CollationRuleParser::getReorderCode(const char *word
) {
723 for(int32_t i
= 0; i
< UPRV_LENGTHOF(gSpecialReorderCodes
); ++i
) {
724 if(uprv_stricmp(word
, gSpecialReorderCodes
[i
]) == 0) {
725 return UCOL_REORDER_CODE_FIRST
+ i
;
728 int32_t script
= u_getPropertyValueEnum(UCHAR_SCRIPT
, word
);
732 if(uprv_stricmp(word
, "others") == 0) {
733 return UCOL_REORDER_CODE_OTHERS
; // same as Zzzz = USCRIPT_UNKNOWN
// Interprets a setting value that is expected to be "on" or "off".
//
// s: the value word.
//
// Callers treat a UCOL_DEFAULT result as "not recognized".
// NOTE(review): the return statements are not visible in this excerpt.
739 CollationRuleParser::getOnOffValue(const UnicodeString
&s
) {
740 if(s
== UNICODE_STRING_SIMPLE("on")) {
742 } else if(s
== UNICODE_STRING_SIMPLE("off")) {
// Parses a UnicodeSet pattern that is embedded in a bracketed option.
//
// i:         index of the pattern's opening '[' in the rules.
// set:       receives the parsed set via applyPattern().
// errorCode: in/out ICU error code.
//
// The pattern text is found by scanning for the matching closing bracket
// (nesting tracked in `level`); unbalanced brackets are a parse error.
// After the pattern, optional white space and then the option-terminating
// ']' are required.
// NOTE(review): the declarations of j and level, the loop header and the
// return statements are not visible in this excerpt.
750 CollationRuleParser::parseUnicodeSet(int32_t i
, UnicodeSet
&set
, UErrorCode
&errorCode
) {
751 // Collect a UnicodeSet pattern between a balanced pair of [brackets].
755 if(j
== rules
->length()) {
756 setParseError("unbalanced UnicodeSet pattern brackets", errorCode
);
759 UChar c
= rules
->charAt(j
++);
760 if(c
== 0x5b) { // '['
762 } else if(c
== 0x5d) { // ']'
763 if(--level
== 0) { break; }
766 set
.applyPattern(rules
->tempSubStringBetween(i
, j
), errorCode
);
// Replace the pattern error with a parse error; errorCode is reset first
// because setParseError() does nothing while it already indicates a failure.
767 if(U_FAILURE(errorCode
)) {
768 errorCode
= U_ZERO_ERROR
;
769 setParseError("not a valid UnicodeSet pattern", errorCode
);
772 j
= skipWhiteSpace(j
);
773 if(j
== rules
->length() || rules
->charAt(j
) != 0x5d) {
774 setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode
);
// Collects the words of a bracketed setting/position into raw.
//
// i:   index into the rules where to start; leading white space is skipped.
// raw: receives the words.
//
// Reading stops at a syntax character other than '-' (0x2d) and '_' (0x5f):
// if nothing has been collected the current index is returned immediately,
// otherwise a trailing space is removed from raw first.
// Returns 0 if the end of the rules is reached before such a character.
// NOTE(review): the loop header, the appends to raw and the final return are
// not visible in this excerpt; presumably runs of white space are collapsed
// into the single space `sp` -- confirm.
781 CollationRuleParser::readWords(int32_t i
, UnicodeString
&raw
) const {
782 static const UChar sp
= 0x20;
784 i
= skipWhiteSpace(i
);
786 if(i
>= rules
->length()) { return 0; }
787 UChar c
= rules
->charAt(i
);
788 if(isSyntaxChar(c
) && c
!= 0x2d && c
!= 0x5f) { // syntax except -_
789 if(raw
.isEmpty()) { return i
; }
790 if(raw
.endsWith(&sp
, 1)) { // remove trailing space
791 raw
.truncate(raw
.length() - 1);
795 if(PatternProps::isWhiteSpace(c
)) {
797 i
= skipWhiteSpace(i
+ 1);
// Skips the rest of a comment line.
//
// i: index into the rules just after the comment-starting character.
//
// Advances over code units until one of the line terminators
// LF, FF, CR, NEL, LS or PS has been consumed, or the rules end.
// NOTE(review): the loop exit and the return statement are not visible in
// this excerpt; callers assign the result to ruleIndex.
806 CollationRuleParser::skipComment(int32_t i
) const {
807 // skip to past the newline
808 while(i
< rules
->length()) {
809 UChar c
= rules
->charAt(i
++);
810 // LF or FF or CR or NEL or LS or PS
811 if(c
== 0xa || c
== 0xc || c
== 0xd || c
== 0x85 || c
== 0x2028 || c
== 0x2029) {
812 // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
813 // NLF (new line function) = CR or LF or CR+LF or NEL.
814 // No need to collect all of CR+LF because a following LF will be ignored anyway.
// Records a parse error.
//
// reason:    static description of the problem; stored in errorReason.
// errorCode: in/out ICU error code; set to U_INVALID_FORMAT_ERROR.
//
// Does nothing if errorCode already indicates a failure, so the first error
// wins. Callers that want to replace an earlier error reset errorCode to
// U_ZERO_ERROR before calling this.
822 CollationRuleParser::setParseError(const char *reason
, UErrorCode
&errorCode
) {
823 if(U_FAILURE(errorCode
)) { return; }
824 // Error code consistent with the old parser (from ca. 2001),
825 // rather than U_PARSE_ERROR;
826 errorCode
= U_INVALID_FORMAT_ERROR
;
827 errorReason
= reason
;
// setErrorContext() repeats this NULL check itself, so the guard here is redundant but harmless.
828 if(parseError
!= NULL
) { setErrorContext(); }
// Fills in the caller-supplied UParseError (if any) for the current ruleIndex:
// the offset, line 0, the rule text before ruleIndex (preContext) and the
// rule text starting at ruleIndex (postContext), each limited to
// U_PARSE_CONTEXT_LEN - 1 code units plus a terminating 0.
//
// The context windows are tested for surrogates at their cut points
// (a trail surrogate at `start`, a lead surrogate at the end of the
// post-context).
// NOTE(review): the adjustments made in those branches, and the handling of
// a negative `start`, are not visible in this excerpt.
832 CollationRuleParser::setErrorContext() {
833 if(parseError
== NULL
) { return; }
835 // Note: This relies on the calling code maintaining the ruleIndex
836 // at a position that is useful for debugging.
837 // For example, at the beginning of a reset or relation etc.
838 parseError
->offset
= ruleIndex
;
839 parseError
->line
= 0; // We are not counting line numbers.
// Pre-context: up to U_PARSE_CONTEXT_LEN - 1 code units ending at ruleIndex.
842 int32_t start
= ruleIndex
- (U_PARSE_CONTEXT_LEN
- 1);
845 } else if(start
> 0 && U16_IS_TRAIL(rules
->charAt(start
))) {
848 int32_t length
= ruleIndex
- start
;
849 rules
->extract(start
, length
, parseError
->preContext
);
850 parseError
->preContext
[length
] = 0;
852 // starting from ruleIndex
853 length
= rules
->length() - ruleIndex
;
854 if(length
>= U_PARSE_CONTEXT_LEN
) {
855 length
= U_PARSE_CONTEXT_LEN
- 1;
856 if(U16_IS_LEAD(rules
->charAt(ruleIndex
+ length
- 1))) {
860 rules
->extract(ruleIndex
, length
, parseError
->postContext
);
861 parseError
->postContext
[length
] = 0;
// Tells whether c is a rule-syntax character.
//
// True for the code points in 0x21..0x7e that fall into 0x21..0x2f,
// 0x3a..0x40, 0x5b..0x60 or 0x7b..0x7e. This excludes 0x30..0x39 (ASCII
// digits), 0x41..0x5a and 0x61..0x7a (ASCII letters).
865 CollationRuleParser::isSyntaxChar(UChar32 c
) {
866 return 0x21 <= c
&& c
<= 0x7e &&
867 (c
<= 0x2f || (0x3a <= c
&& c
<= 0x40) ||
868 (0x5b <= c
&& c
<= 0x60) || (0x7b <= c
));
// Skips Pattern_White_Space in the rules starting at index i.
//
// The loop runs while i is inside the rules and the code unit at i satisfies
// PatternProps::isWhiteSpace().
// NOTE(review): the loop body and the return statement are not visible in
// this excerpt; callers use the result as the next index to read.
872 CollationRuleParser::skipWhiteSpace(int32_t i
) const {
873 while(i
< rules
->length() && PatternProps::isWhiteSpace(rules
->charAt(i
))) {
881 #endif // !UCONFIG_NO_COLLATION