2 *******************************************************************************
3 * Copyright (C) 2013-2015, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * collationruleparser.cpp
8 * (replaced the former ucol_tok.cpp)
10 * created on: 2013apr10
11 * created by: Markus W. Scherer
14 #include "unicode/utypes.h"
16 #if !UCONFIG_NO_COLLATION
18 #include "unicode/normalizer2.h"
19 #include "unicode/parseerr.h"
20 #include "unicode/uchar.h"
21 #include "unicode/ucol.h"
22 #include "unicode/uloc.h"
23 #include "unicode/unistr.h"
24 #include "unicode/utf16.h"
27 #include "collation.h"
28 #include "collationdata.h"
29 #include "collationruleparser.h"
30 #include "collationsettings.h"
31 #include "collationtailoring.h"
33 #include "patternprops.h"
41 static const UChar BEFORE
[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 }; // "[before"
42 const int32_t BEFORE_LENGTH
= 7;
46 CollationRuleParser::Sink::~Sink() {}
49 CollationRuleParser::Sink::suppressContractions(const UnicodeSet
&, const char *&, UErrorCode
&) {}
52 CollationRuleParser::Sink::optimize(const UnicodeSet
&, const char *&, UErrorCode
&) {}
54 CollationRuleParser::Importer::~Importer() {}
56 CollationRuleParser::CollationRuleParser(const CollationData
*base
, UErrorCode
&errorCode
)
57 : nfd(*Normalizer2::getNFDInstance(errorCode
)),
58 nfc(*Normalizer2::getNFCInstance(errorCode
)),
59 rules(NULL
), baseData(base
), settings(NULL
),
60 parseError(NULL
), errorReason(NULL
),
61 sink(NULL
), importer(NULL
),
65 CollationRuleParser::~CollationRuleParser() {
69 CollationRuleParser::parse(const UnicodeString
&ruleString
,
70 CollationSettings
&outSettings
,
71 UParseError
*outParseError
,
72 UErrorCode
&errorCode
) {
73 if(U_FAILURE(errorCode
)) { return; }
74 settings
= &outSettings
;
75 parseError
= outParseError
;
76 if(parseError
!= NULL
) {
78 parseError
->offset
= -1;
79 parseError
->preContext
[0] = 0;
80 parseError
->postContext
[0] = 0;
83 parse(ruleString
, errorCode
);
87 CollationRuleParser::parse(const UnicodeString
&ruleString
, UErrorCode
&errorCode
) {
88 if(U_FAILURE(errorCode
)) { return; }
92 while(ruleIndex
< rules
->length()) {
93 UChar c
= rules
->charAt(ruleIndex
);
94 if(PatternProps::isWhiteSpace(c
)) {
100 parseRuleChain(errorCode
);
103 parseSetting(errorCode
);
105 case 0x23: // '#' starts a comment, until the end of the line
106 ruleIndex
= skipComment(ruleIndex
+ 1);
108 case 0x40: // '@' is equivalent to [backwards 2]
109 settings
->setFlag(CollationSettings::BACKWARD_SECONDARY
,
110 UCOL_ON
, 0, errorCode
);
113 case 0x21: // '!' used to turn on Thai/Lao character reversal
114 // Accept but ignore. The root collator has contractions
115 // that are equivalent to the character reversal, where appropriate.
119 setParseError("expected a reset or setting or comment", errorCode
);
122 if(U_FAILURE(errorCode
)) { return; }
127 CollationRuleParser::parseRuleChain(UErrorCode
&errorCode
) {
128 int32_t resetStrength
= parseResetAndPosition(errorCode
);
129 UBool isFirstRelation
= TRUE
;
131 int32_t result
= parseRelationOperator(errorCode
);
132 if(U_FAILURE(errorCode
)) { return; }
134 if(ruleIndex
< rules
->length() && rules
->charAt(ruleIndex
) == 0x23) {
135 // '#' starts a comment, until the end of the line
136 ruleIndex
= skipComment(ruleIndex
+ 1);
139 if(isFirstRelation
) {
140 setParseError("reset not followed by a relation", errorCode
);
144 int32_t strength
= result
& STRENGTH_MASK
;
145 if(resetStrength
< UCOL_IDENTICAL
) {
146 // reset-before rule chain
147 if(isFirstRelation
) {
148 if(strength
!= resetStrength
) {
149 setParseError("reset-before strength differs from its first relation", errorCode
);
153 if(strength
< resetStrength
) {
154 setParseError("reset-before strength followed by a stronger relation", errorCode
);
159 int32_t i
= ruleIndex
+ (result
>> OFFSET_SHIFT
); // skip over the relation operator
160 if((result
& STARRED_FLAG
) == 0) {
161 parseRelationStrings(strength
, i
, errorCode
);
163 parseStarredCharacters(strength
, i
, errorCode
);
165 if(U_FAILURE(errorCode
)) { return; }
166 isFirstRelation
= FALSE
;
171 CollationRuleParser::parseResetAndPosition(UErrorCode
&errorCode
) {
172 if(U_FAILURE(errorCode
)) { return UCOL_DEFAULT
; }
173 int32_t i
= skipWhiteSpace(ruleIndex
+ 1);
176 int32_t resetStrength
;
177 if(rules
->compare(i
, BEFORE_LENGTH
, BEFORE
, 0, BEFORE_LENGTH
) == 0 &&
178 (j
= i
+ BEFORE_LENGTH
) < rules
->length() &&
179 PatternProps::isWhiteSpace(rules
->charAt(j
)) &&
180 ((j
= skipWhiteSpace(j
+ 1)) + 1) < rules
->length() &&
181 0x31 <= (c
= rules
->charAt(j
)) && c
<= 0x33 &&
182 rules
->charAt(j
+ 1) == 0x5d) {
183 // &[before n] with n=1 or 2 or 3
184 resetStrength
= UCOL_PRIMARY
+ (c
- 0x31);
185 i
= skipWhiteSpace(j
+ 2);
187 resetStrength
= UCOL_IDENTICAL
;
189 if(i
>= rules
->length()) {
190 setParseError("reset without position", errorCode
);
194 if(rules
->charAt(i
) == 0x5b) { // '['
195 i
= parseSpecialPosition(i
, str
, errorCode
);
197 i
= parseTailoringString(i
, str
, errorCode
);
199 sink
->addReset(resetStrength
, str
, errorReason
, errorCode
);
200 if(U_FAILURE(errorCode
)) { setErrorContext(); }
202 return resetStrength
;
206 CollationRuleParser::parseRelationOperator(UErrorCode
&errorCode
) {
207 if(U_FAILURE(errorCode
)) { return UCOL_DEFAULT
; }
208 ruleIndex
= skipWhiteSpace(ruleIndex
);
209 if(ruleIndex
>= rules
->length()) { return UCOL_DEFAULT
; }
211 int32_t i
= ruleIndex
;
212 UChar c
= rules
->charAt(i
++);
215 if(i
< rules
->length() && rules
->charAt(i
) == 0x3c) { // <<
217 if(i
< rules
->length() && rules
->charAt(i
) == 0x3c) { // <<<
219 if(i
< rules
->length() && rules
->charAt(i
) == 0x3c) { // <<<<
221 strength
= UCOL_QUATERNARY
;
223 strength
= UCOL_TERTIARY
;
226 strength
= UCOL_SECONDARY
;
229 strength
= UCOL_PRIMARY
;
231 if(i
< rules
->length() && rules
->charAt(i
) == 0x2a) { // '*'
233 strength
|= STARRED_FLAG
;
236 case 0x3b: // ';' same as <<
237 strength
= UCOL_SECONDARY
;
239 case 0x2c: // ',' same as <<<
240 strength
= UCOL_TERTIARY
;
243 strength
= UCOL_IDENTICAL
;
244 if(i
< rules
->length() && rules
->charAt(i
) == 0x2a) { // '*'
246 strength
|= STARRED_FLAG
;
252 return ((i
- ruleIndex
) << OFFSET_SHIFT
) | strength
;
256 CollationRuleParser::parseRelationStrings(int32_t strength
, int32_t i
, UErrorCode
&errorCode
) {
258 // prefix | str / extension
259 // where prefix and extension are optional.
260 UnicodeString prefix
, str
, extension
;
261 i
= parseTailoringString(i
, str
, errorCode
);
262 if(U_FAILURE(errorCode
)) { return; }
263 UChar next
= (i
< rules
->length()) ? rules
->charAt(i
) : 0;
264 if(next
== 0x7c) { // '|' separates the context prefix from the string.
266 i
= parseTailoringString(i
+ 1, str
, errorCode
);
267 if(U_FAILURE(errorCode
)) { return; }
268 next
= (i
< rules
->length()) ? rules
->charAt(i
) : 0;
270 if(next
== 0x2f) { // '/' separates the string from the extension.
271 i
= parseTailoringString(i
+ 1, extension
, errorCode
);
273 if(!prefix
.isEmpty()) {
274 UChar32 prefix0
= prefix
.char32At(0);
275 UChar32 c
= str
.char32At(0);
276 if(!nfc
.hasBoundaryBefore(prefix0
) || !nfc
.hasBoundaryBefore(c
)) {
277 setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary",
282 sink
->addRelation(strength
, prefix
, str
, extension
, errorReason
, errorCode
);
283 if(U_FAILURE(errorCode
)) { setErrorContext(); }
288 CollationRuleParser::parseStarredCharacters(int32_t strength
, int32_t i
, UErrorCode
&errorCode
) {
289 UnicodeString empty
, raw
;
290 i
= parseString(skipWhiteSpace(i
), raw
, errorCode
);
291 if(U_FAILURE(errorCode
)) { return; }
293 setParseError("missing starred-relation string", errorCode
);
299 while(j
< raw
.length()) {
300 UChar32 c
= raw
.char32At(j
);
301 if(!nfd
.isInert(c
)) {
302 setParseError("starred-relation string is not all NFD-inert", errorCode
);
305 sink
->addRelation(strength
, empty
, UnicodeString(c
), empty
, errorReason
, errorCode
);
306 if(U_FAILURE(errorCode
)) {
313 if(i
>= rules
->length() || rules
->charAt(i
) != 0x2d) { // '-'
317 setParseError("range without start in starred-relation string", errorCode
);
320 i
= parseString(i
+ 1, raw
, errorCode
);
321 if(U_FAILURE(errorCode
)) { return; }
323 setParseError("range without end in starred-relation string", errorCode
);
326 UChar32 c
= raw
.char32At(0);
328 setParseError("range start greater than end in starred-relation string", errorCode
);
334 if(!nfd
.isInert(prev
)) {
335 setParseError("starred-relation string range is not all NFD-inert", errorCode
);
338 if(U_IS_SURROGATE(prev
)) {
339 setParseError("starred-relation string range contains a surrogate", errorCode
);
342 if(0xfffd <= prev
&& prev
<= 0xffff) {
343 setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode
);
347 sink
->addRelation(strength
, empty
, s
, empty
, errorReason
, errorCode
);
348 if(U_FAILURE(errorCode
)) {
356 ruleIndex
= skipWhiteSpace(i
);
360 CollationRuleParser::parseTailoringString(int32_t i
, UnicodeString
&raw
, UErrorCode
&errorCode
) {
361 i
= parseString(skipWhiteSpace(i
), raw
, errorCode
);
362 if(U_SUCCESS(errorCode
) && raw
.isEmpty()) {
363 setParseError("missing relation string", errorCode
);
365 return skipWhiteSpace(i
);
369 CollationRuleParser::parseString(int32_t i
, UnicodeString
&raw
, UErrorCode
&errorCode
) {
370 if(U_FAILURE(errorCode
)) { return i
; }
372 while(i
< rules
->length()) {
373 UChar32 c
= rules
->charAt(i
++);
374 if(isSyntaxChar(c
)) {
375 if(c
== 0x27) { // apostrophe
376 if(i
< rules
->length() && rules
->charAt(i
) == 0x27) {
377 // Double apostrophe, encodes a single one.
378 raw
.append((UChar
)0x27);
382 // Quote literal text until the next single apostrophe.
384 if(i
== rules
->length()) {
385 setParseError("quoted literal text missing terminating apostrophe", errorCode
);
388 c
= rules
->charAt(i
++);
390 if(i
< rules
->length() && rules
->charAt(i
) == 0x27) {
391 // Double apostrophe inside quoted literal text,
392 // still encodes a single apostrophe.
398 raw
.append((UChar
)c
);
400 } else if(c
== 0x5c) { // backslash
401 if(i
== rules
->length()) {
402 setParseError("backslash escape at the end of the rule string", errorCode
);
405 c
= rules
->char32At(i
);
409 // Any other syntax character terminates a string.
413 } else if(PatternProps::isWhiteSpace(c
)) {
414 // Unquoted white space terminates a string.
418 raw
.append((UChar
)c
);
421 for(int32_t j
= 0; j
< raw
.length();) {
422 UChar32 c
= raw
.char32At(j
);
423 if(U_IS_SURROGATE(c
)) {
424 setParseError("string contains an unpaired surrogate", errorCode
);
427 if(0xfffd <= c
&& c
<= 0xffff) {
428 setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode
);
438 static const char *const positions
[] = {
439 "first tertiary ignorable",
440 "last tertiary ignorable",
441 "first secondary ignorable",
442 "last secondary ignorable",
443 "first primary ignorable",
444 "last primary ignorable",
458 CollationRuleParser::parseSpecialPosition(int32_t i
, UnicodeString
&str
, UErrorCode
&errorCode
) {
459 if(U_FAILURE(errorCode
)) { return 0; }
461 int32_t j
= readWords(i
+ 1, raw
);
462 if(j
> i
&& rules
->charAt(j
) == 0x5d && !raw
.isEmpty()) { // words end with ]
464 for(int32_t pos
= 0; pos
< UPRV_LENGTHOF(positions
); ++pos
) {
465 if(raw
== UnicodeString(positions
[pos
], -1, US_INV
)) {
466 str
.setTo((UChar
)POS_LEAD
).append((UChar
)(POS_BASE
+ pos
));
470 if(raw
== UNICODE_STRING_SIMPLE("top")) {
471 str
.setTo((UChar
)POS_LEAD
).append((UChar
)(POS_BASE
+ LAST_REGULAR
));
474 if(raw
== UNICODE_STRING_SIMPLE("variable top")) {
475 str
.setTo((UChar
)POS_LEAD
).append((UChar
)(POS_BASE
+ LAST_VARIABLE
));
479 setParseError("not a valid special reset position", errorCode
);
484 CollationRuleParser::parseSetting(UErrorCode
&errorCode
) {
485 if(U_FAILURE(errorCode
)) { return; }
487 int32_t i
= ruleIndex
+ 1;
488 int32_t j
= readWords(i
, raw
);
489 if(j
<= i
|| raw
.isEmpty()) {
490 setParseError("expected a setting/option at '['", errorCode
);
492 if(rules
->charAt(j
) == 0x5d) { // words end with ]
494 if(raw
.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&
495 (raw
.length() == 7 || raw
.charAt(7) == 0x20)) {
496 parseReordering(raw
, errorCode
);
500 if(raw
== UNICODE_STRING_SIMPLE("backwards 2")) {
501 settings
->setFlag(CollationSettings::BACKWARD_SECONDARY
,
502 UCOL_ON
, 0, errorCode
);
507 int32_t valueIndex
= raw
.lastIndexOf((UChar
)0x20);
508 if(valueIndex
>= 0) {
509 v
.setTo(raw
, valueIndex
+ 1);
510 raw
.truncate(valueIndex
);
512 if(raw
== UNICODE_STRING_SIMPLE("strength") && v
.length() == 1) {
513 int32_t value
= UCOL_DEFAULT
;
514 UChar c
= v
.charAt(0);
515 if(0x31 <= c
&& c
<= 0x34) { // 1..4
516 value
= UCOL_PRIMARY
+ (c
- 0x31);
517 } else if(c
== 0x49) { // 'I'
518 value
= UCOL_IDENTICAL
;
520 if(value
!= UCOL_DEFAULT
) {
521 settings
->setStrength(value
, 0, errorCode
);
525 } else if(raw
== UNICODE_STRING_SIMPLE("alternate")) {
526 UColAttributeValue value
= UCOL_DEFAULT
;
527 if(v
== UNICODE_STRING_SIMPLE("non-ignorable")) {
528 value
= UCOL_NON_IGNORABLE
;
529 } else if(v
== UNICODE_STRING_SIMPLE("shifted")) {
530 value
= UCOL_SHIFTED
;
532 if(value
!= UCOL_DEFAULT
) {
533 settings
->setAlternateHandling(value
, 0, errorCode
);
537 } else if(raw
== UNICODE_STRING_SIMPLE("maxVariable")) {
538 int32_t value
= UCOL_DEFAULT
;
539 if(v
== UNICODE_STRING_SIMPLE("space")) {
540 value
= CollationSettings::MAX_VAR_SPACE
;
541 } else if(v
== UNICODE_STRING_SIMPLE("punct")) {
542 value
= CollationSettings::MAX_VAR_PUNCT
;
543 } else if(v
== UNICODE_STRING_SIMPLE("symbol")) {
544 value
= CollationSettings::MAX_VAR_SYMBOL
;
545 } else if(v
== UNICODE_STRING_SIMPLE("currency")) {
546 value
= CollationSettings::MAX_VAR_CURRENCY
;
548 if(value
!= UCOL_DEFAULT
) {
549 settings
->setMaxVariable(value
, 0, errorCode
);
550 settings
->variableTop
= baseData
->getLastPrimaryForGroup(
551 UCOL_REORDER_CODE_FIRST
+ value
);
552 U_ASSERT(settings
->variableTop
!= 0);
556 } else if(raw
== UNICODE_STRING_SIMPLE("caseFirst")) {
557 UColAttributeValue value
= UCOL_DEFAULT
;
558 if(v
== UNICODE_STRING_SIMPLE("off")) {
560 } else if(v
== UNICODE_STRING_SIMPLE("lower")) {
561 value
= UCOL_LOWER_FIRST
;
562 } else if(v
== UNICODE_STRING_SIMPLE("upper")) {
563 value
= UCOL_UPPER_FIRST
;
565 if(value
!= UCOL_DEFAULT
) {
566 settings
->setCaseFirst(value
, 0, errorCode
);
570 } else if(raw
== UNICODE_STRING_SIMPLE("caseLevel")) {
571 UColAttributeValue value
= getOnOffValue(v
);
572 if(value
!= UCOL_DEFAULT
) {
573 settings
->setFlag(CollationSettings::CASE_LEVEL
, value
, 0, errorCode
);
577 } else if(raw
== UNICODE_STRING_SIMPLE("normalization")) {
578 UColAttributeValue value
= getOnOffValue(v
);
579 if(value
!= UCOL_DEFAULT
) {
580 settings
->setFlag(CollationSettings::CHECK_FCD
, value
, 0, errorCode
);
584 } else if(raw
== UNICODE_STRING_SIMPLE("numericOrdering")) {
585 UColAttributeValue value
= getOnOffValue(v
);
586 if(value
!= UCOL_DEFAULT
) {
587 settings
->setFlag(CollationSettings::NUMERIC
, value
, 0, errorCode
);
591 } else if(raw
== UNICODE_STRING_SIMPLE("hiraganaQ")) {
592 UColAttributeValue value
= getOnOffValue(v
);
593 if(value
!= UCOL_DEFAULT
) {
594 if(value
== UCOL_ON
) {
595 setParseError("[hiraganaQ on] is not supported", errorCode
);
600 } else if(raw
== UNICODE_STRING_SIMPLE("import")) {
602 lang
.appendInvariantChars(v
, errorCode
);
603 if(errorCode
== U_MEMORY_ALLOCATION_ERROR
) { return; }
604 // BCP 47 language tag -> ICU locale ID
605 char localeID
[ULOC_FULLNAME_CAPACITY
];
606 int32_t parsedLength
;
607 int32_t length
= uloc_forLanguageTag(lang
.data(), localeID
, ULOC_FULLNAME_CAPACITY
,
608 &parsedLength
, &errorCode
);
609 if(U_FAILURE(errorCode
) ||
610 parsedLength
!= lang
.length() || length
>= ULOC_FULLNAME_CAPACITY
) {
611 errorCode
= U_ZERO_ERROR
;
612 setParseError("expected language tag in [import langTag]", errorCode
);
615 // localeID minus all keywords
616 char baseID
[ULOC_FULLNAME_CAPACITY
];
617 length
= uloc_getBaseName(localeID
, baseID
, ULOC_FULLNAME_CAPACITY
, &errorCode
);
618 if(U_FAILURE(errorCode
) || length
>= ULOC_KEYWORDS_CAPACITY
) {
619 errorCode
= U_ZERO_ERROR
;
620 setParseError("expected language tag in [import langTag]", errorCode
);
623 if(length
== 3 && uprv_memcmp(baseID
, "und", 3) == 0) {
624 uprv_strcpy(baseID
, "root");
626 // @collation=type, or length=0 if not specified
627 char collationType
[ULOC_KEYWORDS_CAPACITY
];
628 length
= uloc_getKeywordValue(localeID
, "collation",
629 collationType
, ULOC_KEYWORDS_CAPACITY
,
631 if(U_FAILURE(errorCode
) || length
>= ULOC_KEYWORDS_CAPACITY
) {
632 errorCode
= U_ZERO_ERROR
;
633 setParseError("expected language tag in [import langTag]", errorCode
);
636 if(importer
== NULL
) {
637 setParseError("[import langTag] is not supported", errorCode
);
639 UnicodeString importedRules
;
640 importer
->getRules(baseID
, length
> 0 ? collationType
: "standard",
641 importedRules
, errorReason
, errorCode
);
642 if(U_FAILURE(errorCode
)) {
643 if(errorReason
== NULL
) {
644 errorReason
= "[import langTag] failed";
649 const UnicodeString
*outerRules
= rules
;
650 int32_t outerRuleIndex
= ruleIndex
;
651 parse(importedRules
, errorCode
);
652 if(U_FAILURE(errorCode
)) {
653 if(parseError
!= NULL
) {
654 parseError
->offset
= outerRuleIndex
;
662 } else if(rules
->charAt(j
) == 0x5b) { // words end with [
664 j
= parseUnicodeSet(j
, set
, errorCode
);
665 if(U_FAILURE(errorCode
)) { return; }
666 if(raw
== UNICODE_STRING_SIMPLE("optimize")) {
667 sink
->optimize(set
, errorReason
, errorCode
);
668 if(U_FAILURE(errorCode
)) { setErrorContext(); }
671 } else if(raw
== UNICODE_STRING_SIMPLE("suppressContractions")) {
672 sink
->suppressContractions(set
, errorReason
, errorCode
);
673 if(U_FAILURE(errorCode
)) { setErrorContext(); }
678 setParseError("not a valid setting/option", errorCode
);
682 CollationRuleParser::parseReordering(const UnicodeString
&raw
, UErrorCode
&errorCode
) {
683 if(U_FAILURE(errorCode
)) { return; }
684 int32_t i
= 7; // after "reorder"
685 if(i
== raw
.length()) {
686 // empty [reorder] with no codes
687 settings
->resetReordering();
690 // Parse the codes in [reorder aa bb cc].
691 UVector32
reorderCodes(errorCode
);
692 if(U_FAILURE(errorCode
)) { return; }
694 while(i
< raw
.length()) {
695 ++i
; // skip the word-separating space
696 int32_t limit
= raw
.indexOf((UChar
)0x20, i
);
697 if(limit
< 0) { limit
= raw
.length(); }
698 word
.clear().appendInvariantChars(raw
.tempSubStringBetween(i
, limit
), errorCode
);
699 if(U_FAILURE(errorCode
)) { return; }
700 int32_t code
= getReorderCode(word
.data());
702 setParseError("unknown script or reorder code", errorCode
);
705 reorderCodes
.addElement(code
, errorCode
);
706 if(U_FAILURE(errorCode
)) { return; }
709 settings
->setReordering(*baseData
, reorderCodes
.getBuffer(), reorderCodes
.size(), errorCode
);
// Names of the special (non-script) reorder groups accepted in [reorder ...].
// getReorderCode() maps the entry at index i to UCOL_REORDER_CODE_FIRST + i,
// so the order of the entries is significant.
static const char *const gSpecialReorderCodes[] = {
    "space",
    "punct",
    "symbol",
    "currency",
    "digit"
};
717 CollationRuleParser::getReorderCode(const char *word
) {
718 for(int32_t i
= 0; i
< UPRV_LENGTHOF(gSpecialReorderCodes
); ++i
) {
719 if(uprv_stricmp(word
, gSpecialReorderCodes
[i
]) == 0) {
720 return UCOL_REORDER_CODE_FIRST
+ i
;
723 int32_t script
= u_getPropertyValueEnum(UCHAR_SCRIPT
, word
);
727 if(uprv_stricmp(word
, "others") == 0) {
728 return UCOL_REORDER_CODE_OTHERS
; // same as Zzzz = USCRIPT_UNKNOWN
734 CollationRuleParser::getOnOffValue(const UnicodeString
&s
) {
735 if(s
== UNICODE_STRING_SIMPLE("on")) {
737 } else if(s
== UNICODE_STRING_SIMPLE("off")) {
745 CollationRuleParser::parseUnicodeSet(int32_t i
, UnicodeSet
&set
, UErrorCode
&errorCode
) {
746 // Collect a UnicodeSet pattern between a balanced pair of [brackets].
750 if(j
== rules
->length()) {
751 setParseError("unbalanced UnicodeSet pattern brackets", errorCode
);
754 UChar c
= rules
->charAt(j
++);
755 if(c
== 0x5b) { // '['
757 } else if(c
== 0x5d) { // ']'
758 if(--level
== 0) { break; }
761 set
.applyPattern(rules
->tempSubStringBetween(i
, j
), errorCode
);
762 if(U_FAILURE(errorCode
)) {
763 errorCode
= U_ZERO_ERROR
;
764 setParseError("not a valid UnicodeSet pattern", errorCode
);
767 j
= skipWhiteSpace(j
);
768 if(j
== rules
->length() || rules
->charAt(j
) != 0x5d) {
769 setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode
);
776 CollationRuleParser::readWords(int32_t i
, UnicodeString
&raw
) const {
777 static const UChar sp
= 0x20;
779 i
= skipWhiteSpace(i
);
781 if(i
>= rules
->length()) { return 0; }
782 UChar c
= rules
->charAt(i
);
783 if(isSyntaxChar(c
) && c
!= 0x2d && c
!= 0x5f) { // syntax except -_
784 if(raw
.isEmpty()) { return i
; }
785 if(raw
.endsWith(&sp
, 1)) { // remove trailing space
786 raw
.truncate(raw
.length() - 1);
790 if(PatternProps::isWhiteSpace(c
)) {
792 i
= skipWhiteSpace(i
+ 1);
801 CollationRuleParser::skipComment(int32_t i
) const {
802 // skip to past the newline
803 while(i
< rules
->length()) {
804 UChar c
= rules
->charAt(i
++);
805 // LF or FF or CR or NEL or LS or PS
806 if(c
== 0xa || c
== 0xc || c
== 0xd || c
== 0x85 || c
== 0x2028 || c
== 0x2029) {
807 // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
808 // NLF (new line function) = CR or LF or CR+LF or NEL.
809 // No need to collect all of CR+LF because a following LF will be ignored anyway.
817 CollationRuleParser::setParseError(const char *reason
, UErrorCode
&errorCode
) {
818 if(U_FAILURE(errorCode
)) { return; }
819 // Error code consistent with the old parser (from ca. 2001),
820 // rather than U_PARSE_ERROR;
821 errorCode
= U_INVALID_FORMAT_ERROR
;
822 errorReason
= reason
;
823 if(parseError
!= NULL
) { setErrorContext(); }
827 CollationRuleParser::setErrorContext() {
828 if(parseError
== NULL
) { return; }
830 // Note: This relies on the calling code maintaining the ruleIndex
831 // at a position that is useful for debugging.
832 // For example, at the beginning of a reset or relation etc.
833 parseError
->offset
= ruleIndex
;
834 parseError
->line
= 0; // We are not counting line numbers.
837 int32_t start
= ruleIndex
- (U_PARSE_CONTEXT_LEN
- 1);
840 } else if(start
> 0 && U16_IS_TRAIL(rules
->charAt(start
))) {
843 int32_t length
= ruleIndex
- start
;
844 rules
->extract(start
, length
, parseError
->preContext
);
845 parseError
->preContext
[length
] = 0;
847 // starting from ruleIndex
848 length
= rules
->length() - ruleIndex
;
849 if(length
>= U_PARSE_CONTEXT_LEN
) {
850 length
= U_PARSE_CONTEXT_LEN
- 1;
851 if(U16_IS_LEAD(rules
->charAt(ruleIndex
+ length
- 1))) {
855 rules
->extract(ruleIndex
, length
, parseError
->postContext
);
856 parseError
->postContext
[length
] = 0;
860 CollationRuleParser::isSyntaxChar(UChar32 c
) {
861 return 0x21 <= c
&& c
<= 0x7e &&
862 (c
<= 0x2f || (0x3a <= c
&& c
<= 0x40) ||
863 (0x5b <= c
&& c
<= 0x60) || (0x7b <= c
));
867 CollationRuleParser::skipWhiteSpace(int32_t i
) const {
868 while(i
< rules
->length() && PatternProps::isWhiteSpace(rules
->charAt(i
))) {
876 #endif // !UCONFIG_NO_COLLATION