icuSources/i18n/collationruleparser.cpp

   1 /*
   2 *******************************************************************************
   3 * Copyright (C) 2013-2015, International Business Machines
   4 * Corporation and others.  All Rights Reserved.
   5 *******************************************************************************
   6 * collationruleparser.cpp
   7 *
   8 * (replaced the former ucol_tok.cpp)
   9 *
  10 * created on: 2013apr10
  11 * created by: Markus W. Scherer
  12 */
  13
  14 #include "unicode/utypes.h"
  15
  16 #if !UCONFIG_NO_COLLATION
  17
  18 #include "unicode/normalizer2.h"
  19 #include "unicode/parseerr.h"
  20 #include "unicode/uchar.h"
  21 #include "unicode/ucol.h"
  22 #include "unicode/uloc.h"
  23 #include "unicode/unistr.h"
  24 #include "unicode/utf16.h"
  25 #include "charstr.h"
  26 #include "cmemory.h"
  27 #include "collation.h"
  28 #include "collationdata.h"
  29 #include "collationruleparser.h"
  30 #include "collationsettings.h"
  31 #include "collationtailoring.h"
  32 #include "cstring.h"
  33 #include "patternprops.h"
  34 #include "uassert.h"
  35 #include "uvectr32.h"
  36
  37 U_NAMESPACE_BEGIN
  38
  39 namespace {
  40
  41 static const UChar BEFORE[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 };  // "[before"
  42 const int32_t BEFORE_LENGTH = 7;
  43
  44 }  // namespace
  45
  46 CollationRuleParser::Sink::~Sink() {}
  47
  48 void
  49 CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {}
  50
  51 void
  52 CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {}
  53
  54 CollationRuleParser::Importer::~Importer() {}
  55
  56 CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode)
  57         : nfd(*Normalizer2::getNFDInstance(errorCode)),
  58           nfc(*Normalizer2::getNFCInstance(errorCode)),
  59           rules(NULL), baseData(base), settings(NULL),
  60           parseError(NULL), errorReason(NULL),
  61           sink(NULL), importer(NULL),
  62           ruleIndex(0) {
  63 }
  64
  65 CollationRuleParser::~CollationRuleParser() {
  66 }
  67
  68 void
  69 CollationRuleParser::parse(const UnicodeString &ruleString,
  70                            CollationSettings &outSettings,
  71                            UParseError *outParseError,
  72                            UErrorCode &errorCode) {
  73     if(U_FAILURE(errorCode)) { return; }
  74     settings = &outSettings;
  75     parseError = outParseError;
  76     if(parseError != NULL) {
  77         parseError->line = 0;
  78         parseError->offset = -1;
  79         parseError->preContext[0] = 0;
  80         parseError->postContext[0] = 0;
  81     }
  82     errorReason = NULL;
  83     parse(ruleString, errorCode);
  84 }
  85
  86 void
  87 CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) {
  88     if(U_FAILURE(errorCode)) { return; }
  89     rules = &ruleString;
  90     ruleIndex = 0;
  91
  92     while(ruleIndex < rules->length()) {
  93         UChar c = rules->charAt(ruleIndex);
  94         if(PatternProps::isWhiteSpace(c)) {
  95             ++ruleIndex;
  96             continue;
  97         }
  98         switch(c) {
  99         case 0x26:  // '&'
 100             parseRuleChain(errorCode);
 101             break;
 102         case 0x5b:  // '['
 103             parseSetting(errorCode);
 104             break;
 105         case 0x23:  // '#' starts a comment, until the end of the line
 106             ruleIndex = skipComment(ruleIndex + 1);
 107             break;
 108         case 0x40:  // '@' is equivalent to [backwards 2]
 109             settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
 110                               UCOL_ON, 0, errorCode);
 111             ++ruleIndex;
 112             break;
 113         case 0x21:  // '!' used to turn on Thai/Lao character reversal
 114             // Accept but ignore. The root collator has contractions
 115             // that are equivalent to the character reversal, where appropriate.
 116             ++ruleIndex;
 117             break;
 118         default:
 119             setParseError("expected a reset or setting or comment", errorCode);
 120             break;
 121         }
 122         if(U_FAILURE(errorCode)) { return; }
 123     }
 124 }
 125
 126 void
 127 CollationRuleParser::parseRuleChain(UErrorCode &errorCode) {
 128     int32_t resetStrength = parseResetAndPosition(errorCode);
 129     UBool isFirstRelation = TRUE;
 130     for(;;) {
 131         int32_t result = parseRelationOperator(errorCode);
 132         if(U_FAILURE(errorCode)) { return; }
 133         if(result < 0) {
 134             if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) {
 135                 // '#' starts a comment, until the end of the line
 136                 ruleIndex = skipComment(ruleIndex + 1);
 137                 continue;
 138             }
 139             if(isFirstRelation) {
 140                 setParseError("reset not followed by a relation", errorCode);
 141             }
 142             return;
 143         }
 144         int32_t strength = result & STRENGTH_MASK;
 145         if(resetStrength < UCOL_IDENTICAL) {
 146             // reset-before rule chain
 147             if(isFirstRelation) {
 148                 if(strength != resetStrength) {
 149                     setParseError("reset-before strength differs from its first relation", errorCode);
 150                     return;
 151                 }
 152             } else {
 153                 if(strength < resetStrength) {
 154                     setParseError("reset-before strength followed by a stronger relation", errorCode);
 155                     return;
 156                 }
 157             }
 158         }
 159         int32_t i = ruleIndex + (result >> OFFSET_SHIFT);  // skip over the relation operator
 160         if((result & STARRED_FLAG) == 0) {
 161             parseRelationStrings(strength, i, errorCode);
 162         } else {
 163             parseStarredCharacters(strength, i, errorCode);
 164         }
 165         if(U_FAILURE(errorCode)) { return; }
 166         isFirstRelation = FALSE;
 167     }
 168 }
 169
 170 int32_t
 171 CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) {
 172     if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
 173     int32_t i = skipWhiteSpace(ruleIndex + 1);
 174     int32_t j;
 175     UChar c;
 176     int32_t resetStrength;
 177     if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0 &&
 178             (j = i + BEFORE_LENGTH) < rules->length() &&
 179             PatternProps::isWhiteSpace(rules->charAt(j)) &&
 180             ((j = skipWhiteSpace(j + 1)) + 1) < rules->length() &&
 181             0x31 <= (c = rules->charAt(j)) && c <= 0x33 &&
 182             rules->charAt(j + 1) == 0x5d) {
 183         // &[before n] with n=1 or 2 or 3
 184         resetStrength = UCOL_PRIMARY + (c - 0x31);
 185         i = skipWhiteSpace(j + 2);
 186     } else {
 187         resetStrength = UCOL_IDENTICAL;
 188     }
 189     if(i >= rules->length()) {
 190         setParseError("reset without position", errorCode);
 191         return UCOL_DEFAULT;
 192     }
 193     UnicodeString str;
 194     if(rules->charAt(i) == 0x5b) {  // '['
 195         i = parseSpecialPosition(i, str, errorCode);
 196     } else {
 197         i = parseTailoringString(i, str, errorCode);
 198     }
 199     sink->addReset(resetStrength, str, errorReason, errorCode);
 200     if(U_FAILURE(errorCode)) { setErrorContext(); }
 201     ruleIndex = i;
 202     return resetStrength;
 203 }
 204
 205 int32_t
 206 CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) {
 207     if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
 208     ruleIndex = skipWhiteSpace(ruleIndex);
 209     if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; }
 210     int32_t strength;
 211     int32_t i = ruleIndex;
 212     UChar c = rules->charAt(i++);
 213     switch(c) {
 214     case 0x3c:  // '<'
 215         if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<
 216             ++i;
 217             if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<<
 218                 ++i;
 219                 if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<<<
 220                     ++i;
 221                     strength = UCOL_QUATERNARY;
 222                 } else {
 223                     strength = UCOL_TERTIARY;
 224                 }
 225             } else {
 226                 strength = UCOL_SECONDARY;
 227             }
 228         } else {
 229             strength = UCOL_PRIMARY;
 230         }
 231         if(i < rules->length() && rules->charAt(i) == 0x2a) {  // '*'
 232             ++i;
 233             strength |= STARRED_FLAG;
 234         }
 235         break;
 236     case 0x3b:  // ';' same as <<
 237         strength = UCOL_SECONDARY;
 238         break;
 239     case 0x2c:  // ',' same as <<<
 240         strength = UCOL_TERTIARY;
 241         break;
 242     case 0x3d:  // '='
 243         strength = UCOL_IDENTICAL;
 244         if(i < rules->length() && rules->charAt(i) == 0x2a) {  // '*'
 245             ++i;
 246             strength |= STARRED_FLAG;
 247         }
 248         break;
 249     default:
 250         return UCOL_DEFAULT;
 251     }
 252     return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
 253 }
 254
 255 void
 256 CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) {
 257     // Parse
 258     //     prefix | str / extension
 259     // where prefix and extension are optional.
 260     UnicodeString prefix, str, extension;
 261     i = parseTailoringString(i, str, errorCode);
 262     if(U_FAILURE(errorCode)) { return; }
 263     UChar next = (i < rules->length()) ? rules->charAt(i) : 0;
 264     if(next == 0x7c) {  // '|' separates the context prefix from the string.
 265         prefix = str;
 266         i = parseTailoringString(i + 1, str, errorCode);
 267         if(U_FAILURE(errorCode)) { return; }
 268         next = (i < rules->length()) ? rules->charAt(i) : 0;
 269     }
 270     if(next == 0x2f) {  // '/' separates the string from the extension.
 271         i = parseTailoringString(i + 1, extension, errorCode);
 272     }
 273     if(!prefix.isEmpty()) {
 274         UChar32 prefix0 = prefix.char32At(0);
 275         UChar32 c = str.char32At(0);
 276         if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
 277             setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary",
 278                           errorCode);
 279             return;
 280         }
 281     }
 282     sink->addRelation(strength, prefix, str, extension, errorReason, errorCode);
 283     if(U_FAILURE(errorCode)) { setErrorContext(); }
 284     ruleIndex = i;
 285 }
 286
 287 void
 288 CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) {
 289     UnicodeString empty, raw;
 290     i = parseString(skipWhiteSpace(i), raw, errorCode);
 291     if(U_FAILURE(errorCode)) { return; }
 292     if(raw.isEmpty()) {
 293         setParseError("missing starred-relation string", errorCode);
 294         return;
 295     }
 296     UChar32 prev = -1;
 297     int32_t j = 0;
 298     for(;;) {
 299         while(j < raw.length()) {
 300             UChar32 c = raw.char32At(j);
 301             if(!nfd.isInert(c)) {
 302                 setParseError("starred-relation string is not all NFD-inert", errorCode);
 303                 return;
 304             }
 305             sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode);
 306             if(U_FAILURE(errorCode)) {
 307                 setErrorContext();
 308                 return;
 309             }
 310             j += U16_LENGTH(c);
 311             prev = c;
 312         }
 313         if(i >= rules->length() || rules->charAt(i) != 0x2d) {  // '-'
 314             break;
 315         }
 316         if(prev < 0) {
 317             setParseError("range without start in starred-relation string", errorCode);
 318             return;
 319         }
 320         i = parseString(i + 1, raw, errorCode);
 321         if(U_FAILURE(errorCode)) { return; }
 322         if(raw.isEmpty()) {
 323             setParseError("range without end in starred-relation string", errorCode);
 324             return;
 325         }
 326         UChar32 c = raw.char32At(0);
 327         if(c < prev) {
 328             setParseError("range start greater than end in starred-relation string", errorCode);
 329             return;
 330         }
 331         // range prev-c
 332         UnicodeString s;
 333         while(++prev <= c) {
 334             if(!nfd.isInert(prev)) {
 335                 setParseError("starred-relation string range is not all NFD-inert", errorCode);
 336                 return;
 337             }
 338             if(U_IS_SURROGATE(prev)) {
 339                 setParseError("starred-relation string range contains a surrogate", errorCode);
 340                 return;
 341             }
 342             if(0xfffd <= prev && prev <= 0xffff) {
 343                 setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode);
 344                 return;
 345             }
 346             s.setTo(prev);
 347             sink->addRelation(strength, empty, s, empty, errorReason, errorCode);
 348             if(U_FAILURE(errorCode)) {
 349                 setErrorContext();
 350                 return;
 351             }
 352         }
 353         prev = -1;
 354         j = U16_LENGTH(c);
 355     }
 356     ruleIndex = skipWhiteSpace(i);
 357 }
 358
 359 int32_t
 360 CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
 361     i = parseString(skipWhiteSpace(i), raw, errorCode);
 362     if(U_SUCCESS(errorCode) && raw.isEmpty()) {
 363         setParseError("missing relation string", errorCode);
 364     }
 365     return skipWhiteSpace(i);
 366 }
 367
 368 int32_t
 369 CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
 370     if(U_FAILURE(errorCode)) { return i; }
 371     raw.remove();
 372     while(i < rules->length()) {
 373         UChar32 c = rules->charAt(i++);
 374         if(isSyntaxChar(c)) {
 375             if(c == 0x27) {  // apostrophe
 376                 if(i < rules->length() && rules->charAt(i) == 0x27) {
 377                     // Double apostrophe, encodes a single one.
 378                     raw.append((UChar)0x27);
 379                     ++i;
 380                     continue;
 381                 }
 382                 // Quote literal text until the next single apostrophe.
 383                 for(;;) {
 384                     if(i == rules->length()) {
 385                         setParseError("quoted literal text missing terminating apostrophe", errorCode);
 386                         return i;
 387                     }
 388                     c = rules->charAt(i++);
 389                     if(c == 0x27) {
 390                         if(i < rules->length() && rules->charAt(i) == 0x27) {
 391                             // Double apostrophe inside quoted literal text,
 392                             // still encodes a single apostrophe.
 393                             ++i;
 394                         } else {
 395                             break;
 396                         }
 397                     }
 398                     raw.append((UChar)c);
 399                 }
 400             } else if(c == 0x5c) {  // backslash
 401                 if(i == rules->length()) {
 402                     setParseError("backslash escape at the end of the rule string", errorCode);
 403                     return i;
 404                 }
 405                 c = rules->char32At(i);
 406                 raw.append(c);
 407                 i += U16_LENGTH(c);
 408             } else {
 409                 // Any other syntax character terminates a string.
 410                 --i;
 411                 break;
 412             }
 413         } else if(PatternProps::isWhiteSpace(c)) {
 414             // Unquoted white space terminates a string.
 415             --i;
 416             break;
 417         } else {
 418             raw.append((UChar)c);
 419         }
 420     }
 421     for(int32_t j = 0; j < raw.length();) {
 422         UChar32 c = raw.char32At(j);
 423         if(U_IS_SURROGATE(c)) {
 424             setParseError("string contains an unpaired surrogate", errorCode);
 425             return i;
 426         }
 427         if(0xfffd <= c && c <= 0xffff) {
 428             setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode);
 429             return i;
 430         }
 431         j += U16_LENGTH(c);
 432     }
 433     return i;
 434 }
 435
 436 namespace {
 437
 438 static const char *const positions[] = {
 439     "first tertiary ignorable",
 440     "last tertiary ignorable",
 441     "first secondary ignorable",
 442     "last secondary ignorable",
 443     "first primary ignorable",
 444     "last primary ignorable",
 445     "first variable",
 446     "last variable",
 447     "first regular",
 448     "last regular",
 449     "first implicit",
 450     "last implicit",
 451     "first trailing",
 452     "last trailing"
 453 };
 454
 455 }  // namespace
 456
 457 int32_t
 458 CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) {
 459     if(U_FAILURE(errorCode)) { return 0; }
 460     UnicodeString raw;
 461     int32_t j = readWords(i + 1, raw);
 462     if(j > i && rules->charAt(j) == 0x5d && !raw.isEmpty()) {  // words end with ]
 463         ++j;
 464         for(int32_t pos = 0; pos < UPRV_LENGTHOF(positions); ++pos) {
 465             if(raw == UnicodeString(positions[pos], -1, US_INV)) {
 466                 str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + pos));
 467                 return j;
 468             }
 469         }
 470         if(raw == UNICODE_STRING_SIMPLE("top")) {
 471             str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_REGULAR));
 472             return j;
 473         }
 474         if(raw == UNICODE_STRING_SIMPLE("variable top")) {
 475             str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_VARIABLE));
 476             return j;
 477         }
 478     }
 479     setParseError("not a valid special reset position", errorCode);
 480     return i;
 481 }
 482
 483 void
 484 CollationRuleParser::parseSetting(UErrorCode &errorCode) {
 485     if(U_FAILURE(errorCode)) { return; }
 486     UnicodeString raw;
 487     int32_t i = ruleIndex + 1;
 488     int32_t j = readWords(i, raw);
 489     if(j <= i || raw.isEmpty()) {
 490         setParseError("expected a setting/option at '['", errorCode);
 491     }
 492     if(rules->charAt(j) == 0x5d) {  // words end with ]
 493         ++j;
 494         if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&
 495                 (raw.length() == 7 || raw.charAt(7) == 0x20)) {
 496             parseReordering(raw, errorCode);
 497             ruleIndex = j;
 498             return;
 499         }
 500         if(raw == UNICODE_STRING_SIMPLE("backwards 2")) {
 501             settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
 502                               UCOL_ON, 0, errorCode);
 503             ruleIndex = j;
 504             return;
 505         }
 506         UnicodeString v;
 507         int32_t valueIndex = raw.lastIndexOf((UChar)0x20);
 508         if(valueIndex >= 0) {
 509             v.setTo(raw, valueIndex + 1);
 510             raw.truncate(valueIndex);
 511         }
 512         if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) {
 513             int32_t value = UCOL_DEFAULT;
 514             UChar c = v.charAt(0);
 515             if(0x31 <= c && c <= 0x34) {  // 1..4
 516                 value = UCOL_PRIMARY + (c - 0x31);
 517             } else if(c == 0x49) {  // 'I'
 518                 value = UCOL_IDENTICAL;
 519             }
 520             if(value != UCOL_DEFAULT) {
 521                 settings->setStrength(value, 0, errorCode);
 522                 ruleIndex = j;
 523                 return;
 524             }
 525         } else if(raw == UNICODE_STRING_SIMPLE("alternate")) {
 526             UColAttributeValue value = UCOL_DEFAULT;
 527             if(v == UNICODE_STRING_SIMPLE("non-ignorable")) {
 528                 value = UCOL_NON_IGNORABLE;
 529             } else if(v == UNICODE_STRING_SIMPLE("shifted")) {
 530                 value = UCOL_SHIFTED;
 531             }
 532             if(value != UCOL_DEFAULT) {
 533                 settings->setAlternateHandling(value, 0, errorCode);
 534                 ruleIndex = j;
 535                 return;
 536             }
 537         } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) {
 538             int32_t value = UCOL_DEFAULT;
 539             if(v == UNICODE_STRING_SIMPLE("space")) {
 540                 value = CollationSettings::MAX_VAR_SPACE;
 541             } else if(v == UNICODE_STRING_SIMPLE("punct")) {
 542                 value = CollationSettings::MAX_VAR_PUNCT;
 543             } else if(v == UNICODE_STRING_SIMPLE("symbol")) {
 544                 value = CollationSettings::MAX_VAR_SYMBOL;
 545             } else if(v == UNICODE_STRING_SIMPLE("currency")) {
 546                 value = CollationSettings::MAX_VAR_CURRENCY;
 547             }
 548             if(value != UCOL_DEFAULT) {
 549                 settings->setMaxVariable(value, 0, errorCode);
 550                 settings->variableTop = baseData->getLastPrimaryForGroup(
 551                     UCOL_REORDER_CODE_FIRST + value);
 552                 U_ASSERT(settings->variableTop != 0);
 553                 ruleIndex = j;
 554                 return;
 555             }
 556         } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) {
 557             UColAttributeValue value = UCOL_DEFAULT;
 558             if(v == UNICODE_STRING_SIMPLE("off")) {
 559                 value = UCOL_OFF;
 560             } else if(v == UNICODE_STRING_SIMPLE("lower")) {
 561                 value = UCOL_LOWER_FIRST;
 562             } else if(v == UNICODE_STRING_SIMPLE("upper")) {
 563                 value = UCOL_UPPER_FIRST;
 564             }
 565             if(value != UCOL_DEFAULT) {
 566                 settings->setCaseFirst(value, 0, errorCode);
 567                 ruleIndex = j;
 568                 return;
 569             }
 570         } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) {
 571             UColAttributeValue value = getOnOffValue(v);
 572             if(value != UCOL_DEFAULT) {
 573                 settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode);
 574                 ruleIndex = j;
 575                 return;
 576             }
 577         } else if(raw == UNICODE_STRING_SIMPLE("normalization")) {
 578             UColAttributeValue value = getOnOffValue(v);
 579             if(value != UCOL_DEFAULT) {
 580                 settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode);
 581                 ruleIndex = j;
 582                 return;
 583             }
 584         } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) {
 585             UColAttributeValue value = getOnOffValue(v);
 586             if(value != UCOL_DEFAULT) {
 587                 settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode);
 588                 ruleIndex = j;
 589                 return;
 590             }
 591         } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) {
 592             UColAttributeValue value = getOnOffValue(v);
 593             if(value != UCOL_DEFAULT) {
 594                 if(value == UCOL_ON) {
 595                     setParseError("[hiraganaQ on] is not supported", errorCode);
 596                 }
 597                 ruleIndex = j;
 598                 return;
 599             }
 600         } else if(raw == UNICODE_STRING_SIMPLE("import")) {
 601             CharString lang;
 602             lang.appendInvariantChars(v, errorCode);
 603             if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; }
 604             // BCP 47 language tag -> ICU locale ID
 605             char localeID[ULOC_FULLNAME_CAPACITY];
 606             int32_t parsedLength;
 607             int32_t length = uloc_forLanguageTag(lang.data(), localeID, ULOC_FULLNAME_CAPACITY,
 608                                                  &parsedLength, &errorCode);
 609             if(U_FAILURE(errorCode) ||
 610                     parsedLength != lang.length() || length >= ULOC_FULLNAME_CAPACITY) {
 611                 errorCode = U_ZERO_ERROR;
 612                 setParseError("expected language tag in [import langTag]", errorCode);
 613                 return;
 614             }
 615             // localeID minus all keywords
 616             char baseID[ULOC_FULLNAME_CAPACITY];
 617             length = uloc_getBaseName(localeID, baseID, ULOC_FULLNAME_CAPACITY, &errorCode);
 618             if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
 619                 errorCode = U_ZERO_ERROR;
 620                 setParseError("expected language tag in [import langTag]", errorCode);
 621                 return;
 622             }
 623             if(length == 3 && uprv_memcmp(baseID, "und", 3) == 0) {
 624                 uprv_strcpy(baseID, "root");
 625             }
 626             // @collation=type, or length=0 if not specified
 627             char collationType[ULOC_KEYWORDS_CAPACITY];
 628             length = uloc_getKeywordValue(localeID, "collation",
 629                                           collationType, ULOC_KEYWORDS_CAPACITY,
 630                                           &errorCode);
 631             if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
 632                 errorCode = U_ZERO_ERROR;
 633                 setParseError("expected language tag in [import langTag]", errorCode);
 634                 return;
 635             }
 636             if(importer == NULL) {
 637                 setParseError("[import langTag] is not supported", errorCode);
 638             } else {
 639                 UnicodeString importedRules;
 640                 importer->getRules(baseID, length > 0 ? collationType : "standard",
 641                                    importedRules, errorReason, errorCode);
 642                 if(U_FAILURE(errorCode)) {
 643                     if(errorReason == NULL) {
 644                         errorReason = "[import langTag] failed";
 645                     }
 646                     setErrorContext();
 647                     return;
 648                 }
 649                 const UnicodeString *outerRules = rules;
 650                 int32_t outerRuleIndex = ruleIndex;
 651                 parse(importedRules, errorCode);
 652                 if(U_FAILURE(errorCode)) {
 653                     if(parseError != NULL) {
 654                         parseError->offset = outerRuleIndex;
 655                     }
 656                 }
 657                 rules = outerRules;
 658                 ruleIndex = j;
 659             }
 660             return;
 661         }
 662     } else if(rules->charAt(j) == 0x5b) {  // words end with [
 663         UnicodeSet set;
 664         j = parseUnicodeSet(j, set, errorCode);
 665         if(U_FAILURE(errorCode)) { return; }
 666         if(raw == UNICODE_STRING_SIMPLE("optimize")) {
 667             sink->optimize(set, errorReason, errorCode);
 668             if(U_FAILURE(errorCode)) { setErrorContext(); }
 669             ruleIndex = j;
 670             return;
 671         } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) {
 672             sink->suppressContractions(set, errorReason, errorCode);
 673             if(U_FAILURE(errorCode)) { setErrorContext(); }
 674             ruleIndex = j;
 675             return;
 676         }
 677     }
 678     setParseError("not a valid setting/option", errorCode);
 679 }
 680
 681 void
 682 CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) {
 683     if(U_FAILURE(errorCode)) { return; }
 684     int32_t i = 7;  // after "reorder"
 685     if(i == raw.length()) {
 686         // empty [reorder] with no codes
 687         settings->resetReordering();
 688         return;
 689     }
 690     // Parse the codes in [reorder aa bb cc].
 691     UVector32 reorderCodes(errorCode);
 692     if(U_FAILURE(errorCode)) { return; }
 693     CharString word;
 694     while(i < raw.length()) {
 695         ++i;  // skip the word-separating space
 696         int32_t limit = raw.indexOf((UChar)0x20, i);
 697         if(limit < 0) { limit = raw.length(); }
 698         word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode);
 699         if(U_FAILURE(errorCode)) { return; }
 700         int32_t code = getReorderCode(word.data());
 701         if(code < 0) {
 702             setParseError("unknown script or reorder code", errorCode);
 703             return;
 704         }
 705         reorderCodes.addElement(code, errorCode);
 706         if(U_FAILURE(errorCode)) { return; }
 707         i = limit;
 708     }
 709     settings->setReordering(*baseData, reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
 710 }
 711
 712 static const char *const gSpecialReorderCodes[] = {
 713     "space", "punct", "symbol", "currency", "digit"
 714 };
 715
 716 int32_t
 717 CollationRuleParser::getReorderCode(const char *word) {
 718     for(int32_t i = 0; i < UPRV_LENGTHOF(gSpecialReorderCodes); ++i) {
 719         if(uprv_stricmp(word, gSpecialReorderCodes[i]) == 0) {
 720             return UCOL_REORDER_CODE_FIRST + i;
 721         }
 722     }
 723     int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
 724     if(script >= 0) {
 725         return script;
 726     }
 727     if(uprv_stricmp(word, "others") == 0) {
 728         return UCOL_REORDER_CODE_OTHERS;  // same as Zzzz = USCRIPT_UNKNOWN
 729     }
 730     return -1;
 731 }
 732
 733 UColAttributeValue
 734 CollationRuleParser::getOnOffValue(const UnicodeString &s) {
 735     if(s == UNICODE_STRING_SIMPLE("on")) {
 736         return UCOL_ON;
 737     } else if(s == UNICODE_STRING_SIMPLE("off")) {
 738         return UCOL_OFF;
 739     } else {
 740         return UCOL_DEFAULT;
 741     }
 742 }
 743
 744 int32_t
 745 CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) {
 746     // Collect a UnicodeSet pattern between a balanced pair of [brackets].
 747     int32_t level = 0;
 748     int32_t j = i;
 749     for(;;) {
 750         if(j == rules->length()) {
 751             setParseError("unbalanced UnicodeSet pattern brackets", errorCode);
 752             return j;
 753         }
 754         UChar c = rules->charAt(j++);
 755         if(c == 0x5b) {  // '['
 756             ++level;
 757         } else if(c == 0x5d) {  // ']'
 758             if(--level == 0) { break; }
 759         }
 760     }
 761     set.applyPattern(rules->tempSubStringBetween(i, j), errorCode);
 762     if(U_FAILURE(errorCode)) {
 763         errorCode = U_ZERO_ERROR;
 764         setParseError("not a valid UnicodeSet pattern", errorCode);
 765         return j;
 766     }
 767     j = skipWhiteSpace(j);
 768     if(j == rules->length() || rules->charAt(j) != 0x5d) {
 769         setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode);
 770         return j;
 771     }
 772     return ++j;
 773 }
 774
 775 int32_t
 776 CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const {
 777     static const UChar sp = 0x20;
 778     raw.remove();
 779     i = skipWhiteSpace(i);
 780     for(;;) {
 781         if(i >= rules->length()) { return 0; }
 782         UChar c = rules->charAt(i);
 783         if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) {  // syntax except -_
 784             if(raw.isEmpty()) { return i; }
 785             if(raw.endsWith(&sp, 1)) {  // remove trailing space
 786                 raw.truncate(raw.length() - 1);
 787             }
 788             return i;
 789         }
 790         if(PatternProps::isWhiteSpace(c)) {
 791             raw.append(0x20);
 792             i = skipWhiteSpace(i + 1);
 793         } else {
 794             raw.append(c);
 795             ++i;
 796         }
 797     }
 798 }
 799
 800 int32_t
 801 CollationRuleParser::skipComment(int32_t i) const {
 802     // skip to past the newline
 803     while(i < rules->length()) {
 804         UChar c = rules->charAt(i++);
 805         // LF or FF or CR or NEL or LS or PS
 806         if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) {
 807             // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
 808             // NLF (new line function) = CR or LF or CR+LF or NEL.
 809             // No need to collect all of CR+LF because a following LF will be ignored anyway.
 810             break;
 811         }
 812     }
 813     return i;
 814 }
 815
 816 void
 817 CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) {
 818     if(U_FAILURE(errorCode)) { return; }
 819     // Error code consistent with the old parser (from ca. 2001),
 820     // rather than U_PARSE_ERROR;
 821     errorCode = U_INVALID_FORMAT_ERROR;
 822     errorReason = reason;
 823     if(parseError != NULL) { setErrorContext(); }
 824 }
 825
 826 void
 827 CollationRuleParser::setErrorContext() {
 828     if(parseError == NULL) { return; }
 829
 830     // Note: This relies on the calling code maintaining the ruleIndex
 831     // at a position that is useful for debugging.
 832     // For example, at the beginning of a reset or relation etc.
 833     parseError->offset = ruleIndex;
 834     parseError->line = 0;  // We are not counting line numbers.
 835
 836     // before ruleIndex
 837     int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
 838     if(start < 0) {
 839         start = 0;
 840     } else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) {
 841         ++start;
 842     }
 843     int32_t length = ruleIndex - start;
 844     rules->extract(start, length, parseError->preContext);
 845     parseError->preContext[length] = 0;
 846
 847     // starting from ruleIndex
 848     length = rules->length() - ruleIndex;
 849     if(length >= U_PARSE_CONTEXT_LEN) {
 850         length = U_PARSE_CONTEXT_LEN - 1;
 851         if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) {
 852             --length;
 853         }
 854     }
 855     rules->extract(ruleIndex, length, parseError->postContext);
 856     parseError->postContext[length] = 0;
 857 }
 858
 859 UBool
 860 CollationRuleParser::isSyntaxChar(UChar32 c) {
 861     return 0x21 <= c && c <= 0x7e &&
 862             (c <= 0x2f || (0x3a <= c && c <= 0x40) ||
 863             (0x5b <= c && c <= 0x60) || (0x7b <= c));
 864 }
 865
 866 int32_t
 867 CollationRuleParser::skipWhiteSpace(int32_t i) const {
 868     while(i < rules->length() && PatternProps::isWhiteSpace(rules->charAt(i))) {
 869         ++i;
 870     }
 871     return i;
 872 }
 873
 874 U_NAMESPACE_END
 875
 876 #endif  // !UCONFIG_NO_COLLATION