icuSources/test/intltest/collationtest.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 *******************************************************************************
   5 * Copyright (C) 2012-2015, International Business Machines
   6 * Corporation and others.  All Rights Reserved.
   7 *******************************************************************************
   8 * collationtest.cpp
   9 *
  10 * created on: 2012apr27
  11 * created by: Markus W. Scherer
  12 */
  13
  14 #include "unicode/utypes.h"
  15
  16 #if !UCONFIG_NO_COLLATION
  17
  18 #include "unicode/coll.h"
  19 #include "unicode/errorcode.h"
  20 #include "unicode/localpointer.h"
  21 #include "unicode/normalizer2.h"
  22 #include "unicode/sortkey.h"
  23 #include "unicode/std_string.h"
  24 #include "unicode/strenum.h"
  25 #include "unicode/tblcoll.h"
  26 #include "unicode/uiter.h"
  27 #include "unicode/uniset.h"
  28 #include "unicode/unistr.h"
  29 #include "unicode/usetiter.h"
  30 #include "unicode/ustring.h"
  31 #include "charstr.h"
  32 #include "cmemory.h"
  33 #include "collation.h"
  34 #include "collationdata.h"
  35 #include "collationfcd.h"
  36 #include "collationiterator.h"
  37 #include "collationroot.h"
  38 #include "collationrootelements.h"
  39 #include "collationruleparser.h"
  40 #include "collationweights.h"
  41 #include "cstring.h"
  42 #include "intltest.h"
  43 #include "normalizer2impl.h"
  44 #include "ucbuf.h"
  45 #include "uhash.h"
  46 #include "uitercollationiterator.h"
  47 #include "utf16collationiterator.h"
  48 #include "utf8collationiterator.h"
  49 #include "uvectr32.h"
  50 #include "uvectr64.h"
  51 #include "writesrc.h"
  52
  53 class CodePointIterator;
  54
  55 // TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey)
  56
  57 class CollationTest : public IntlTest {
  58 public:
  59     CollationTest()
  60             : fcd(NULL), nfd(NULL),
  61               fileLineNumber(0),
  62               coll(NULL) {}
  63
  64     ~CollationTest() {
  65         delete coll;
  66     }
  67
  68     void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
  69
  70     void TestMinMax();
  71     void TestImplicits();
  72     void TestNulTerminated();
  73     void TestIllegalUTF8();
  74     void TestShortFCDData();
  75     void TestFCD();
  76     void TestCollationWeights();
  77     void TestRootElements();
  78     void TestTailoredElements();
  79     void TestDataDriven();
  80
  81 private:
  82     void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
  83     void checkAllocWeights(CollationWeights &cw,
  84                            uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
  85                            int32_t someLength, int32_t minCount);
  86
  87     static UnicodeString printSortKey(const uint8_t *p, int32_t length);
  88     static UnicodeString printCollationKey(const CollationKey &key);
  89
  90     // Helpers & fields for data-driven test.
  91     static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; }
  92     static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; }
  93     static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c == 0x40; }  // %*@
  94     int32_t skipSpaces(int32_t i) {
  95         while(isSpace(fileLine[i])) { ++i; }
  96         return i;
  97     }
  98
  99     UBool readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode);
 100     void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode);
 101     Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode);
 102     void parseAndSetAttribute(IcuTestErrorCode &errorCode);
 103     void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode);
 104     void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode);
 105     void setRootCollator(IcuTestErrorCode &errorCode);
 106     void setLocaleCollator(IcuTestErrorCode &errorCode);
 107
 108     UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const;
 109
 110     UBool getSortKeyParts(const UChar *s, int32_t length,
 111                           CharString &dest, int32_t partSize,
 112                           IcuTestErrorCode &errorCode);
 113     UBool getCollationKey(const char *norm, const UnicodeString &line,
 114                           const UChar *s, int32_t length,
 115                           CollationKey &key, IcuTestErrorCode &errorCode);
 116     UBool getMergedCollationKey(const UChar *s, int32_t length,
 117                                 CollationKey &key, IcuTestErrorCode &errorCode);
 118     UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
 119                           const UnicodeString &prevString, const UnicodeString &s,
 120                           UCollationResult expectedOrder, Collation::Level expectedLevel,
 121                           IcuTestErrorCode &errorCode);
 122     void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode);
 123
 124     const Normalizer2 *fcd, *nfd;
 125     UnicodeString fileLine;
 126     int32_t fileLineNumber;
 127     UnicodeString fileTestName;
 128     Collator *coll;
 129 };
 130
 131 extern IntlTest *createCollationTest() {
 132     return new CollationTest();
 133 }
 134
 135 void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
 136     if(exec) {
 137         logln("TestSuite CollationTest: ");
 138     }
 139     TESTCASE_AUTO_BEGIN;
 140     TESTCASE_AUTO(TestMinMax);
 141     TESTCASE_AUTO(TestImplicits);
 142     TESTCASE_AUTO(TestNulTerminated);
 143     TESTCASE_AUTO(TestIllegalUTF8);
 144     TESTCASE_AUTO(TestShortFCDData);
 145     TESTCASE_AUTO(TestFCD);
 146     TESTCASE_AUTO(TestCollationWeights);
 147     TESTCASE_AUTO(TestRootElements);
 148     TESTCASE_AUTO(TestTailoredElements);
 149     TESTCASE_AUTO(TestDataDriven);
 150     TESTCASE_AUTO_END;
 151 }
 152
 153 void CollationTest::TestMinMax() {
 154     IcuTestErrorCode errorCode(*this, "TestMinMax");
 155
 156     setRootCollator(errorCode);
 157     if(errorCode.isFailure()) {
 158         errorCode.reset();
 159         return;
 160     }
 161     RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll);
 162     if(rbc == NULL) {
 163         errln("the root collator is not a RuleBasedCollator");
 164         return;
 165     }
 166
 167     static const UChar s[2] = { 0xfffe, 0xffff };
 168     UVector64 ces(errorCode);
 169     rbc->internalGetCEs(UnicodeString(FALSE, s, 2), ces, errorCode);
 170     errorCode.assertSuccess();
 171     if(ces.size() != 2) {
 172         errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size());
 173         return;
 174     }
 175     int64_t ce = ces.elementAti(0);
 176     int64_t expected = Collation::makeCE(Collation::MERGE_SEPARATOR_PRIMARY);
 177     if(ce != expected) {
 178         errln("CE(U+fffe)=%04lx != 02..", (long)ce);
 179     }
 180
 181     ce = ces.elementAti(1);
 182     expected = Collation::makeCE(Collation::MAX_PRIMARY);
 183     if(ce != expected) {
 184         errln("CE(U+ffff)=%04lx != max..", (long)ce);
 185     }
 186 }
 187
 188 void CollationTest::TestImplicits() {
 189     IcuTestErrorCode errorCode(*this, "TestImplicits");
 190
 191     const CollationData *cd = CollationRoot::getData(errorCode);
 192     if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
 193         return;
 194     }
 195
 196     // Implicit primary weights should be assigned for the following sets,
 197     // and sort in ascending order by set and then code point.
 198     // See http://www.unicode.org/reports/tr10/#Implicit_Weights
 199
 200     // core Han Unified Ideographs
 201     UnicodeSet coreHan("[\\p{unified_ideograph}&"
 202                             "[\\p{Block=CJK_Unified_Ideographs}"
 203                             "\\p{Block=CJK_Compatibility_Ideographs}]]",
 204                        errorCode);
 205     // all other Unified Han ideographs
 206     UnicodeSet otherHan("[\\p{unified ideograph}-"
 207                             "[\\p{Block=CJK_Unified_Ideographs}"
 208                             "\\p{Block=CJK_Compatibility_Ideographs}]]",
 209                         errorCode);
 210     UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode);
 211     unassigned.remove(0xfffe, 0xffff);  // These have special CLDR root mappings.
 212
 213     // Starting with CLDR 26/ICU 54, the root Han order may instead be
 214     // the Unihan radical-stroke order.
 215     // The tests should pass either way, so we only test the order of a small set of Han characters
 216     // whose radical-stroke order is the same as their code point order.
 217     UnicodeSet someHanInCPOrder(
 218             "[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48"
 219             "\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3-\\u50F6]",
 220             errorCode);
 221     UnicodeSet inOrder(someHanInCPOrder);
 222     inOrder.addAll(unassigned).freeze();
 223     if(errorCode.logIfFailureAndReset("UnicodeSet")) {
 224         return;
 225     }
 226     const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned };
 227     UChar32 prev = 0;
 228     uint32_t prevPrimary = 0;
 229     UTF16CollationIterator ci(cd, FALSE, NULL, NULL, NULL);
 230     for(int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
 231         LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i]));
 232         while(iter->next()) {
 233             UChar32 c = iter->getCodepoint();
 234             UnicodeString s(c);
 235             ci.setText(s.getBuffer(), s.getBuffer() + s.length());
 236             int64_t ce = ci.nextCE(errorCode);
 237             int64_t ce2 = ci.nextCE(errorCode);
 238             if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
 239                 return;
 240             }
 241             if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) {
 242                 errln("CollationIterator.nextCE(U+%04lx) did not yield exactly one CE", (long)c);
 243                 continue;
 244             }
 245             if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) {
 246                 errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx",
 247                       (long)c, (long)(ce & 0xffffffff));
 248                 continue;
 249             }
 250             uint32_t primary = (uint32_t)(ce >> 32);
 251             if(!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) {
 252                 errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..",
 253                       (long)c, (long)primary, (long)prev, (long)prevPrimary);
 254             }
 255             prev = c;
 256             prevPrimary = primary;
 257         }
 258     }
 259 }
 260
 261 void CollationTest::TestNulTerminated() {
 262     IcuTestErrorCode errorCode(*this, "TestNulTerminated");
 263     const CollationData *data = CollationRoot::getData(errorCode);
 264     if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
 265         return;
 266     }
 267
 268     static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 };
 269
 270     UTF16CollationIterator ci1(data, FALSE, s, s, s + 2);
 271     UTF16CollationIterator ci2(data, FALSE, s + 2, s + 2, NULL);
 272     for(int32_t i = 0;; ++i) {
 273         int64_t ce1 = ci1.nextCE(errorCode);
 274         int64_t ce2 = ci2.nextCE(errorCode);
 275         if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
 276             return;
 277         }
 278         if(ce1 != ce2) {
 279             errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i);
 280             break;
 281         }
 282         if(ce1 == Collation::NO_CE) { break; }
 283     }
 284 }
 285
 286 void CollationTest::TestIllegalUTF8() {
 287     IcuTestErrorCode errorCode(*this, "TestIllegalUTF8");
 288
 289     setRootCollator(errorCode);
 290     if(errorCode.isFailure()) {
 291         errorCode.reset();
 292         return;
 293     }
 294     coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
 295
 296     static const char *strings[] = {
 297         // U+FFFD
 298         "a\xef\xbf\xbdz",
 299         // illegal byte sequences
 300         "a\x80z",  // trail byte
 301         "a\xc1\x81z",  // non-shortest form
 302         "a\xe0\x82\x83z",  // non-shortest form
 303         "a\xed\xa0\x80z",  // lead surrogate: would be U+D800
 304         "a\xed\xbf\xbfz",  // trail surrogate: would be U+DFFF
 305         "a\xf0\x8f\xbf\xbfz",  // non-shortest form
 306         "a\xf4\x90\x80\x80z"  // out of range: would be U+110000
 307     };
 308
 309     StringPiece fffd(strings[0]);
 310     for(int32_t i = 1; i < UPRV_LENGTHOF(strings); ++i) {
 311         StringPiece illegal(strings[i]);
 312         UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
 313         if(order != UCOL_EQUAL) {
 314             errln("compareUTF8(U+FFFD, string %d with illegal UTF-8)=%d != UCOL_EQUAL",
 315                   (int)i, order);
 316         }
 317     }
 318 }
 319
 320 namespace {
 321
 322 void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) {
 323     for(UChar32 c = 0x10000; c < 0x110000;) {
 324         UChar32 next = c + 0x400;
 325         if(src.containsSome(c, next - 1)) {
 326             dest.add(U16_LEAD(c));
 327         }
 328         c = next;
 329     }
 330 }
 331
 332 }  // namespace
 333
 334 void CollationTest::TestShortFCDData() {
 335     // See CollationFCD class comments.
 336     IcuTestErrorCode errorCode(*this, "TestShortFCDData");
 337     UnicodeSet expectedLccc("[:^lccc=0:]", errorCode);
 338     errorCode.assertSuccess();
 339     expectedLccc.add(0xdc00, 0xdfff);  // add all trail surrogates
 340     addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
 341     UnicodeSet lccc;  // actual
 342     for(UChar32 c = 0; c <= 0xffff; ++c) {
 343         if(CollationFCD::hasLccc(c)) { lccc.add(c); }
 344     }
 345     UnicodeSet diff(expectedLccc);
 346     diff.removeAll(lccc);
 347     diff.remove(0x10000, 0x10ffff);  // hasLccc() only works for the BMP
 348     UnicodeString empty("[]");
 349     UnicodeString diffString;
 350     diff.toPattern(diffString, TRUE);
 351     assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
 352     diff = lccc;
 353     diff.removeAll(expectedLccc);
 354     diff.toPattern(diffString, TRUE);
 355     assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, TRUE);
 356
 357     UnicodeSet expectedTccc("[:^tccc=0:]", errorCode);
 358     if (errorCode.isSuccess()) {
 359         addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
 360         addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
 361         UnicodeSet tccc;  // actual
 362         for(UChar32 c = 0; c <= 0xffff; ++c) {
 363             if(CollationFCD::hasTccc(c)) { tccc.add(c); }
 364         }
 365         diff = expectedTccc;
 366         diff.removeAll(tccc);
 367         diff.remove(0x10000, 0x10ffff);  // hasTccc() only works for the BMP
 368         assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
 369         diff = tccc;
 370         diff.removeAll(expectedTccc);
 371         diff.toPattern(diffString, TRUE);
 372         assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
 373     }
 374 }
 375
 376 class CodePointIterator {
 377 public:
 378     CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length), pos(0) {}
 379     void resetToStart() { pos = 0; }
 380     UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; }
 381     UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; }
 382     int32_t getLength() const { return length; }
 383     int getIndex() const { return (int)pos; }
 384 private:
 385     const UChar32 *cp;
 386     int32_t length;
 387     int32_t pos;
 388 };
 389
 390 void CollationTest::checkFCD(const char *name,
 391                              CollationIterator &ci, CodePointIterator &cpi) {
 392     IcuTestErrorCode errorCode(*this, "checkFCD");
 393
 394     // Iterate forward to the limit.
 395     for(;;) {
 396         UChar32 c1 = ci.nextCodePoint(errorCode);
 397         UChar32 c2 = cpi.next();
 398         if(c1 != c2) {
 399             errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d",
 400                   name, (long)c1, (long)c2, cpi.getIndex());
 401             return;
 402         }
 403         if(c1 < 0) { break; }
 404     }
 405
 406     // Iterate backward most of the way.
 407     for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) {
 408         UChar32 c1 = ci.previousCodePoint(errorCode);
 409         UChar32 c2 = cpi.previous();
 410         if(c1 != c2) {
 411             errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d",
 412                   name, (long)c1, (long)c2, cpi.getIndex());
 413             return;
 414         }
 415     }
 416
 417     // Forward again.
 418     for(;;) {
 419         UChar32 c1 = ci.nextCodePoint(errorCode);
 420         UChar32 c2 = cpi.next();
 421         if(c1 != c2) {
 422             errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d",
 423                   name, (long)c1, (long)c2, cpi.getIndex());
 424             return;
 425         }
 426         if(c1 < 0) { break; }
 427     }
 428
 429     // Iterate backward to the start.
 430     for(;;) {
 431         UChar32 c1 = ci.previousCodePoint(errorCode);
 432         UChar32 c2 = cpi.previous();
 433         if(c1 != c2) {
 434             errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d",
 435                   name, (long)c1, (long)c2, cpi.getIndex());
 436             return;
 437         }
 438         if(c1 < 0) { break; }
 439     }
 440 }
 441
 442 void CollationTest::TestFCD() {
 443     IcuTestErrorCode errorCode(*this, "TestFCD");
 444     const CollationData *data = CollationRoot::getData(errorCode);
 445     if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
 446         return;
 447     }
 448
 449     // Input string, not FCD, NUL-terminated.
 450     static const UChar s[] = {
 451         0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62,
 452         U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),  // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
 453         0x327, 0x308,  // ccc=202, 230
 454         U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),  // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
 455         U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),
 456         U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),
 457         0xac01,
 458         0xe7,  // Character with tccc!=0 decomposed together with mis-ordered sequence.
 459         U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165),
 460         0xe1,  // Character with tccc!=0 decomposed together with decomposed sequence.
 461         0xf73, 0xf75,  // Tibetan composite vowels must be decomposed.
 462         0x4e00, 0xf81,
 463         0
 464     };
 465     // Expected code points.
 466     static const UChar32 cp[] = {
 467         0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
 468         0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
 469         0x1D15F, 0x1D16D,
 470         0xac01,
 471         0x63, 0x327, 0x1D165, 0x1D16D,
 472         0x61,
 473         0xf71, 0xf71, 0xf72, 0xf74, 0x301,
 474         0x4e00, 0xf71, 0xf80
 475     };
 476
 477     FCDUTF16CollationIterator u16ci(data, FALSE, s, s, NULL);
 478     if(errorCode.logIfFailureAndReset("FCDUTF16CollationIterator constructor")) {
 479         return;
 480     }
 481     CodePointIterator cpi(cp, UPRV_LENGTHOF(cp));
 482     checkFCD("FCDUTF16CollationIterator", u16ci, cpi);
 483
 484     cpi.resetToStart();
 485     std::string utf8;
 486     UnicodeString(s).toUTF8String(utf8);
 487     FCDUTF8CollationIterator u8ci(data, FALSE,
 488                                   reinterpret_cast<const uint8_t *>(utf8.c_str()), 0, -1);
 489     if(errorCode.logIfFailureAndReset("FCDUTF8CollationIterator constructor")) {
 490         return;
 491     }
 492     checkFCD("FCDUTF8CollationIterator", u8ci, cpi);
 493
 494     cpi.resetToStart();
 495     UCharIterator iter;
 496     uiter_setString(&iter, s, UPRV_LENGTHOF(s) - 1);  // -1: without the terminating NUL
 497     FCDUIterCollationIterator uici(data, FALSE, iter, 0);
 498     if(errorCode.logIfFailureAndReset("FCDUIterCollationIterator constructor")) {
 499         return;
 500     }
 501     checkFCD("FCDUIterCollationIterator", uici, cpi);
 502 }
 503
 504 void CollationTest::checkAllocWeights(CollationWeights &cw,
 505                                       uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
 506                                       int32_t someLength, int32_t minCount) {
 507     if(!cw.allocWeights(lowerLimit, upperLimit, n)) {
 508         errln("CollationWeights::allocWeights(%lx, %lx, %ld) = FALSE",
 509               (long)lowerLimit, (long)upperLimit, (long)n);
 510         return;
 511     }
 512     uint32_t previous = lowerLimit;
 513     int32_t count = 0;  // number of weights that have someLength
 514     for(int32_t i = 0; i < n; ++i) {
 515         uint32_t w = cw.nextWeight();
 516         if(w == 0xffffffff) {
 517             errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
 518                   "returns only %ld weights",
 519                   (long)lowerLimit, (long)upperLimit, (long)n, (long)i);
 520             return;
 521         }
 522         if(!(previous < w && w < upperLimit)) {
 523             errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
 524                   "number %ld -> %lx not between %lx and %lx",
 525                   (long)lowerLimit, (long)upperLimit, (long)n,
 526                   (long)(i + 1), (long)w, (long)previous, (long)upperLimit);
 527             return;
 528         }
 529         if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; }
 530     }
 531     if(count < minCount) {
 532         errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
 533               "returns only %ld < %ld weights of length %d",
 534               (long)lowerLimit, (long)upperLimit, (long)n,
 535               (long)count, (long)minCount, (int)someLength);
 536     }
 537 }
 538
 539 void CollationTest::TestCollationWeights() {
 540     CollationWeights cw;
 541
 542     // Non-compressible primaries use 254 second bytes 02..FF.
 543     logln("CollationWeights.initForPrimary(non-compressible)");
 544     cw.initForPrimary(FALSE);
 545     // Expect 1 weight 11 and 254 weights 12xx.
 546     checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1);
 547     checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254);
 548     // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
 549     checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255);
 550     // Expect 254 two-byte weights from the ranges 10ff and 11xx.
 551     checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254);
 552     // Expect 254^2=64516 three-byte weights.
 553     // During computation, there should be 3 three-byte ranges
 554     // 10ffff, 11xxxx, 120202.
 555     // The middle one should be split 64515:1,
 556     // and the newly-split-off range and the last ranged lengthened.
 557     checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516);
 558     // Expect weights 1102 & 1103.
 559     checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2);
 560     // Expect weights 102102 & 102103.
 561     checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
 562
 563     // Compressible primaries use 251 second bytes 04..FE.
 564     logln("CollationWeights.initForPrimary(compressible)");
 565     cw.initForPrimary(TRUE);
 566     // Expect 1 weight 11 and 251 weights 12xx.
 567     checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1);
 568     checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251);
 569     // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
 570     checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252);
 571     // Expect weights 1104 & 1105.
 572     checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2);
 573     // Expect weights 102102 & 102103.
 574     checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
 575
 576     // Secondary and tertiary weights use only bytes 3 & 4.
 577     logln("CollationWeights.initForSecondary()");
 578     cw.initForSecondary();
 579     // Expect weights fbxx and all four fc..ff.
 580     checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4);
 581
 582     logln("CollationWeights.initForTertiary()");
 583     cw.initForTertiary();
 584     // Expect weights 3dxx and both 3e & 3f.
 585     checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2);
 586 }
 587
 588 namespace {
 589
 590 UBool isValidCE(const CollationRootElements &re, const CollationData &data,
 591                 uint32_t p, uint32_t s, uint32_t ctq) {
 592     uint32_t p1 = p >> 24;
 593     uint32_t p2 = (p >> 16) & 0xff;
 594     uint32_t p3 = (p >> 8) & 0xff;
 595     uint32_t p4 = p & 0xff;
 596     uint32_t s1 = s >> 8;
 597     uint32_t s2 = s & 0xff;
 598     // ctq = Case, Tertiary, Quaternary
 599     uint32_t c = (ctq & Collation::CASE_MASK) >> 14;
 600     uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK;
 601     uint32_t t1 = t >> 8;
 602     uint32_t t2 = t & 0xff;
 603     uint32_t q = ctq & Collation::QUATERNARY_MASK;
 604     // No leading zero bytes.
 605     if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
 606         return FALSE;
 607     }
 608     // No intermediate zero bytes.
 609     if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
 610         return FALSE;
 611     }
 612     if(p2 != 0 && p3 == 0 && p4 != 0) {
 613         return FALSE;
 614     }
 615     // Minimum & maximum lead bytes.
 616     if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
 617             s1 == Collation::LEVEL_SEPARATOR_BYTE ||
 618             t1 == Collation::LEVEL_SEPARATOR_BYTE || t1 > 0x3f) {
 619         return FALSE;
 620     }
 621     if(c > 2) {
 622         return FALSE;
 623     }
 624     // The valid byte range for the second primary byte depends on compressibility.
 625     if(p2 != 0) {
 626         if(data.isCompressibleLeadByte(p1)) {
 627             if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE ||
 628                     Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
 629                 return FALSE;
 630             }
 631         } else {
 632             if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) {
 633                 return FALSE;
 634             }
 635         }
 636     }
 637     // Other bytes just need to avoid the level separator.
 638     // Trailing zeros are ok.
 639     U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1);
 640     if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR_BYTE ||
 641             s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPARATOR_BYTE) {
 642         return FALSE;
 643     }
 644     // Well-formed CEs.
 645     if(p == 0) {
 646         if(s == 0) {
 647             if(t == 0) {
 648                 // Completely ignorable CE.
 649                 // Quaternary CEs are not supported.
 650                 if(c != 0 || q != 0) {
 651                     return FALSE;
 652                 }
 653             } else {
 654                 // Tertiary CE.
 655                 if(t < re.getTertiaryBoundary() || c != 2) {
 656                     return FALSE;
 657                 }
 658             }
 659         } else {
 660             // Secondary CE.
 661             if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
 662                 return FALSE;
 663             }
 664         }
 665     } else {
 666         // Primary CE.
 667         if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) ||
 668                 s >= re.getSecondaryBoundary()) {
 669             return FALSE;
 670         }
 671         if(t == 0 || t >= re.getTertiaryBoundary()) {
 672             return FALSE;
 673         }
 674     }
 675     return TRUE;
 676 }
 677
 678 UBool isValidCE(const CollationRootElements &re, const CollationData &data, int64_t ce) {
 679     uint32_t p = (uint32_t)(ce >> 32);
 680     uint32_t secTer = (uint32_t)ce;
 681     return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff);
 682 }
 683
 684 class RootElementsIterator {
 685 public:
 686     RootElementsIterator(const CollationData &root)
 687             : data(root),
 688               elements(root.rootElements), length(root.rootElementsLength),
 689               pri(0), secTer(0),
 690               index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX]) {}
 691
 692     UBool next() {
 693         if(index >= length) { return FALSE; }
 694         uint32_t p = elements[index];
 695         if(p == CollationRootElements::PRIMARY_SENTINEL) { return FALSE; }
 696         if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
 697             ++index;
 698             secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG;
 699             return TRUE;
 700         }
 701         if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) {
 702             // End of a range, enumerate the primaries in the range.
 703             int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK;
 704             p &= 0xffffff00;
 705             if(pri == p) {
 706                 // Finished the range, return the next CE after it.
 707                 ++index;
 708                 return next();
 709             }
 710             U_ASSERT(pri < p);
 711             // Return the next primary in this range.
 712             UBool isCompressible = data.isCompressiblePrimary(pri);
 713             if((pri & 0xffff) == 0) {
 714                 pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step);
 715             } else {
 716                 pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible, step);
 717             }
 718             return TRUE;
 719         }
 720         // Simple primary CE.
 721         ++index;
 722         pri = p;
 723         // Does this have an explicit below-common sec/ter unit,
 724         // or does it imply a common one?
 725         if(index == length) {
 726             secTer = Collation::COMMON_SEC_AND_TER_CE;
 727         } else {
 728             secTer = elements[index];
 729             if((secTer & CollationRootElements::SEC_TER_DELTA_FLAG) == 0) {
 730                 // No sec/ter delta.
 731                 secTer = Collation::COMMON_SEC_AND_TER_CE;
 732             } else {
 733                 secTer &= ~CollationRootElements::SEC_TER_DELTA_FLAG;
 734                 if(secTer > Collation::COMMON_SEC_AND_TER_CE) {
 735                     // Implied sec/ter.
 736                     secTer = Collation::COMMON_SEC_AND_TER_CE;
 737                 } else {
 738                     // Explicit sec/ter below common/common.
 739                     ++index;
 740                 }
 741             }
 742         }
 743         return TRUE;
 744     }
 745
 746     uint32_t getPrimary() const { return pri; }
 747     uint32_t getSecTer() const { return secTer; }
 748
 749 private:
 750     const CollationData &data;
 751     const uint32_t *elements;
 752     int32_t length;
 753
 754     uint32_t pri;
 755     uint32_t secTer;
 756     int32_t index;
 757 };
 758
 759 }  // namespace
 760
 761 void CollationTest::TestRootElements() {
 762     IcuTestErrorCode errorCode(*this, "TestRootElements");
 763     const CollationData *root = CollationRoot::getData(errorCode);
 764     if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
 765         return;
 766     }
 767     CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
 768     RootElementsIterator iter(*root);
 769
 770     // We check each root CE for validity,
 771     // and we also verify that there is a tailoring gap between each two CEs.
 772     CollationWeights cw1c;  // compressible primary weights
 773     CollationWeights cw1u;  // uncompressible primary weights
 774     CollationWeights cw2;
 775     CollationWeights cw3;
 776
 777     cw1c.initForPrimary(TRUE);
 778     cw1u.initForPrimary(FALSE);
 779     cw2.initForSecondary();
 780     cw3.initForTertiary();
 781
 782     // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
 783     // nor the special merge-separator CE for U+FFFE.
 784     uint32_t prevPri = 0;
 785     uint32_t prevSec = 0;
 786     uint32_t prevTer = 0;
 787     while(iter.next()) {
 788         uint32_t pri = iter.getPrimary();
 789         uint32_t secTer = iter.getSecTer();
 790         // CollationRootElements CEs must have 0 case and quaternary bits.
 791         if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) {
 792             errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx",
 793                   (long)pri, (long)secTer);
 794         }
 795         uint32_t sec = secTer >> 16;
 796         uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK;
 797         uint32_t ctq = ter;
 798         if(pri == 0 && sec == 0 && ter != 0) {
 799             // Tertiary CEs must have uppercase bits,
 800             // but they are not stored in the CollationRootElements.
 801             ctq |= 0x8000;
 802         }
 803         if(!isValidCE(rootElements, *root, pri, sec, ctq)) {
 804             errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer);
 805         } else {
 806             if(pri != prevPri) {
 807                 uint32_t newWeight = 0;
 808                 if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) {
 809                     // There is currently no tailoring gap after primary ignorables,
 810                     // and we forbid tailoring after U+FFFD and U+FFFF.
 811                 } else if(root->isCompressiblePrimary(prevPri)) {
 812                     if(!cw1c.allocWeights(prevPri, pri, 1)) {
 813                         errln("no primary/compressible tailoring gap between %08lx and %08lx",
 814                               (long)prevPri, (long)pri);
 815                     } else {
 816                         newWeight = cw1c.nextWeight();
 817                     }
 818                 } else {
 819                     if(!cw1u.allocWeights(prevPri, pri, 1)) {
 820                         errln("no primary/uncompressible tailoring gap between %08lx and %08lx",
 821                               (long)prevPri, (long)pri);
 822                     } else {
 823                         newWeight = cw1u.nextWeight();
 824                     }
 825                 }
 826                 if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
 827                     errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx",
 828                           (long)prevPri, (long)newWeight, (long)pri);
 829                 }
 830             } else if(sec != prevSec) {
 831                 uint32_t lowerLimit =
 832                     prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec;
 833                 if(!cw2.allocWeights(lowerLimit, sec, 1)) {
 834                     errln("no secondary tailoring gap between %04x and %04x", lowerLimit, sec);
 835                 } else {
 836                     uint32_t newWeight = cw2.nextWeight();
 837                     if(!(prevSec < newWeight && newWeight < sec)) {
 838                         errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
 839                               (long)lowerLimit, (long)newWeight, (long)sec);
 840                     }
 841                 }
 842             } else if(ter != prevTer) {
 843                 uint32_t lowerLimit =
 844                     prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer;
 845                 if(!cw3.allocWeights(lowerLimit, ter, 1)) {
 846                     errln("no teriary tailoring gap between %04x and %04x", lowerLimit, ter);
 847                 } else {
 848                     uint32_t newWeight = cw3.nextWeight();
 849                     if(!(prevTer < newWeight && newWeight < ter)) {
 850                         errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
 851                               (long)lowerLimit, (long)newWeight, (long)ter);
 852                     }
 853                 }
 854             } else {
 855                 errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer);
 856             }
 857         }
 858         prevPri = pri;
 859         prevSec = sec;
 860         prevTer = ter;
 861     }
 862 }
 863
 864 void CollationTest::TestTailoredElements() {
 865     IcuTestErrorCode errorCode(*this, "TestTailoredElements");
 866     const CollationData *root = CollationRoot::getData(errorCode);
 867     if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
 868         return;
 869     }
 870     CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
 871
 872     UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NULL, errorCode);
 873     if(errorCode.logIfFailureAndReset("failed to create a hash table")) {
 874         return;
 875     }
 876     uhash_setKeyDeleter(prevLocales, uprv_free);
 877     // TestRootElements() tests the root collator which does not have tailorings.
 878     uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode);
 879     uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode);
 880     uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode);
 881
 882     UVector64 ces(errorCode);
 883     LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales());
 884     U_ASSERT(locales.isValid());
 885     const char *localeID = "root";
 886     do {
 887         Locale locale(localeID);
 888         LocalPointer<StringEnumeration> types(
 889                 Collator::getKeywordValuesForLocale("collation", locale, FALSE, errorCode));
 890         errorCode.assertSuccess();
 891         const char *type;  // first: default type
 892         while((type = types->next(NULL, errorCode)) != NULL) {
 893             if(strncmp(type, "private-", 8) == 0) {
 894                 errln("Collator::getKeywordValuesForLocale(%s) returns private collation keyword: %s",
 895                         localeID, type);
 896             }
 897             Locale localeWithType(locale);
 898             localeWithType.setKeywordValue("collation", type, errorCode);
 899             errorCode.assertSuccess();
 900             LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode));
 901             if(errorCode.logIfFailureAndReset("Collator::createInstance(%s)",
 902                                               localeWithType.getName())) {
 903                 continue;
 904             }
 905             Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode);
 906             if(uhash_geti(prevLocales, actual.getName()) != 0) {
 907                 continue;
 908             }
 909             uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode);
 910             errorCode.assertSuccess();
 911             logln("TestTailoredElements(): requested %s -> actual %s",
 912                   localeWithType.getName(), actual.getName());
 913             RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getAlias());
 914             if(rbc == NULL) {
 915                 continue;
 916             }
 917             // Note: It would be better to get tailored strings such that we can
 918             // identify the prefix, and only get the CEs for the prefix+string,
 919             // not also for the prefix.
 920             // There is currently no API for that.
 921             // It would help in an unusual case where a contraction starting in the prefix
 922             // extends past its end, and we do not see the intended mapping.
 923             // For example, for a mapping p|st, if there is also a contraction ps,
 924             // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
 925             LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode));
 926             errorCode.assertSuccess();
 927             UnicodeSetIterator iter(*tailored);
 928             while(iter.next()) {
 929                 const UnicodeString &s = iter.getString();
 930                 ces.removeAllElements();
 931                 rbc->internalGetCEs(s, ces, errorCode);
 932                 errorCode.assertSuccess();
 933                 for(int32_t i = 0; i < ces.size(); ++i) {
 934                     int64_t ce = ces.elementAti(i);
 935                     if(!isValidCE(rootElements, *root, ce)) {
 936                         errln("invalid tailored CE %016llx at CE index %d from string:",
 937                               (long long)ce, (int)i);
 938                         infoln(prettify(s));
 939                     }
 940                 }
 941             }
 942         }
 943     } while((localeID = locales->next(NULL, errorCode)) != NULL);
 944     uhash_close(prevLocales);
 945 }
 946
 947 UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) {
 948     UnicodeString s;
 949     for(int32_t i = 0; i < length; ++i) {
 950         if(i > 0) { s.append((UChar)0x20); }
 951         uint8_t b = p[i];
 952         if(b == 0) {
 953             s.append((UChar)0x2e);  // period
 954         } else if(b == 1) {
 955             s.append((UChar)0x7c);  // vertical bar
 956         } else {
 957             appendHex(b, 2, s);
 958         }
 959     }
 960     return s;
 961 }
 962
 963 UnicodeString CollationTest::printCollationKey(const CollationKey &key) {
 964     int32_t length;
 965     const uint8_t *p = key.getByteArray(length);
 966     return printSortKey(p, length);
 967 }
 968
 969 UBool CollationTest::readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode) {
 970     for(;;) {
 971         int32_t lineLength;
 972         const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
 973         if(line == NULL || errorCode.isFailure()) {
 974             fileLine.remove();
 975             return FALSE;
 976         }
 977         ++fileLineNumber;
 978         // Strip trailing CR/LF, comments, and spaces.
 979         const UChar *comment = u_memchr(line, 0x23, lineLength);  // '#'
 980         if(comment != NULL) {
 981             lineLength = (int32_t)(comment - line);
 982         } else {
 983             while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; }
 984         }
 985         while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; }
 986         if(lineLength != 0) {
 987             fileLine.setTo(FALSE, line, lineLength);
 988             return TRUE;
 989         }
 990         // Empty line, continue.
 991     }
 992 }
 993
 994 void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s,
 995                                 UErrorCode &errorCode) {
 996     int32_t length = fileLine.length();
 997     int32_t i;
 998     for(i = start; i < length && !isSpace(fileLine[i]); ++i) {}
 999     int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start);  // '|'
1000     if(pipeIndex >= 0) {
1001         prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape();
1002         if(prefix.isEmpty()) {
1003             errln("empty prefix on line %d", (int)fileLineNumber);
1004             infoln(fileLine);
1005             errorCode = U_PARSE_ERROR;
1006             return;
1007         }
1008         start = pipeIndex + 1;
1009     } else {
1010         prefix.remove();
1011     }
1012     s = fileLine.tempSubStringBetween(start, i).unescape();
1013     if(s.isEmpty()) {
1014         errln("empty string on line %d", (int)fileLineNumber);
1015         infoln(fileLine);
1016         errorCode = U_PARSE_ERROR;
1017         return;
1018     }
1019     start = i;
1020 }
1021
1022 Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode) {
1023     Collation::Level relation;
1024     int32_t start;
1025     if(fileLine[0] == 0x3c) {  // <
1026         UChar second = fileLine[1];
1027         start = 2;
1028         switch(second) {
1029         case 0x31:  // <1
1030             relation = Collation::PRIMARY_LEVEL;
1031             break;
1032         case 0x32:  // <2
1033             relation = Collation::SECONDARY_LEVEL;
1034             break;
1035         case 0x33:  // <3
1036             relation = Collation::TERTIARY_LEVEL;
1037             break;
1038         case 0x34:  // <4
1039             relation = Collation::QUATERNARY_LEVEL;
1040             break;
1041         case 0x63:  // <c
1042             relation = Collation::CASE_LEVEL;
1043             break;
1044         case 0x69:  // <i
1045             relation = Collation::IDENTICAL_LEVEL;
1046             break;
1047         default:  // just <
1048             relation = Collation::NO_LEVEL;
1049             start = 1;
1050             break;
1051         }
1052     } else if(fileLine[0] == 0x3d) {  // =
1053         relation = Collation::ZERO_LEVEL;
1054         start = 1;
1055     } else {
1056         start = 0;
1057     }
1058     if(start == 0 || !isSpace(fileLine[start])) {
1059         errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber);
1060         infoln(fileLine);
1061         errorCode.set(U_PARSE_ERROR);
1062         return Collation::NO_LEVEL;
1063     }
1064     start = skipSpaces(start);
1065     UnicodeString prefix;
1066     parseString(start, prefix, s, errorCode);
1067     if(errorCode.isSuccess() && !prefix.isEmpty()) {
1068         errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber);
1069         infoln(fileLine);
1070         errorCode.set(U_PARSE_ERROR);
1071         return Collation::NO_LEVEL;
1072     }
1073     if(start < fileLine.length()) {
1074         errln("unexpected line contents after test string on line %d", (int)fileLineNumber);
1075         infoln(fileLine);
1076         errorCode.set(U_PARSE_ERROR);
1077         return Collation::NO_LEVEL;
1078     }
1079     return relation;
1080 }
1081
// Maps attribute names, as they are written before the '=' on a settings line,
// to UColAttribute constants. Searched linearly by parseAndSetAttribute().
static const struct {
    const char *name;
    UColAttribute attr;
} attributes[] = {
    { "backwards", UCOL_FRENCH_COLLATION },
    { "alternate", UCOL_ALTERNATE_HANDLING },
    { "caseFirst", UCOL_CASE_FIRST },
    { "caseLevel", UCOL_CASE_LEVEL },
    // UCOL_NORMALIZATION_MODE is turned on and off automatically.
    { "strength", UCOL_STRENGTH },
    // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated.
    { "numeric", UCOL_NUMERIC_COLLATION }
};
1095
// Maps attribute value names, as they are written after the '=' on a settings line,
// to UColAttributeValue constants. Searched linearly by parseAndSetAttribute().
// The table does not record which values are legal for which attribute.
static const struct {
    const char *name;
    UColAttributeValue value;
} attributeValues[] = {
    { "default", UCOL_DEFAULT },
    { "primary", UCOL_PRIMARY },
    { "secondary", UCOL_SECONDARY },
    { "tertiary", UCOL_TERTIARY },
    { "quaternary", UCOL_QUATERNARY },
    { "identical", UCOL_IDENTICAL },
    { "off", UCOL_OFF },
    { "on", UCOL_ON },
    { "shifted", UCOL_SHIFTED },
    { "non-ignorable", UCOL_NON_IGNORABLE },
    { "lower", UCOL_LOWER_FIRST },
    { "upper", UCOL_UPPER_FIRST }
};
1113
1114 void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) {
1115     // Parse attributes even if the Collator could not be created,
1116     // in order to report syntax errors.
1117     int32_t start = skipSpaces(1);
1118     int32_t equalPos = fileLine.indexOf((UChar)0x3d);
1119     if(equalPos < 0) {
1120         if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) {
1121             parseAndSetReorderCodes(start + 7, errorCode);
1122             return;
1123         }
1124         errln("missing '=' on line %d", (int)fileLineNumber);
1125         infoln(fileLine);
1126         errorCode.set(U_PARSE_ERROR);
1127         return;
1128     }
1129
1130     UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos);
1131     UnicodeString valueString = fileLine.tempSubString(equalPos+1);
1132     if(attrString == UNICODE_STRING("maxVariable", 11)) {
1133         UColReorderCode max;
1134         if(valueString == UNICODE_STRING("space", 5)) {
1135             max = UCOL_REORDER_CODE_SPACE;
1136         } else if(valueString == UNICODE_STRING("punct", 5)) {
1137             max = UCOL_REORDER_CODE_PUNCTUATION;
1138         } else if(valueString == UNICODE_STRING("symbol", 6)) {
1139             max = UCOL_REORDER_CODE_SYMBOL;
1140         } else if(valueString == UNICODE_STRING("currency", 8)) {
1141             max = UCOL_REORDER_CODE_CURRENCY;
1142         } else {
1143             errln("invalid attribute value name on line %d", (int)fileLineNumber);
1144             infoln(fileLine);
1145             errorCode.set(U_PARSE_ERROR);
1146             return;
1147         }
1148         if(coll != NULL) {
1149             coll->setMaxVariable(max, errorCode);
1150             if(errorCode.isFailure()) {
1151                 errln("setMaxVariable() failed on line %d: %s",
1152                       (int)fileLineNumber, errorCode.errorName());
1153                 infoln(fileLine);
1154                 return;
1155             }
1156         }
1157         fileLine.remove();
1158         return;
1159     }
1160
1161     UColAttribute attr;
1162     for(int32_t i = 0;; ++i) {
1163         if(i == UPRV_LENGTHOF(attributes)) {
1164             errln("invalid attribute name on line %d", (int)fileLineNumber);
1165             infoln(fileLine);
1166             errorCode.set(U_PARSE_ERROR);
1167             return;
1168         }
1169         if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) {
1170             attr = attributes[i].attr;
1171             break;
1172         }
1173     }
1174
1175     UColAttributeValue value;
1176     for(int32_t i = 0;; ++i) {
1177         if(i == UPRV_LENGTHOF(attributeValues)) {
1178             errln("invalid attribute value name on line %d", (int)fileLineNumber);
1179             infoln(fileLine);
1180             errorCode.set(U_PARSE_ERROR);
1181             return;
1182         }
1183         if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) {
1184             value = attributeValues[i].value;
1185             break;
1186         }
1187     }
1188
1189     if(coll != NULL) {
1190         coll->setAttribute(attr, value, errorCode);
1191         if(errorCode.isFailure()) {
1192             errln("illegal attribute=value combination on line %d: %s",
1193                   (int)fileLineNumber, errorCode.errorName());
1194             infoln(fileLine);
1195             return;
1196         }
1197     }
1198     fileLine.remove();
1199 }
1200
1201 void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode) {
1202     UVector32 reorderCodes(errorCode);
1203     while(start < fileLine.length()) {
1204         start = skipSpaces(start);
1205         int32_t limit = start;
1206         while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; }
1207         CharString name;
1208         name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), errorCode);
1209         int32_t code = CollationRuleParser::getReorderCode(name.data());
1210         if(code < 0) {
1211             if(uprv_stricmp(name.data(), "default") == 0) {
1212                 code = UCOL_REORDER_CODE_DEFAULT;  // -1
1213             } else {
1214                 errln("invalid reorder code '%s' on line %d", name.data(), (int)fileLineNumber);
1215                 infoln(fileLine);
1216                 errorCode.set(U_PARSE_ERROR);
1217                 return;
1218             }
1219         }
1220         reorderCodes.addElement(code, errorCode);
1221         start = limit;
1222     }
1223     if(coll != NULL) {
1224         coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
1225         if(errorCode.isFailure()) {
1226             errln("setReorderCodes() failed on line %d: %s",
1227                   (int)fileLineNumber, errorCode.errorName());
1228             infoln(fileLine);
1229             return;
1230         }
1231     }
1232     fileLine.remove();
1233 }
1234
1235 void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1236     UnicodeString rules;
1237     while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1238         rules.append(fileLine.unescape());
1239     }
1240     if(errorCode.isFailure()) { return; }
1241     logln(rules);
1242
1243     UParseError parseError;
1244     UnicodeString reason;
1245     delete coll;
1246     coll = new RuleBasedCollator(rules, parseError, reason, errorCode);
1247     if(coll == NULL) {
1248         errln("unable to allocate a new collator");
1249         errorCode.set(U_MEMORY_ALLOCATION_ERROR);
1250         return;
1251     }
1252     if(errorCode.isFailure()) {
1253         dataerrln("RuleBasedCollator(rules) failed - %s", errorCode.errorName());
1254         infoln(UnicodeString("  reason: ") + reason);
1255         if(parseError.offset >= 0) { infoln("  rules offset: %d", (int)parseError.offset); }
1256         if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1257             infoln(UnicodeString("  snippet: ...") +
1258                 parseError.preContext + "(!)" + parseError.postContext + "...");
1259         }
1260         delete coll;
1261         coll = NULL;
1262         errorCode.reset();
1263     } else {
1264         assertEquals("no error reason when RuleBasedCollator(rules) succeeds",
1265                      UnicodeString(), reason);
1266     }
1267 }
1268
1269 void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) {
1270     if(errorCode.isFailure()) { return; }
1271     delete coll;
1272     coll = Collator::createInstance(Locale::getRoot(), errorCode);
1273     if(errorCode.isFailure()) {
1274         dataerrln("unable to create a root collator");
1275         return;
1276     }
1277 }
1278
1279 void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) {
1280     if(errorCode.isFailure()) { return; }
1281     delete coll;
1282     coll = NULL;
1283     int32_t at = fileLine.indexOf((UChar)0x40, 9);  // @ is not invariant
1284     if(at >= 0) {
1285         fileLine.setCharAt(at, (UChar)0x2a);  // *
1286     }
1287     CharString localeID;
1288     localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode);
1289     if(at >= 0) {
1290         localeID.data()[at - 9] = '@';
1291     }
1292     Locale locale(localeID.data());
1293     if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) {
1294         errln("invalid language tag on line %d", (int)fileLineNumber);
1295         infoln(fileLine);
1296         if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); }
1297         return;
1298     }
1299
1300     logln("creating a collator for locale ID %s", locale.getName());
1301     coll = Collator::createInstance(locale, errorCode);
1302     if(errorCode.isFailure()) {
1303         dataerrln("unable to create a collator for locale %s on line %d",
1304                   locale.getName(), (int)fileLineNumber);
1305         infoln(fileLine);
1306         delete coll;
1307         coll = NULL;
1308         errorCode.reset();
1309     }
1310 }
1311
1312 UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const {
1313     if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return TRUE; }
1314     // In some sequences with Tibetan composite vowel signs,
1315     // even if the string passes the FCD check,
1316     // those composites must be decomposed.
1317     // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1318     int32_t index = 0;
1319     while((index = s.indexOf((UChar)0xf71, index)) >= 0) {
1320         if(++index < s.length()) {
1321             UChar c = s[index];
1322             if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return TRUE; }
1323         }
1324     }
1325     return FALSE;
1326 }
1327
1328 UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length,
1329                                      CharString &dest, int32_t partSize,
1330                                      IcuTestErrorCode &errorCode) {
1331     if(errorCode.isFailure()) { return FALSE; }
1332     uint8_t part[32];
1333     U_ASSERT(partSize <= UPRV_LENGTHOF(part));
1334     UCharIterator iter;
1335     uiter_setString(&iter, s, length);
1336     uint32_t state[2] = { 0, 0 };
1337     for(;;) {
1338         int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, partSize, errorCode);
1339         UBool done = partLength < partSize;
1340         if(done) {
1341             // At the end, append the next byte as well which should be 00.
1342             ++partLength;
1343         }
1344         dest.append(reinterpret_cast<char *>(part), partLength, errorCode);
1345         if(done) {
1346             return errorCode.isSuccess();
1347         }
1348     }
1349 }
1350
1351 UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line,
1352                                      const UChar *s, int32_t length,
1353                                      CollationKey &key, IcuTestErrorCode &errorCode) {
1354     if(errorCode.isFailure()) { return FALSE; }
1355     coll->getCollationKey(s, length, key, errorCode);
1356     if(errorCode.isFailure()) {
1357         infoln(fileTestName);
1358         errln("Collator(%s).getCollationKey() failed: %s",
1359               norm, errorCode.errorName());
1360         infoln(line);
1361         return FALSE;
1362     }
1363     int32_t keyLength;
1364     const uint8_t *keyBytes = key.getByteArray(keyLength);
1365     if(keyLength == 0 || keyBytes[keyLength - 1] != 0) {
1366         infoln(fileTestName);
1367         errln("Collator(%s).getCollationKey() wrote an empty or unterminated key",
1368               norm);
1369         infoln(line);
1370         infoln(printCollationKey(key));
1371         return FALSE;
1372     }
1373
1374     int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode);
1375     if(numLevels < UCOL_IDENTICAL) {
1376         ++numLevels;
1377     } else {
1378         numLevels = 5;
1379     }
1380     if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) {
1381         ++numLevels;
1382     }
1383     errorCode.assertSuccess();
1384     int32_t numLevelSeparators = 0;
1385     for(int32_t i = 0; i < (keyLength - 1); ++i) {
1386         uint8_t b = keyBytes[i];
1387         if(b == 0) {
1388             infoln(fileTestName);
1389             errln("Collator(%s).getCollationKey() contains a 00 byte", norm);
1390             infoln(line);
1391             infoln(printCollationKey(key));
1392             return FALSE;
1393         }
1394         if(b == 1) { ++numLevelSeparators; }
1395     }
1396     if(numLevelSeparators != (numLevels - 1)) {
1397         infoln(fileTestName);
1398         errln("Collator(%s).getCollationKey() has %d level separators for %d levels",
1399               norm, (int)numLevelSeparators, (int)numLevels);
1400         infoln(line);
1401         infoln(printCollationKey(key));
1402         return FALSE;
1403     }
1404
1405     // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
1406     static const int32_t partSizes[] = { 32, 3, 1 };
1407     for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) {
1408         int32_t partSize = partSizes[psi];
1409         CharString parts;
1410         if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
1411             infoln(fileTestName);
1412             errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
1413                   norm, (int)partSize, errorCode.errorName());
1414             infoln(line);
1415             return FALSE;
1416         }
1417         if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
1418             infoln(fileTestName);
1419             errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
1420                   norm, (int)partSize);
1421             infoln(line);
1422             infoln(printCollationKey(key));
1423             infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
1424             return FALSE;
1425         }
1426     }
1427     return TRUE;
1428 }
1429
1430 /**
1431  * Changes the key to the merged segments of the U+FFFE-separated substrings of s.
1432  * Leaves key unchanged if s does not contain U+FFFE.
1433  * @return TRUE if the key was successfully changed
1434  */
1435 UBool CollationTest::getMergedCollationKey(const UChar *s, int32_t length,
1436                                            CollationKey &key, IcuTestErrorCode &errorCode) {
1437     if(errorCode.isFailure()) { return FALSE; }
1438     LocalMemory<uint8_t> mergedKey;
1439     int32_t mergedKeyLength = 0;
1440     int32_t mergedKeyCapacity = 0;
1441     int32_t sLength = (length >= 0) ? length : u_strlen(s);
1442     int32_t segmentStart = 0;
1443     for(int32_t i = 0;;) {
1444         if(i == sLength) {
1445             if(segmentStart == 0) {
1446                 // s does not contain any U+FFFE.
1447                 return FALSE;
1448             }
1449         } else if(s[i] != 0xfffe) {
1450             ++i;
1451             continue;
1452         }
1453         // Get the sort key for another segment and merge it into mergedKey.
1454         CollationKey key1(mergedKey.getAlias(), mergedKeyLength);  // copies the bytes
1455         CollationKey key2;
1456         coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCode);
1457         int32_t key1Length, key2Length;
1458         const uint8_t *key1Bytes = key1.getByteArray(key1Length);
1459         const uint8_t *key2Bytes = key2.getByteArray(key2Length);
1460         uint8_t *dest;
1461         int32_t minCapacity = key1Length + key2Length;
1462         if(key1Length > 0) { --minCapacity; }
1463         if(minCapacity <= mergedKeyCapacity) {
1464             dest = mergedKey.getAlias();
1465         } else {
1466             if(minCapacity <= 200) {
1467                 mergedKeyCapacity = 200;
1468             } else if(minCapacity <= 2 * mergedKeyCapacity) {
1469                 mergedKeyCapacity *= 2;
1470             } else {
1471                 mergedKeyCapacity = minCapacity;
1472             }
1473             dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity);
1474         }
1475         U_ASSERT(dest != NULL || mergedKeyCapacity == 0);
1476         if(key1Length == 0) {
1477             // key2 is the sort key for the first segment.
1478             uprv_memcpy(dest, key2Bytes, key2Length);
1479             mergedKeyLength = key2Length;
1480         } else {
1481             mergedKeyLength =
1482                 ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length,
1483                                    dest, mergedKeyCapacity);
1484         }
1485         if(i == sLength) { break; }
1486         segmentStart = ++i;
1487     }
1488     key = CollationKey(mergedKey.getAlias(), mergedKeyLength);
1489     return TRUE;
1490 }
1491
1492 namespace {
1493
1494 /**
1495  * Replaces unpaired surrogates with U+FFFD.
1496  * Returns s if no replacement was made, otherwise buffer.
1497  */
1498 const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buffer) {
1499     int32_t i = 0;
1500     while(i < s.length()) {
1501         UChar32 c = s.char32At(i);
1502         if(U_IS_SURROGATE(c)) {
1503             if(buffer.length() < i) {
1504                 buffer.append(s, buffer.length(), i - buffer.length());
1505             }
1506             buffer.append((UChar)0xfffd);
1507         }
1508         i += U16_LENGTH(c);
1509     }
1510     if(buffer.isEmpty()) {
1511         return s;
1512     }
1513     if(buffer.length() < i) {
1514         buffer.append(s, buffer.length(), i - buffer.length());
1515     }
1516     return buffer;
1517 }
1518
1519 int32_t getDifferenceLevel(const CollationKey &prevKey, const CollationKey &key,
1520                            UCollationResult order, UBool collHasCaseLevel) {
1521     if(order == UCOL_EQUAL) {
1522         return Collation::NO_LEVEL;
1523     }
1524     int32_t prevKeyLength;
1525     const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
1526     int32_t keyLength;
1527     const uint8_t *bytes = key.getByteArray(keyLength);
1528     int32_t level = Collation::PRIMARY_LEVEL;
1529     for(int32_t i = 0;; ++i) {
1530         uint8_t b = prevBytes[i];
1531         if(b != bytes[i]) { break; }
1532         if(b == Collation::LEVEL_SEPARATOR_BYTE) {
1533             ++level;
1534             if(level == Collation::CASE_LEVEL && !collHasCaseLevel) {
1535                 ++level;
1536             }
1537         }
1538     }
1539     return level;
1540 }
1541
1542 }
1543
1544 UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
1545                                      const UnicodeString &prevString, const UnicodeString &s,
1546                                      UCollationResult expectedOrder, Collation::Level expectedLevel,
1547                                      IcuTestErrorCode &errorCode) {
1548     if(errorCode.isFailure()) { return FALSE; }
1549
1550     // Get the sort keys first, for error debug output.
1551     CollationKey prevKey;
1552     if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.length(),
1553                         prevKey, errorCode)) {
1554         return FALSE;
1555     }
1556     CollationKey key;
1557     if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCode)) { return FALSE; }
1558
1559     UCollationResult order = coll->compare(prevString, s, errorCode);
1560     if(order != expectedOrder || errorCode.isFailure()) {
1561         infoln(fileTestName);
1562         errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)",
1563               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1564         infoln(prevFileLine);
1565         infoln(fileLine);
1566         infoln(printCollationKey(prevKey));
1567         infoln(printCollationKey(key));
1568         return FALSE;
1569     }
1570     order = coll->compare(s, prevString, errorCode);
1571     if(order != -expectedOrder || errorCode.isFailure()) {
1572         infoln(fileTestName);
1573         errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)",
1574               (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1575         infoln(prevFileLine);
1576         infoln(fileLine);
1577         infoln(printCollationKey(prevKey));
1578         infoln(printCollationKey(key));
1579         return FALSE;
1580     }
1581     // Test NUL-termination if the strings do not contain NUL characters.
1582     UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0) >= 0;
1583     if(!containNUL) {
1584         order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, errorCode);
1585         if(order != expectedOrder || errorCode.isFailure()) {
1586             infoln(fileTestName);
1587             errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1588                   (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1589             infoln(prevFileLine);
1590             infoln(fileLine);
1591             infoln(printCollationKey(prevKey));
1592             infoln(printCollationKey(key));
1593             return FALSE;
1594         }
1595         order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, errorCode);
1596         if(order != -expectedOrder || errorCode.isFailure()) {
1597             infoln(fileTestName);
1598             errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1599                   (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1600             infoln(prevFileLine);
1601             infoln(fileLine);
1602             infoln(printCollationKey(prevKey));
1603             infoln(printCollationKey(key));
1604             return FALSE;
1605         }
1606     }
1607
1608     // compare(UTF-16) treats unpaired surrogates like unassigned code points.
1609     // Unpaired surrogates cannot be converted to UTF-8.
1610     // Create valid UTF-16 strings if necessary, and use those for
1611     // both the expected compare() result and for the input to compare(UTF-8).
1612     UnicodeString prevBuffer, sBuffer;
1613     const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer);
1614     const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer);
1615     std::string prevUTF8, sUTF8;
1616     UnicodeString(prevValid).toUTF8String(prevUTF8);
1617     UnicodeString(sValid).toUTF8String(sUTF8);
1618     UCollationResult expectedUTF8Order;
1619     if(&prevValid == &prevString && &sValid == &s) {
1620         expectedUTF8Order = expectedOrder;
1621     } else {
1622         expectedUTF8Order = coll->compare(prevValid, sValid, errorCode);
1623     }
1624
1625     order = coll->compareUTF8(prevUTF8, sUTF8, errorCode);
1626     if(order != expectedUTF8Order || errorCode.isFailure()) {
1627         infoln(fileTestName);
1628         errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)",
1629               (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1630         infoln(prevFileLine);
1631         infoln(fileLine);
1632         infoln(printCollationKey(prevKey));
1633         infoln(printCollationKey(key));
1634         return FALSE;
1635     }
1636     order = coll->compareUTF8(sUTF8, prevUTF8, errorCode);
1637     if(order != -expectedUTF8Order || errorCode.isFailure()) {
1638         infoln(fileTestName);
1639         errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)",
1640               (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1641         infoln(prevFileLine);
1642         infoln(fileLine);
1643         infoln(printCollationKey(prevKey));
1644         infoln(printCollationKey(key));
1645         return FALSE;
1646     }
1647     // Test NUL-termination if the strings do not contain NUL characters.
1648     if(!containNUL) {
1649         order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -1, errorCode);
1650         if(order != expectedUTF8Order || errorCode.isFailure()) {
1651             infoln(fileTestName);
1652             errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1653                   (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1654             infoln(prevFileLine);
1655             infoln(fileLine);
1656             infoln(printCollationKey(prevKey));
1657             infoln(printCollationKey(key));
1658             return FALSE;
1659         }
1660         order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -1, errorCode);
1661         if(order != -expectedUTF8Order || errorCode.isFailure()) {
1662             infoln(fileTestName);
1663             errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1664                   (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1665             infoln(prevFileLine);
1666             infoln(fileLine);
1667             infoln(printCollationKey(prevKey));
1668             infoln(printCollationKey(key));
1669             return FALSE;
1670         }
1671     }
1672
1673     UCharIterator leftIter;
1674     UCharIterator rightIter;
1675     uiter_setString(&leftIter, prevString.getBuffer(), prevString.length());
1676     uiter_setString(&rightIter, s.getBuffer(), s.length());
1677     order = coll->compare(leftIter, rightIter, errorCode);
1678     if(order != expectedOrder || errorCode.isFailure()) {
1679         infoln(fileTestName);
1680         errln("line %d Collator(%s).compare(UCharIterator: previous, current) "
1681               "wrong order: %d != %d (%s)",
1682               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1683         infoln(prevFileLine);
1684         infoln(fileLine);
1685         infoln(printCollationKey(prevKey));
1686         infoln(printCollationKey(key));
1687         return FALSE;
1688     }
1689
1690     order = prevKey.compareTo(key, errorCode);
1691     if(order != expectedOrder || errorCode.isFailure()) {
1692         infoln(fileTestName);
1693         errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)",
1694               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1695         infoln(prevFileLine);
1696         infoln(fileLine);
1697         infoln(printCollationKey(prevKey));
1698         infoln(printCollationKey(key));
1699         return FALSE;
1700     }
1701     UBool collHasCaseLevel = coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON;
1702     int32_t level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1703     if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1704         if(level != expectedLevel) {
1705             infoln(fileTestName);
1706             errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
1707                   (int)fileLineNumber, norm, order, level, expectedLevel);
1708             infoln(prevFileLine);
1709             infoln(fileLine);
1710             infoln(printCollationKey(prevKey));
1711             infoln(printCollationKey(key));
1712             return FALSE;
1713         }
1714     }
1715
1716     // If either string contains U+FFFE, then their sort keys must compare the same as
1717     // the merged sort keys of each string's between-FFFE segments.
1718     //
1719     // It is not required that
1720     //   sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2))
1721     // only that those two methods yield the same order.
1722     //
1723     // Use bit-wise OR so that getMergedCollationKey() is always called for both strings.
1724     if((getMergedCollationKey(prevString.getBuffer(), prevString.length(), prevKey, errorCode) |
1725                 getMergedCollationKey(s.getBuffer(), s.length(), key, errorCode)) ||
1726             errorCode.isFailure()) {
1727         order = prevKey.compareTo(key, errorCode);
1728         if(order != expectedOrder || errorCode.isFailure()) {
1729             infoln(fileTestName);
1730             errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1731                 "(previous, current segments between U+FFFE)).compareTo() wrong order: %d != %d (%s)",
1732                 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1733             infoln(prevFileLine);
1734             infoln(fileLine);
1735             infoln(printCollationKey(prevKey));
1736             infoln(printCollationKey(key));
1737             return FALSE;
1738         }
1739         int32_t mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1740         if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1741             if(mergedLevel != level) {
1742                 infoln(fileTestName);
1743                 errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1744                     "(previous, current segments between U+FFFE)).compareTo()=%d wrong level: %d != %d",
1745                     (int)fileLineNumber, norm, order, mergedLevel, level);
1746                 infoln(prevFileLine);
1747                 infoln(fileLine);
1748                 infoln(printCollationKey(prevKey));
1749                 infoln(printCollationKey(key));
1750                 return FALSE;
1751             }
1752         }
1753     }
1754     return TRUE;
1755 }
1756
1757 void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1758     if(errorCode.isFailure()) { return; }
1759     UnicodeString prevFileLine = UNICODE_STRING("(none)", 6);
1760     UnicodeString prevString, s;
1761     prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1762     while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1763         // Parse the line even if it will be ignored (when we do not have a Collator)
1764         // in order to report syntax issues.
1765         Collation::Level relation = parseRelationAndString(s, errorCode);
1766         if(errorCode.isFailure()) {
1767             errorCode.reset();
1768             break;
1769         }
1770         if(coll == NULL) {
1771             // We were unable to create the Collator but continue with tests.
1772             // Ignore test data for this Collator.
1773             // The next Collator creation might work.
1774             continue;
1775         }
1776         UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? UCOL_EQUAL : UCOL_LESS;
1777         Collation::Level expectedLevel = relation;
1778         s.getTerminatedBuffer();  // Ensure NUL-termination.
1779         UBool isOk = TRUE;
1780         if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) {
1781             coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode);
1782             isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
1783                                    expectedOrder, expectedLevel, errorCode);
1784         }
1785         if(isOk) {
1786             coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode);
1787             isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
1788                                    expectedOrder, expectedLevel, errorCode);
1789         }
1790         if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormalized(s, errorCode))) {
1791             UnicodeString pn = nfd->normalize(prevString, errorCode);
1792             UnicodeString n = nfd->normalize(s, errorCode);
1793             pn.getTerminatedBuffer();
1794             n.getTerminatedBuffer();
1795             errorCode.assertSuccess();
1796             isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
1797                                    expectedOrder, expectedLevel, errorCode);
1798         }
1799         if(!isOk) {
1800             errorCode.reset();  // already reported
1801         }
1802         prevFileLine = fileLine;
1803         prevString = s;
1804         prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1805     }
1806 }
1807
1808 void CollationTest::TestDataDriven() {
1809     IcuTestErrorCode errorCode(*this, "TestDataDriven");
1810
1811     fcd = Normalizer2Factory::getFCDInstance(errorCode);
1812     nfd = Normalizer2::getNFDInstance(errorCode);
1813     if(errorCode.logDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) {
1814         return;
1815     }
1816
1817     CharString path(getSourceTestData(errorCode), errorCode);
1818     path.appendPathPart("collationtest.txt", errorCode);
1819     const char *codePage = "UTF-8";
1820     LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, errorCode));
1821     if(errorCode.logIfFailureAndReset("ucbuf_open(collationtest.txt)")) {
1822         return;
1823     }
1824     // Read a new line if necessary.
1825     // Sub-parsers leave the first line set that they do not handle.
1826     while(errorCode.isSuccess() && (!fileLine.isEmpty() || readNonEmptyLine(f.getAlias(), errorCode))) {
1827         if(!isSectionStarter(fileLine[0])) {
1828             errln("syntax error on line %d", (int)fileLineNumber);
1829             infoln(fileLine);
1830             return;
1831         }
1832         if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) {
1833             fileTestName = fileLine;
1834             logln(fileLine);
1835             fileLine.remove();
1836         } else if(fileLine == UNICODE_STRING("@ root", 6)) {
1837             setRootCollator(errorCode);
1838             fileLine.remove();
1839         } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) {
1840             setLocaleCollator(errorCode);
1841             fileLine.remove();
1842         } else if(fileLine == UNICODE_STRING("@ rules", 7)) {
1843             buildTailoring(f.getAlias(), errorCode);
1844         } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) {  // %
1845             parseAndSetAttribute(errorCode);
1846         } else if(fileLine == UNICODE_STRING("* compare", 9)) {
1847             checkCompareStrings(f.getAlias(), errorCode);
1848         } else {
1849             errln("syntax error on line %d", (int)fileLineNumber);
1850             infoln(fileLine);
1851             return;
1852         }
1853     }
1854 }
1855
1856 #endif  // !UCONFIG_NO_COLLATION