icuSources/test/intltest/collationtest.cpp

   1 /*
   2 *******************************************************************************
   3 * Copyright (C) 2012-2015, International Business Machines
   4 * Corporation and others.  All Rights Reserved.
   5 *******************************************************************************
   6 * collationtest.cpp
   7 *
   8 * created on: 2012apr27
   9 * created by: Markus W. Scherer
  10 */
  11
  12 #include "unicode/utypes.h"
  13
  14 #if !UCONFIG_NO_COLLATION
  15
  16 #include "unicode/coll.h"
  17 #include "unicode/errorcode.h"
  18 #include "unicode/localpointer.h"
  19 #include "unicode/normalizer2.h"
  20 #include "unicode/sortkey.h"
  21 #include "unicode/std_string.h"
  22 #include "unicode/strenum.h"
  23 #include "unicode/tblcoll.h"
  24 #include "unicode/uiter.h"
  25 #include "unicode/uniset.h"
  26 #include "unicode/unistr.h"
  27 #include "unicode/usetiter.h"
  28 #include "unicode/ustring.h"
  29 #include "charstr.h"
  30 #include "cmemory.h"
  31 #include "collation.h"
  32 #include "collationdata.h"
  33 #include "collationfcd.h"
  34 #include "collationiterator.h"
  35 #include "collationroot.h"
  36 #include "collationrootelements.h"
  37 #include "collationruleparser.h"
  38 #include "collationweights.h"
  39 #include "cstring.h"
  40 #include "intltest.h"
  41 #include "normalizer2impl.h"
  42 #include "ucbuf.h"
  43 #include "uhash.h"
  44 #include "uitercollationiterator.h"
  45 #include "utf16collationiterator.h"
  46 #include "utf8collationiterator.h"
  47 #include "uvectr32.h"
  48 #include "uvectr64.h"
  49 #include "writesrc.h"
  50
// TODO: Move to ucbuf.h
// Defines the helper type LocalUCHARBUFPointer for UCHARBUF objects,
// with ucbuf_close as the associated close function.
U_DEFINE_LOCAL_OPEN_POINTER(LocalUCHARBUFPointer, UCHARBUF, ucbuf_close);

// Forward declaration; the class is defined further down in this file
// and is used in the signature of CollationTest::checkFCD().
class CodePointIterator;
  55
  56 // TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey)
  57
/**
 * Collation test suite, plugged into the IntlTest framework.
 *
 * The public Test*() methods are the individual test cases dispatched by
 * runIndexedTest(). The private section holds helpers for those tests and
 * the parsing state (current line, line number, test name, collator) used by
 * the data-driven test.
 */
class CollationTest : public IntlTest {
public:
    // All pointers start out NULL and the line counter at 0.
    CollationTest()
            : fcd(NULL), nfd(NULL),
              fileLineNumber(0),
              coll(NULL) {}

    // The collator is the only member released here.
    ~CollationTest() {
        delete coll;
    }

    // Test dispatcher required by IntlTest.
    void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);

    // Individual test cases.
    void TestMinMax();
    void TestImplicits();
    void TestNulTerminated();
    void TestIllegalUTF8();
    void TestShortFCDData();
    void TestFCD();
    void TestCollationWeights();
    void TestRootElements();
    void TestTailoredElements();
    void TestDataDriven();

private:
    // Compares the code points delivered by ci with the expected ones from cpi,
    // iterating forward and backward.
    void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
    // Allocates n weights between the limits and checks the result;
    // at least minCount of them must have length someLength.
    void checkAllocWeights(CollationWeights &cw,
                           uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
                           int32_t someLength, int32_t minCount);

    static UnicodeString printSortKey(const uint8_t *p, int32_t length);
    static UnicodeString printCollationKey(const CollationKey &key);

    // Helpers & fields for data-driven test.
    // TRUE for LF (U+000A) or CR (U+000D).
    static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; }
    // TRUE for TAB (U+0009), SPACE (U+0020) or IDEOGRAPHIC SPACE (U+3000).
    static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; }
    static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c == 0x40; }  // %*@
    // Returns the index of the first character of fileLine at or after i
    // for which isSpace() is FALSE.
    int32_t skipSpaces(int32_t i) {
        while(isSpace(fileLine[i])) { ++i; }
        return i;
    }

    UBool readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode);
    void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode);
    Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode);
    void parseAndSetAttribute(IcuTestErrorCode &errorCode);
    void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode);
    void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode);
    void setRootCollator(IcuTestErrorCode &errorCode);
    void setLocaleCollator(IcuTestErrorCode &errorCode);

    UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const;

    UBool getSortKeyParts(const UChar *s, int32_t length,
                          CharString &dest, int32_t partSize,
                          IcuTestErrorCode &errorCode);
    UBool getCollationKey(const char *norm, const UnicodeString &line,
                          const UChar *s, int32_t length,
                          CollationKey &key, IcuTestErrorCode &errorCode);
    UBool getMergedCollationKey(const UChar *s, int32_t length,
                                CollationKey &key, IcuTestErrorCode &errorCode);
    UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
                          const UnicodeString &prevString, const UnicodeString &s,
                          UCollationResult expectedOrder, Collation::Level expectedLevel,
                          IcuTestErrorCode &errorCode);
    void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode);

    // Normalizer instances; not deleted by the destructor of this class.
    const Normalizer2 *fcd, *nfd;
    // Text of the test-data line currently being processed.
    UnicodeString fileLine;
    // Line counter for the test-data file; starts at 0.
    int32_t fileLineNumber;
    UnicodeString fileTestName;
    // Collator under test; deleted in the destructor.
    Collator *coll;
};
 131
 132 extern IntlTest *createCollationTest() {
 133     return new CollationTest();
 134 }
 135
// IntlTest dispatcher for this suite.
// Logs the suite name when exec is set, then lets the TESTCASE_AUTO macros
// handle the index/exec/name parameters for each test case listed below.
// The last parameter (par) is unused.
void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
    if(exec) {
        logln("TestSuite CollationTest: ");
    }
    // NOTE(review): the position of each entry in this list presumably determines
    // its test index; keep the list in sync with the declarations in the class.
    TESTCASE_AUTO_BEGIN;
    TESTCASE_AUTO(TestMinMax);
    TESTCASE_AUTO(TestImplicits);
    TESTCASE_AUTO(TestNulTerminated);
    TESTCASE_AUTO(TestIllegalUTF8);
    TESTCASE_AUTO(TestShortFCDData);
    TESTCASE_AUTO(TestFCD);
    TESTCASE_AUTO(TestCollationWeights);
    TESTCASE_AUTO(TestRootElements);
    TESTCASE_AUTO(TestTailoredElements);
    TESTCASE_AUTO(TestDataDriven);
    TESTCASE_AUTO_END;
}
 153
 154 void CollationTest::TestMinMax() {
 155     IcuTestErrorCode errorCode(*this, "TestMinMax");
 156
 157     setRootCollator(errorCode);
 158     if(errorCode.isFailure()) {
 159         errorCode.reset();
 160         return;
 161     }
 162     RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll);
 163     if(rbc == NULL) {
 164         errln("the root collator is not a RuleBasedCollator");
 165         return;
 166     }
 167
 168     static const UChar s[2] = { 0xfffe, 0xffff };
 169     UVector64 ces(errorCode);
 170     rbc->internalGetCEs(UnicodeString(FALSE, s, 2), ces, errorCode);
 171     errorCode.assertSuccess();
 172     if(ces.size() != 2) {
 173         errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size());
 174         return;
 175     }
 176     int64_t ce = ces.elementAti(0);
 177     int64_t expected = Collation::makeCE(Collation::MERGE_SEPARATOR_PRIMARY);
 178     if(ce != expected) {
 179         errln("CE(U+fffe)=%04lx != 02..", (long)ce);
 180     }
 181
 182     ce = ces.elementAti(1);
 183     expected = Collation::makeCE(Collation::MAX_PRIMARY);
 184     if(ce != expected) {
 185         errln("CE(U+ffff)=%04lx != max..", (long)ce);
 186     }
 187 }
 188
 189 void CollationTest::TestImplicits() {
 190     IcuTestErrorCode errorCode(*this, "TestImplicits");
 191
 192     const CollationData *cd = CollationRoot::getData(errorCode);
 193     if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
 194         return;
 195     }
 196
 197     // Implicit primary weights should be assigned for the following sets,
 198     // and sort in ascending order by set and then code point.
 199     // See http://www.unicode.org/reports/tr10/#Implicit_Weights
 200
 201     // core Han Unified Ideographs
 202     UnicodeSet coreHan("[\\p{unified_ideograph}&"
 203                             "[\\p{Block=CJK_Unified_Ideographs}"
 204                             "\\p{Block=CJK_Compatibility_Ideographs}]]",
 205                        errorCode);
 206     // all other Unified Han ideographs
 207     UnicodeSet otherHan("[\\p{unified ideograph}-"
 208                             "[\\p{Block=CJK_Unified_Ideographs}"
 209                             "\\p{Block=CJK_Compatibility_Ideographs}]]",
 210                         errorCode);
 211     UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode);
 212     unassigned.remove(0xfffe, 0xffff);  // These have special CLDR root mappings.
 213
 214     // Starting with CLDR 26/ICU 54, the root Han order may instead be
 215     // the Unihan radical-stroke order.
 216     // The tests should pass either way, so we only test the order of a small set of Han characters
 217     // whose radical-stroke order is the same as their code point order.
 218     UnicodeSet someHanInCPOrder(
 219             "[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48"
 220             "\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3-\\u50F6]",
 221             errorCode);
 222     UnicodeSet inOrder(someHanInCPOrder);
 223     inOrder.addAll(unassigned).freeze();
 224     if(errorCode.logIfFailureAndReset("UnicodeSet")) {
 225         return;
 226     }
 227     const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned };
 228     UChar32 prev = 0;
 229     uint32_t prevPrimary = 0;
 230     UTF16CollationIterator ci(cd, FALSE, NULL, NULL, NULL);
 231     for(int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
 232         LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i]));
 233         while(iter->next()) {
 234             UChar32 c = iter->getCodepoint();
 235             UnicodeString s(c);
 236             ci.setText(s.getBuffer(), s.getBuffer() + s.length());
 237             int64_t ce = ci.nextCE(errorCode);
 238             int64_t ce2 = ci.nextCE(errorCode);
 239             if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
 240                 return;
 241             }
 242             if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) {
 243                 errln("CollationIterator.nextCE(U+%04lx) did not yield exactly one CE", (long)c);
 244                 continue;
 245             }
 246             if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) {
 247                 errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx",
 248                       (long)c, (long)(ce & 0xffffffff));
 249                 continue;
 250             }
 251             uint32_t primary = (uint32_t)(ce >> 32);
 252             if(!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) {
 253                 errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..",
 254                       (long)c, (long)primary, (long)prev, (long)prevPrimary);
 255             }
 256             prev = c;
 257             prevPrimary = primary;
 258         }
 259     }
 260 }
 261
 262 void CollationTest::TestNulTerminated() {
 263     IcuTestErrorCode errorCode(*this, "TestNulTerminated");
 264     const CollationData *data = CollationRoot::getData(errorCode);
 265     if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
 266         return;
 267     }
 268
 269     static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 };
 270
 271     UTF16CollationIterator ci1(data, FALSE, s, s, s + 2);
 272     UTF16CollationIterator ci2(data, FALSE, s + 2, s + 2, NULL);
 273     for(int32_t i = 0;; ++i) {
 274         int64_t ce1 = ci1.nextCE(errorCode);
 275         int64_t ce2 = ci2.nextCE(errorCode);
 276         if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
 277             return;
 278         }
 279         if(ce1 != ce2) {
 280             errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i);
 281             break;
 282         }
 283         if(ce1 == Collation::NO_CE) { break; }
 284     }
 285 }
 286
 287 void CollationTest::TestIllegalUTF8() {
 288     IcuTestErrorCode errorCode(*this, "TestIllegalUTF8");
 289
 290     setRootCollator(errorCode);
 291     if(errorCode.isFailure()) {
 292         errorCode.reset();
 293         return;
 294     }
 295     coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
 296
 297     static const char *strings[] = {
 298         // U+FFFD
 299         "a\xef\xbf\xbdz",
 300         // illegal byte sequences
 301         "a\x80z",  // trail byte
 302         "a\xc1\x81z",  // non-shortest form
 303         "a\xe0\x82\x83z",  // non-shortest form
 304         "a\xed\xa0\x80z",  // lead surrogate: would be U+D800
 305         "a\xed\xbf\xbfz",  // trail surrogate: would be U+DFFF
 306         "a\xf0\x8f\xbf\xbfz",  // non-shortest form
 307         "a\xf4\x90\x80\x80z"  // out of range: would be U+110000
 308     };
 309
 310     StringPiece fffd(strings[0]);
 311     for(int32_t i = 1; i < UPRV_LENGTHOF(strings); ++i) {
 312         StringPiece illegal(strings[i]);
 313         UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
 314         if(order != UCOL_EQUAL) {
 315             errln("compareUTF8(U+FFFD, string %d with illegal UTF-8)=%d != UCOL_EQUAL",
 316                   (int)i, order);
 317         }
 318     }
 319 }
 320
 321 namespace {
 322
 323 void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) {
 324     for(UChar32 c = 0x10000; c < 0x110000;) {
 325         UChar32 next = c + 0x400;
 326         if(src.containsSome(c, next - 1)) {
 327             dest.add(U16_LEAD(c));
 328         }
 329         c = next;
 330     }
 331 }
 332
 333 }  // namespace
 334
 335 void CollationTest::TestShortFCDData() {
 336     // See CollationFCD class comments.
 337     IcuTestErrorCode errorCode(*this, "TestShortFCDData");
 338     UnicodeSet expectedLccc("[:^lccc=0:]", errorCode);
 339     errorCode.assertSuccess();
 340     expectedLccc.add(0xdc00, 0xdfff);  // add all trail surrogates
 341     addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
 342     UnicodeSet lccc;  // actual
 343     for(UChar32 c = 0; c <= 0xffff; ++c) {
 344         if(CollationFCD::hasLccc(c)) { lccc.add(c); }
 345     }
 346     UnicodeSet diff(expectedLccc);
 347     diff.removeAll(lccc);
 348     diff.remove(0x10000, 0x10ffff);  // hasLccc() only works for the BMP
 349     UnicodeString empty("[]");
 350     UnicodeString diffString;
 351     diff.toPattern(diffString, TRUE);
 352     assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
 353     diff = lccc;
 354     diff.removeAll(expectedLccc);
 355     diff.toPattern(diffString, TRUE);
 356     assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, TRUE);
 357
 358     UnicodeSet expectedTccc("[:^tccc=0:]", errorCode);
 359     if (errorCode.isSuccess()) {
 360         addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
 361         addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
 362         UnicodeSet tccc;  // actual
 363         for(UChar32 c = 0; c <= 0xffff; ++c) {
 364             if(CollationFCD::hasTccc(c)) { tccc.add(c); }
 365         }
 366         diff = expectedTccc;
 367         diff.removeAll(tccc);
 368         diff.remove(0x10000, 0x10ffff);  // hasTccc() only works for the BMP
 369         assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
 370         diff = tccc;
 371         diff.removeAll(expectedTccc);
 372         diff.toPattern(diffString, TRUE);
 373         assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
 374     }
 375 }
 376
 377 class CodePointIterator {
 378 public:
 379     CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length), pos(0) {}
 380     void resetToStart() { pos = 0; }
 381     UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; }
 382     UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; }
 383     int32_t getLength() const { return length; }
 384     int getIndex() const { return (int)pos; }
 385 private:
 386     const UChar32 *cp;
 387     int32_t length;
 388     int32_t pos;
 389 };
 390
 391 void CollationTest::checkFCD(const char *name,
 392                              CollationIterator &ci, CodePointIterator &cpi) {
 393     IcuTestErrorCode errorCode(*this, "checkFCD");
 394
 395     // Iterate forward to the limit.
 396     for(;;) {
 397         UChar32 c1 = ci.nextCodePoint(errorCode);
 398         UChar32 c2 = cpi.next();
 399         if(c1 != c2) {
 400             errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d",
 401                   name, (long)c1, (long)c2, cpi.getIndex());
 402             return;
 403         }
 404         if(c1 < 0) { break; }
 405     }
 406
 407     // Iterate backward most of the way.
 408     for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) {
 409         UChar32 c1 = ci.previousCodePoint(errorCode);
 410         UChar32 c2 = cpi.previous();
 411         if(c1 != c2) {
 412             errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d",
 413                   name, (long)c1, (long)c2, cpi.getIndex());
 414             return;
 415         }
 416     }
 417
 418     // Forward again.
 419     for(;;) {
 420         UChar32 c1 = ci.nextCodePoint(errorCode);
 421         UChar32 c2 = cpi.next();
 422         if(c1 != c2) {
 423             errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d",
 424                   name, (long)c1, (long)c2, cpi.getIndex());
 425             return;
 426         }
 427         if(c1 < 0) { break; }
 428     }
 429
 430     // Iterate backward to the start.
 431     for(;;) {
 432         UChar32 c1 = ci.previousCodePoint(errorCode);
 433         UChar32 c2 = cpi.previous();
 434         if(c1 != c2) {
 435             errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d",
 436                   name, (long)c1, (long)c2, cpi.getIndex());
 437             return;
 438         }
 439         if(c1 < 0) { break; }
 440     }
 441 }
 442
// Feeds one non-FCD input string through each FCD-checking collation iterator
// (UTF-16, UTF-8 when std::string is available, and UCharIterator-based)
// and uses checkFCD() to verify that each delivers the expected,
// canonically reordered/decomposed code point sequence in cp[].
void CollationTest::TestFCD() {
    IcuTestErrorCode errorCode(*this, "TestFCD");
    const CollationData *data = CollationRoot::getData(errorCode);
    if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
        return;
    }

    // Input string, not FCD, NUL-terminated.
    static const UChar s[] = {
        0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62,
        U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),  // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
        0x327, 0x308,  // ccc=202, 230
        U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),  // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
        U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),
        U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),
        0xac01,
        0xe7,  // Character with tccc!=0 decomposed together with mis-ordered sequence.
        U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165),
        0xe1,  // Character with tccc!=0 decomposed together with decomposed sequence.
        0xf73, 0xf75,  // Tibetan composite vowels must be decomposed.
        0x4e00, 0xf81,
        0
    };
    // Expected code points.
    static const UChar32 cp[] = {
        0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
        0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
        0x1D15F, 0x1D16D,
        0xac01,
        0x63, 0x327, 0x1D165, 0x1D16D,
        0x61,
        0xf71, 0xf71, 0xf72, 0xf74, 0x301,
        0x4e00, 0xf71, 0xf80
    };

    // UTF-16 iterator over the NUL-terminated input (limit NULL).
    FCDUTF16CollationIterator u16ci(data, FALSE, s, s, NULL);
    if(errorCode.logIfFailureAndReset("FCDUTF16CollationIterator constructor")) {
        return;
    }
    CodePointIterator cpi(cp, UPRV_LENGTHOF(cp));
    checkFCD("FCDUTF16CollationIterator", u16ci, cpi);

#if U_HAVE_STD_STRING
    // UTF-8 iterator over the same text, converted to UTF-8; length -1.
    // The reference iterator is shared, so rewind it first.
    cpi.resetToStart();
    std::string utf8;
    UnicodeString(s).toUTF8String(utf8);
    FCDUTF8CollationIterator u8ci(data, FALSE,
                                  reinterpret_cast<const uint8_t *>(utf8.c_str()), 0, -1);
    if(errorCode.logIfFailureAndReset("FCDUTF8CollationIterator constructor")) {
        return;
    }
    checkFCD("FCDUTF8CollationIterator", u8ci, cpi);
#endif

    // UCharIterator-based iterator over the same UTF-16 text.
    cpi.resetToStart();
    UCharIterator iter;
    uiter_setString(&iter, s, UPRV_LENGTHOF(s) - 1);  // -1: without the terminating NUL
    FCDUIterCollationIterator uici(data, FALSE, iter, 0);
    if(errorCode.logIfFailureAndReset("FCDUIterCollationIterator constructor")) {
        return;
    }
    checkFCD("FCDUIterCollationIterator", uici, cpi);
}
 506
 507 void CollationTest::checkAllocWeights(CollationWeights &cw,
 508                                       uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
 509                                       int32_t someLength, int32_t minCount) {
 510     if(!cw.allocWeights(lowerLimit, upperLimit, n)) {
 511         errln("CollationWeights::allocWeights(%lx, %lx, %ld) = FALSE",
 512               (long)lowerLimit, (long)upperLimit, (long)n);
 513         return;
 514     }
 515     uint32_t previous = lowerLimit;
 516     int32_t count = 0;  // number of weights that have someLength
 517     for(int32_t i = 0; i < n; ++i) {
 518         uint32_t w = cw.nextWeight();
 519         if(w == 0xffffffff) {
 520             errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
 521                   "returns only %ld weights",
 522                   (long)lowerLimit, (long)upperLimit, (long)n, (long)i);
 523             return;
 524         }
 525         if(!(previous < w && w < upperLimit)) {
 526             errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
 527                   "number %ld -> %lx not between %lx and %lx",
 528                   (long)lowerLimit, (long)upperLimit, (long)n,
 529                   (long)(i + 1), (long)w, (long)previous, (long)upperLimit);
 530             return;
 531         }
 532         if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; }
 533     }
 534     if(count < minCount) {
 535         errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
 536               "returns only %ld < %ld weights of length %d",
 537               (long)lowerLimit, (long)upperLimit, (long)n,
 538               (long)count, (long)minCount, (int)someLength);
 539     }
 540 }
 541
 542 void CollationTest::TestCollationWeights() {
 543     CollationWeights cw;
 544
 545     // Non-compressible primaries use 254 second bytes 02..FF.
 546     logln("CollationWeights.initForPrimary(non-compressible)");
 547     cw.initForPrimary(FALSE);
 548     // Expect 1 weight 11 and 254 weights 12xx.
 549     checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1);
 550     checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254);
 551     // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
 552     checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255);
 553     // Expect 254 two-byte weights from the ranges 10ff and 11xx.
 554     checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254);
 555     // Expect 254^2=64516 three-byte weights.
 556     // During computation, there should be 3 three-byte ranges
 557     // 10ffff, 11xxxx, 120202.
 558     // The middle one should be split 64515:1,
 559     // and the newly-split-off range and the last ranged lengthened.
 560     checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516);
 561     // Expect weights 1102 & 1103.
 562     checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2);
 563     // Expect weights 102102 & 102103.
 564     checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
 565
 566     // Compressible primaries use 251 second bytes 04..FE.
 567     logln("CollationWeights.initForPrimary(compressible)");
 568     cw.initForPrimary(TRUE);
 569     // Expect 1 weight 11 and 251 weights 12xx.
 570     checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1);
 571     checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251);
 572     // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
 573     checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252);
 574     // Expect weights 1104 & 1105.
 575     checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2);
 576     // Expect weights 102102 & 102103.
 577     checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
 578
 579     // Secondary and tertiary weights use only bytes 3 & 4.
 580     logln("CollationWeights.initForSecondary()");
 581     cw.initForSecondary();
 582     // Expect weights fbxx and all four fc..ff.
 583     checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4);
 584
 585     logln("CollationWeights.initForTertiary()");
 586     cw.initForTertiary();
 587     // Expect weights 3dxx and both 3e & 3f.
 588     checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2);
 589 }
 590
 591 namespace {
 592
// Checks whether the given primary/secondary/case-tertiary-quaternary parts
// form a valid, well-formed root collation element.
//
// re    root elements accessor; supplies the secondary/tertiary boundaries
//       and the last common secondary weight
// data  collation data; used to ask whether a primary lead byte is compressible
// p     primary weight, examined as four bytes p1..p4 (p1 is the lead byte)
// s     secondary weight, examined as two bytes s1 (lead) and s2
// ctq   case bits, tertiary weight and quaternary bits in one value
//
// Returns TRUE only if all of the byte-level and structural checks below pass.
UBool isValidCE(const CollationRootElements &re, const CollationData &data,
                uint32_t p, uint32_t s, uint32_t ctq) {
    uint32_t p1 = p >> 24;
    uint32_t p2 = (p >> 16) & 0xff;
    uint32_t p3 = (p >> 8) & 0xff;
    uint32_t p4 = p & 0xff;
    uint32_t s1 = s >> 8;
    uint32_t s2 = s & 0xff;
    // ctq = Case, Tertiary, Quaternary
    uint32_t c = (ctq & Collation::CASE_MASK) >> 14;
    uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK;
    uint32_t t1 = t >> 8;
    uint32_t t2 = t & 0xff;
    uint32_t q = ctq & Collation::QUATERNARY_MASK;
    // No leading zero bytes.
    if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
        return FALSE;
    }
    // No intermediate zero bytes.
    if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
        return FALSE;
    }
    if(p2 != 0 && p3 == 0 && p4 != 0) {
        return FALSE;
    }
    // Minimum & maximum lead bytes.
    if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
            s1 == Collation::LEVEL_SEPARATOR_BYTE ||
            t1 == Collation::LEVEL_SEPARATOR_BYTE || t1 > 0x3f) {
        return FALSE;
    }
    // The case value (after shifting) must not exceed 2.
    if(c > 2) {
        return FALSE;
    }
    // The valid byte range for the second primary byte depends on compressibility.
    if(p2 != 0) {
        if(data.isCompressibleLeadByte(p1)) {
            // Must lie strictly between the compression low and high bytes.
            if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE ||
                    Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
                return FALSE;
            }
        } else {
            if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) {
                return FALSE;
            }
        }
    }
    // Other bytes just need to avoid the level separator.
    // Trailing zeros are ok.
    U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1);
    if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR_BYTE ||
            s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPARATOR_BYTE) {
        return FALSE;
    }
    // Well-formed CEs.
    if(p == 0) {
        if(s == 0) {
            if(t == 0) {
                // Completely ignorable CE.
                // Quaternary CEs are not supported.
                if(c != 0 || q != 0) {
                    return FALSE;
                }
            } else {
                // Tertiary CE.
                // Its tertiary weight must be at or above the tertiary boundary,
                // and the case value must be 2.
                if(t < re.getTertiaryBoundary() || c != 2) {
                    return FALSE;
                }
            }
        } else {
            // Secondary CE.
            // Secondary at or above the secondary boundary;
            // tertiary non-zero and below the tertiary boundary.
            if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
                return FALSE;
            }
        }
    } else {
        // Primary CE.
        // The secondary must be non-zero, below the secondary boundary, and outside
        // the range above COMMON_WEIGHT16 up to and including the last common secondary.
        if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) ||
                s >= re.getSecondaryBoundary()) {
            return FALSE;
        }
        // The tertiary must be non-zero and below the tertiary boundary.
        if(t == 0 || t >= re.getTertiaryBoundary()) {
            return FALSE;
        }
    }
    return TRUE;
}
 680
 681 UBool isValidCE(const CollationRootElements &re, const CollationData &data, int64_t ce) {
 682     uint32_t p = (uint32_t)(ce >> 32);
 683     uint32_t secTer = (uint32_t)ce;
 684     return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff);
 685 }
 686
// Enumerates the collation elements encoded in the rootElements array of a
// CollationData object. Iteration starts at the array index stored in the
// IX_FIRST_TERTIARY_INDEX slot. After each successful next(), getPrimary() and
// getSecTer() return the parts of the current element.
class RootElementsIterator {
public:
    // The CollationData object is held by reference and must outlive the iterator.
    RootElementsIterator(const CollationData &root)
            : data(root),
              elements(root.rootElements), length(root.rootElementsLength),
              pri(0), secTer(0),
              index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX]) {}

    // Advances to the next element.
    // Returns FALSE at the end of the array or at the primary sentinel, otherwise TRUE.
    UBool next() {
        if(index >= length) { return FALSE; }
        uint32_t p = elements[index];
        if(p == CollationRootElements::PRIMARY_SENTINEL) { return FALSE; }
        if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
            // Sec/ter unit: keeps the current primary and replaces only secTer.
            ++index;
            secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG;
            return TRUE;
        }
        if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) {
            // End of a range, enumerate the primaries in the range.
            int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK;
            p &= 0xffffff00;
            if(pri == p) {
                // Finished the range, return the next CE after it.
                ++index;
                return next();
            }
            U_ASSERT(pri < p);
            // Return the next primary in this range.
            // index is not advanced here: the same range-end unit is
            // looked at again on the following call.
            UBool isCompressible = data.isCompressiblePrimary(pri);
            if((pri & 0xffff) == 0) {
                pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step);
            } else {
                pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible, step);
            }
            return TRUE;
        }
        // Simple primary CE.
        ++index;
        pri = p;
        // Does this have an explicit below-common sec/ter unit,
        // or does it imply a common one?
        if(index == length) {
            secTer = Collation::COMMON_SEC_AND_TER_CE;
        } else {
            // Peek at the following unit.
            secTer = elements[index];
            if((secTer & CollationRootElements::SEC_TER_DELTA_FLAG) == 0) {
                // No sec/ter delta.
                secTer = Collation::COMMON_SEC_AND_TER_CE;
            } else {
                secTer &= ~CollationRootElements::SEC_TER_DELTA_FLAG;
                if(secTer > Collation::COMMON_SEC_AND_TER_CE) {
                    // Implied sec/ter.
                    // The peeked unit is not consumed; a later next() call handles it.
                    secTer = Collation::COMMON_SEC_AND_TER_CE;
                } else {
                    // Explicit sec/ter below common/common.
                    ++index;
                }
            }
        }
        return TRUE;
    }

    // Primary weight of the current element.
    uint32_t getPrimary() const { return pri; }
    // Secondary and tertiary weights of the current element, in one 32-bit value.
    uint32_t getSecTer() const { return secTer; }

private:
    const CollationData &data;
    // The root elements array and its length, taken from data.
    const uint32_t *elements;
    int32_t length;

    // Current element.
    uint32_t pri;
    uint32_t secTer;
    // Index of the next array unit to be examined.
    int32_t index;
};
 761
 762 }  // namespace
 763
 764 void CollationTest::TestRootElements() {
 765     IcuTestErrorCode errorCode(*this, "TestRootElements");
 766     const CollationData *root = CollationRoot::getData(errorCode);
 767     if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
 768         return;
 769     }
 770     CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
 771     RootElementsIterator iter(*root);
 772
 773     // We check each root CE for validity,
 774     // and we also verify that there is a tailoring gap between each two CEs.
 775     CollationWeights cw1c;  // compressible primary weights
 776     CollationWeights cw1u;  // uncompressible primary weights
 777     CollationWeights cw2;
 778     CollationWeights cw3;
 779
 780     cw1c.initForPrimary(TRUE);
 781     cw1u.initForPrimary(FALSE);
 782     cw2.initForSecondary();
 783     cw3.initForTertiary();
 784
 785     // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
 786     // nor the special merge-separator CE for U+FFFE.
 787     uint32_t prevPri = 0;
 788     uint32_t prevSec = 0;
 789     uint32_t prevTer = 0;
 790     while(iter.next()) {
 791         uint32_t pri = iter.getPrimary();
 792         uint32_t secTer = iter.getSecTer();
 793         // CollationRootElements CEs must have 0 case and quaternary bits.
 794         if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) {
 795             errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx",
 796                   (long)pri, (long)secTer);
 797         }
 798         uint32_t sec = secTer >> 16;
 799         uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK;
 800         uint32_t ctq = ter;
 801         if(pri == 0 && sec == 0 && ter != 0) {
 802             // Tertiary CEs must have uppercase bits,
 803             // but they are not stored in the CollationRootElements.
 804             ctq |= 0x8000;
 805         }
 806         if(!isValidCE(rootElements, *root, pri, sec, ctq)) {
 807             errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer);
 808         } else {
 809             if(pri != prevPri) {
 810                 uint32_t newWeight = 0;
 811                 if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) {
 812                     // There is currently no tailoring gap after primary ignorables,
 813                     // and we forbid tailoring after U+FFFD and U+FFFF.
 814                 } else if(root->isCompressiblePrimary(prevPri)) {
 815                     if(!cw1c.allocWeights(prevPri, pri, 1)) {
 816                         errln("no primary/compressible tailoring gap between %08lx and %08lx",
 817                               (long)prevPri, (long)pri);
 818                     } else {
 819                         newWeight = cw1c.nextWeight();
 820                     }
 821                 } else {
 822                     if(!cw1u.allocWeights(prevPri, pri, 1)) {
 823                         errln("no primary/uncompressible tailoring gap between %08lx and %08lx",
 824                               (long)prevPri, (long)pri);
 825                     } else {
 826                         newWeight = cw1u.nextWeight();
 827                     }
 828                 }
 829                 if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
 830                     errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx",
 831                           (long)prevPri, (long)newWeight, (long)pri);
 832                 }
 833             } else if(sec != prevSec) {
 834                 uint32_t lowerLimit =
 835                     prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec;
 836                 if(!cw2.allocWeights(lowerLimit, sec, 1)) {
 837                     errln("no secondary tailoring gap between %04x and %04x", lowerLimit, sec);
 838                 } else {
 839                     uint32_t newWeight = cw2.nextWeight();
 840                     if(!(prevSec < newWeight && newWeight < sec)) {
 841                         errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
 842                               (long)lowerLimit, (long)newWeight, (long)sec);
 843                     }
 844                 }
 845             } else if(ter != prevTer) {
 846                 uint32_t lowerLimit =
 847                     prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer;
 848                 if(!cw3.allocWeights(lowerLimit, ter, 1)) {
 849                     errln("no teriary tailoring gap between %04x and %04x", lowerLimit, ter);
 850                 } else {
 851                     uint32_t newWeight = cw3.nextWeight();
 852                     if(!(prevTer < newWeight && newWeight < ter)) {
 853                         errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
 854                               (long)lowerLimit, (long)newWeight, (long)ter);
 855                     }
 856                 }
 857             } else {
 858                 errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer);
 859             }
 860         }
 861         prevPri = pri;
 862         prevSec = sec;
 863         prevTer = ter;
 864     }
 865 }
 866
 867 void CollationTest::TestTailoredElements() {
 868     IcuTestErrorCode errorCode(*this, "TestTailoredElements");
 869     const CollationData *root = CollationRoot::getData(errorCode);
 870     if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
 871         return;
 872     }
 873     CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
 874
 875     UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NULL, errorCode);
 876     if(errorCode.logIfFailureAndReset("failed to create a hash table")) {
 877         return;
 878     }
 879     uhash_setKeyDeleter(prevLocales, uprv_free);
 880     // TestRootElements() tests the root collator which does not have tailorings.
 881     uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode);
 882     uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode);
 883     uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode);
 884
 885     UVector64 ces(errorCode);
 886     LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales());
 887     U_ASSERT(locales.isValid());
 888     const char *localeID = "root";
 889     do {
 890         Locale locale(localeID);
 891         LocalPointer<StringEnumeration> types(
 892                 Collator::getKeywordValuesForLocale("collation", locale, FALSE, errorCode));
 893         errorCode.assertSuccess();
 894         const char *type;  // first: default type
 895         while((type = types->next(NULL, errorCode)) != NULL) {
 896             if(strncmp(type, "private-", 8) == 0) {
 897                 errln("Collator::getKeywordValuesForLocale(%s) returns private collation keyword: %s",
 898                         localeID, type);
 899             }
 900             Locale localeWithType(locale);
 901             localeWithType.setKeywordValue("collation", type, errorCode);
 902             errorCode.assertSuccess();
 903             LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode));
 904             if(errorCode.logIfFailureAndReset("Collator::createInstance(%s)",
 905                                               localeWithType.getName())) {
 906                 continue;
 907             }
 908             Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode);
 909             if(uhash_geti(prevLocales, actual.getName()) != 0) {
 910                 continue;
 911             }
 912             uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode);
 913             errorCode.assertSuccess();
 914             logln("TestTailoredElements(): requested %s -> actual %s",
 915                   localeWithType.getName(), actual.getName());
 916             RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getAlias());
 917             if(rbc == NULL) {
 918                 continue;
 919             }
 920             // Note: It would be better to get tailored strings such that we can
 921             // identify the prefix, and only get the CEs for the prefix+string,
 922             // not also for the prefix.
 923             // There is currently no API for that.
 924             // It would help in an unusual case where a contraction starting in the prefix
 925             // extends past its end, and we do not see the intended mapping.
 926             // For example, for a mapping p|st, if there is also a contraction ps,
 927             // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
 928             LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode));
 929             errorCode.assertSuccess();
 930             UnicodeSetIterator iter(*tailored);
 931             while(iter.next()) {
 932                 const UnicodeString &s = iter.getString();
 933                 ces.removeAllElements();
 934                 rbc->internalGetCEs(s, ces, errorCode);
 935                 errorCode.assertSuccess();
 936                 for(int32_t i = 0; i < ces.size(); ++i) {
 937                     int64_t ce = ces.elementAti(i);
 938                     if(!isValidCE(rootElements, *root, ce)) {
 939                         errln("invalid tailored CE %016llx at CE index %d from string:",
 940                               (long long)ce, (int)i);
 941                         infoln(prettify(s));
 942                     }
 943                 }
 944             }
 945         }
 946     } while((localeID = locales->next(NULL, errorCode)) != NULL);
 947     uhash_close(prevLocales);
 948 }
 949
 950 UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) {
 951     UnicodeString s;
 952     for(int32_t i = 0; i < length; ++i) {
 953         if(i > 0) { s.append((UChar)0x20); }
 954         uint8_t b = p[i];
 955         if(b == 0) {
 956             s.append((UChar)0x2e);  // period
 957         } else if(b == 1) {
 958             s.append((UChar)0x7c);  // vertical bar
 959         } else {
 960             appendHex(b, 2, s);
 961         }
 962     }
 963     return s;
 964 }
 965
 966 UnicodeString CollationTest::printCollationKey(const CollationKey &key) {
 967     int32_t length;
 968     const uint8_t *p = key.getByteArray(length);
 969     return printSortKey(p, length);
 970 }
 971
 972 UBool CollationTest::readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode) {
 973     for(;;) {
 974         int32_t lineLength;
 975         const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
 976         if(line == NULL || errorCode.isFailure()) {
 977             fileLine.remove();
 978             return FALSE;
 979         }
 980         ++fileLineNumber;
 981         // Strip trailing CR/LF, comments, and spaces.
 982         const UChar *comment = u_memchr(line, 0x23, lineLength);  // '#'
 983         if(comment != NULL) {
 984             lineLength = (int32_t)(comment - line);
 985         } else {
 986             while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; }
 987         }
 988         while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; }
 989         if(lineLength != 0) {
 990             fileLine.setTo(FALSE, line, lineLength);
 991             return TRUE;
 992         }
 993         // Empty line, continue.
 994     }
 995 }
 996
 997 void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s,
 998                                 UErrorCode &errorCode) {
 999     int32_t length = fileLine.length();
1000     int32_t i;
1001     for(i = start; i < length && !isSpace(fileLine[i]); ++i) {}
1002     int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start);  // '|'
1003     if(pipeIndex >= 0) {
1004         prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape();
1005         if(prefix.isEmpty()) {
1006             errln("empty prefix on line %d", (int)fileLineNumber);
1007             infoln(fileLine);
1008             errorCode = U_PARSE_ERROR;
1009             return;
1010         }
1011         start = pipeIndex + 1;
1012     } else {
1013         prefix.remove();
1014     }
1015     s = fileLine.tempSubStringBetween(start, i).unescape();
1016     if(s.isEmpty()) {
1017         errln("empty string on line %d", (int)fileLineNumber);
1018         infoln(fileLine);
1019         errorCode = U_PARSE_ERROR;
1020         return;
1021     }
1022     start = i;
1023 }
1024
1025 Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode) {
1026     Collation::Level relation;
1027     int32_t start;
1028     if(fileLine[0] == 0x3c) {  // <
1029         UChar second = fileLine[1];
1030         start = 2;
1031         switch(second) {
1032         case 0x31:  // <1
1033             relation = Collation::PRIMARY_LEVEL;
1034             break;
1035         case 0x32:  // <2
1036             relation = Collation::SECONDARY_LEVEL;
1037             break;
1038         case 0x33:  // <3
1039             relation = Collation::TERTIARY_LEVEL;
1040             break;
1041         case 0x34:  // <4
1042             relation = Collation::QUATERNARY_LEVEL;
1043             break;
1044         case 0x63:  // <c
1045             relation = Collation::CASE_LEVEL;
1046             break;
1047         case 0x69:  // <i
1048             relation = Collation::IDENTICAL_LEVEL;
1049             break;
1050         default:  // just <
1051             relation = Collation::NO_LEVEL;
1052             start = 1;
1053             break;
1054         }
1055     } else if(fileLine[0] == 0x3d) {  // =
1056         relation = Collation::ZERO_LEVEL;
1057         start = 1;
1058     } else {
1059         start = 0;
1060     }
1061     if(start == 0 || !isSpace(fileLine[start])) {
1062         errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber);
1063         infoln(fileLine);
1064         errorCode.set(U_PARSE_ERROR);
1065         return Collation::NO_LEVEL;
1066     }
1067     start = skipSpaces(start);
1068     UnicodeString prefix;
1069     parseString(start, prefix, s, errorCode);
1070     if(errorCode.isSuccess() && !prefix.isEmpty()) {
1071         errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber);
1072         infoln(fileLine);
1073         errorCode.set(U_PARSE_ERROR);
1074         return Collation::NO_LEVEL;
1075     }
1076     if(start < fileLine.length()) {
1077         errln("unexpected line contents after test string on line %d", (int)fileLineNumber);
1078         infoln(fileLine);
1079         errorCode.set(U_PARSE_ERROR);
1080         return Collation::NO_LEVEL;
1081     }
1082     return relation;
1083 }
1084
1085 static const struct {
1086     const char *name;
1087     UColAttribute attr;
1088 } attributes[] = {
1089     { "backwards", UCOL_FRENCH_COLLATION },
1090     { "alternate", UCOL_ALTERNATE_HANDLING },
1091     { "caseFirst", UCOL_CASE_FIRST },
1092     { "caseLevel", UCOL_CASE_LEVEL },
1093     // UCOL_NORMALIZATION_MODE is turned on and off automatically.
1094     { "strength", UCOL_STRENGTH },
1095     // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated.
1096     { "numeric", UCOL_NUMERIC_COLLATION }
1097 };
1098
1099 static const struct {
1100     const char *name;
1101     UColAttributeValue value;
1102 } attributeValues[] = {
1103     { "default", UCOL_DEFAULT },
1104     { "primary", UCOL_PRIMARY },
1105     { "secondary", UCOL_SECONDARY },
1106     { "tertiary", UCOL_TERTIARY },
1107     { "quaternary", UCOL_QUATERNARY },
1108     { "identical", UCOL_IDENTICAL },
1109     { "off", UCOL_OFF },
1110     { "on", UCOL_ON },
1111     { "shifted", UCOL_SHIFTED },
1112     { "non-ignorable", UCOL_NON_IGNORABLE },
1113     { "lower", UCOL_LOWER_FIRST },
1114     { "upper", UCOL_UPPER_FIRST }
1115 };
1116
1117 void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) {
1118     // Parse attributes even if the Collator could not be created,
1119     // in order to report syntax errors.
1120     int32_t start = skipSpaces(1);
1121     int32_t equalPos = fileLine.indexOf(0x3d);
1122     if(equalPos < 0) {
1123         if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) {
1124             parseAndSetReorderCodes(start + 7, errorCode);
1125             return;
1126         }
1127         errln("missing '=' on line %d", (int)fileLineNumber);
1128         infoln(fileLine);
1129         errorCode.set(U_PARSE_ERROR);
1130         return;
1131     }
1132
1133     UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos);
1134     UnicodeString valueString = fileLine.tempSubString(equalPos+1);
1135     if(attrString == UNICODE_STRING("maxVariable", 11)) {
1136         UColReorderCode max;
1137         if(valueString == UNICODE_STRING("space", 5)) {
1138             max = UCOL_REORDER_CODE_SPACE;
1139         } else if(valueString == UNICODE_STRING("punct", 5)) {
1140             max = UCOL_REORDER_CODE_PUNCTUATION;
1141         } else if(valueString == UNICODE_STRING("symbol", 6)) {
1142             max = UCOL_REORDER_CODE_SYMBOL;
1143         } else if(valueString == UNICODE_STRING("currency", 8)) {
1144             max = UCOL_REORDER_CODE_CURRENCY;
1145         } else {
1146             errln("invalid attribute value name on line %d", (int)fileLineNumber);
1147             infoln(fileLine);
1148             errorCode.set(U_PARSE_ERROR);
1149             return;
1150         }
1151         if(coll != NULL) {
1152             coll->setMaxVariable(max, errorCode);
1153             if(errorCode.isFailure()) {
1154                 errln("setMaxVariable() failed on line %d: %s",
1155                       (int)fileLineNumber, errorCode.errorName());
1156                 infoln(fileLine);
1157                 return;
1158             }
1159         }
1160         fileLine.remove();
1161         return;
1162     }
1163
1164     UColAttribute attr;
1165     for(int32_t i = 0;; ++i) {
1166         if(i == UPRV_LENGTHOF(attributes)) {
1167             errln("invalid attribute name on line %d", (int)fileLineNumber);
1168             infoln(fileLine);
1169             errorCode.set(U_PARSE_ERROR);
1170             return;
1171         }
1172         if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) {
1173             attr = attributes[i].attr;
1174             break;
1175         }
1176     }
1177
1178     UColAttributeValue value;
1179     for(int32_t i = 0;; ++i) {
1180         if(i == UPRV_LENGTHOF(attributeValues)) {
1181             errln("invalid attribute value name on line %d", (int)fileLineNumber);
1182             infoln(fileLine);
1183             errorCode.set(U_PARSE_ERROR);
1184             return;
1185         }
1186         if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) {
1187             value = attributeValues[i].value;
1188             break;
1189         }
1190     }
1191
1192     if(coll != NULL) {
1193         coll->setAttribute(attr, value, errorCode);
1194         if(errorCode.isFailure()) {
1195             errln("illegal attribute=value combination on line %d: %s",
1196                   (int)fileLineNumber, errorCode.errorName());
1197             infoln(fileLine);
1198             return;
1199         }
1200     }
1201     fileLine.remove();
1202 }
1203
1204 void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode) {
1205     UVector32 reorderCodes(errorCode);
1206     while(start < fileLine.length()) {
1207         start = skipSpaces(start);
1208         int32_t limit = start;
1209         while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; }
1210         CharString name;
1211         name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), errorCode);
1212         int32_t code = CollationRuleParser::getReorderCode(name.data());
1213         if(code < 0) {
1214             if(uprv_stricmp(name.data(), "default") == 0) {
1215                 code = UCOL_REORDER_CODE_DEFAULT;  // -1
1216             } else {
1217                 errln("invalid reorder code '%s' on line %d", name.data(), (int)fileLineNumber);
1218                 infoln(fileLine);
1219                 errorCode.set(U_PARSE_ERROR);
1220                 return;
1221             }
1222         }
1223         reorderCodes.addElement(code, errorCode);
1224         start = limit;
1225     }
1226     if(coll != NULL) {
1227         coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
1228         if(errorCode.isFailure()) {
1229             errln("setReorderCodes() failed on line %d: %s",
1230                   (int)fileLineNumber, errorCode.errorName());
1231             infoln(fileLine);
1232             return;
1233         }
1234     }
1235     fileLine.remove();
1236 }
1237
1238 void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1239     UnicodeString rules;
1240     while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1241         rules.append(fileLine.unescape());
1242     }
1243     if(errorCode.isFailure()) { return; }
1244     logln(rules);
1245
1246     UParseError parseError;
1247     UnicodeString reason;
1248     delete coll;
1249     coll = new RuleBasedCollator(rules, parseError, reason, errorCode);
1250     if(coll == NULL) {
1251         errln("unable to allocate a new collator");
1252         errorCode.set(U_MEMORY_ALLOCATION_ERROR);
1253         return;
1254     }
1255     if(errorCode.isFailure()) {
1256         dataerrln("RuleBasedCollator(rules) failed - %s", errorCode.errorName());
1257         infoln(UnicodeString("  reason: ") + reason);
1258         if(parseError.offset >= 0) { infoln("  rules offset: %d", (int)parseError.offset); }
1259         if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1260             infoln(UnicodeString("  snippet: ...") +
1261                 parseError.preContext + "(!)" + parseError.postContext + "...");
1262         }
1263         delete coll;
1264         coll = NULL;
1265         errorCode.reset();
1266     } else {
1267         assertEquals("no error reason when RuleBasedCollator(rules) succeeds",
1268                      UnicodeString(), reason);
1269     }
1270 }
1271
1272 void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) {
1273     if(errorCode.isFailure()) { return; }
1274     delete coll;
1275     coll = Collator::createInstance(Locale::getRoot(), errorCode);
1276     if(errorCode.isFailure()) {
1277         dataerrln("unable to create a root collator");
1278         return;
1279     }
1280 }
1281
1282 void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) {
1283     if(errorCode.isFailure()) { return; }
1284     delete coll;
1285     coll = NULL;
1286     int32_t at = fileLine.indexOf((UChar)0x40, 9);  // @ is not invariant
1287     if(at >= 0) {
1288         fileLine.setCharAt(at, (UChar)0x2a);  // *
1289     }
1290     CharString localeID;
1291     localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode);
1292     if(at >= 0) {
1293         localeID.data()[at - 9] = '@';
1294     }
1295     Locale locale(localeID.data());
1296     if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) {
1297         errln("invalid language tag on line %d", (int)fileLineNumber);
1298         infoln(fileLine);
1299         if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); }
1300         return;
1301     }
1302
1303     logln("creating a collator for locale ID %s", locale.getName());
1304     coll = Collator::createInstance(locale, errorCode);
1305     if(errorCode.isFailure()) {
1306         dataerrln("unable to create a collator for locale %s on line %d",
1307                   locale.getName(), (int)fileLineNumber);
1308         infoln(fileLine);
1309         delete coll;
1310         coll = NULL;
1311         errorCode.reset();
1312     }
1313 }
1314
1315 UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const {
1316     if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return TRUE; }
1317     // In some sequences with Tibetan composite vowel signs,
1318     // even if the string passes the FCD check,
1319     // those composites must be decomposed.
1320     // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1321     int32_t index = 0;
1322     while((index = s.indexOf((UChar)0xf71, index)) >= 0) {
1323         if(++index < s.length()) {
1324             UChar c = s[index];
1325             if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return TRUE; }
1326         }
1327     }
1328     return FALSE;
1329 }
1330
1331 UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length,
1332                                      CharString &dest, int32_t partSize,
1333                                      IcuTestErrorCode &errorCode) {
1334     if(errorCode.isFailure()) { return FALSE; }
1335     uint8_t part[32];
1336     U_ASSERT(partSize <= UPRV_LENGTHOF(part));
1337     UCharIterator iter;
1338     uiter_setString(&iter, s, length);
1339     uint32_t state[2] = { 0, 0 };
1340     for(;;) {
1341         int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, partSize, errorCode);
1342         UBool done = partLength < partSize;
1343         if(done) {
1344             // At the end, append the next byte as well which should be 00.
1345             ++partLength;
1346         }
1347         dest.append(reinterpret_cast<char *>(part), partLength, errorCode);
1348         if(done) {
1349             return errorCode.isSuccess();
1350         }
1351     }
1352 }
1353
1354 UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line,
1355                                      const UChar *s, int32_t length,
1356                                      CollationKey &key, IcuTestErrorCode &errorCode) {
1357     if(errorCode.isFailure()) { return FALSE; }
1358     coll->getCollationKey(s, length, key, errorCode);
1359     if(errorCode.isFailure()) {
1360         infoln(fileTestName);
1361         errln("Collator(%s).getCollationKey() failed: %s",
1362               norm, errorCode.errorName());
1363         infoln(line);
1364         return FALSE;
1365     }
1366     int32_t keyLength;
1367     const uint8_t *keyBytes = key.getByteArray(keyLength);
1368     if(keyLength == 0 || keyBytes[keyLength - 1] != 0) {
1369         infoln(fileTestName);
1370         errln("Collator(%s).getCollationKey() wrote an empty or unterminated key",
1371               norm);
1372         infoln(line);
1373         infoln(printCollationKey(key));
1374         return FALSE;
1375     }
1376
1377     int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode);
1378     if(numLevels < UCOL_IDENTICAL) {
1379         ++numLevels;
1380     } else {
1381         numLevels = 5;
1382     }
1383     if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) {
1384         ++numLevels;
1385     }
1386     errorCode.assertSuccess();
1387     int32_t numLevelSeparators = 0;
1388     for(int32_t i = 0; i < (keyLength - 1); ++i) {
1389         uint8_t b = keyBytes[i];
1390         if(b == 0) {
1391             infoln(fileTestName);
1392             errln("Collator(%s).getCollationKey() contains a 00 byte", norm);
1393             infoln(line);
1394             infoln(printCollationKey(key));
1395             return FALSE;
1396         }
1397         if(b == 1) { ++numLevelSeparators; }
1398     }
1399     if(numLevelSeparators != (numLevels - 1)) {
1400         infoln(fileTestName);
1401         errln("Collator(%s).getCollationKey() has %d level separators for %d levels",
1402               norm, (int)numLevelSeparators, (int)numLevels);
1403         infoln(line);
1404         infoln(printCollationKey(key));
1405         return FALSE;
1406     }
1407
1408     // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
1409     static const int32_t partSizes[] = { 32, 3, 1 };
1410     for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) {
1411         int32_t partSize = partSizes[psi];
1412         CharString parts;
1413         if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
1414             infoln(fileTestName);
1415             errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
1416                   norm, (int)partSize, errorCode.errorName());
1417             infoln(line);
1418             return FALSE;
1419         }
1420         if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
1421             infoln(fileTestName);
1422             errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
1423                   norm, (int)partSize);
1424             infoln(line);
1425             infoln(printCollationKey(key));
1426             infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
1427             return FALSE;
1428         }
1429     }
1430     return TRUE;
1431 }
1432
1433 /**
1434  * Changes the key to the merged segments of the U+FFFE-separated substrings of s.
1435  * Leaves key unchanged if s does not contain U+FFFE.
1436  * @return TRUE if the key was successfully changed
1437  */
1438 UBool CollationTest::getMergedCollationKey(const UChar *s, int32_t length,
1439                                            CollationKey &key, IcuTestErrorCode &errorCode) {
1440     if(errorCode.isFailure()) { return FALSE; }
1441     LocalMemory<uint8_t> mergedKey;
1442     int32_t mergedKeyLength = 0;
1443     int32_t mergedKeyCapacity = 0;
1444     int32_t sLength = (length >= 0) ? length : u_strlen(s);
1445     int32_t segmentStart = 0;
1446     for(int32_t i = 0;;) {
1447         if(i == sLength) {
1448             if(segmentStart == 0) {
1449                 // s does not contain any U+FFFE.
1450                 return FALSE;
1451             }
1452         } else if(s[i] != 0xfffe) {
1453             ++i;
1454             continue;
1455         }
1456         // Get the sort key for another segment and merge it into mergedKey.
1457         CollationKey key1(mergedKey.getAlias(), mergedKeyLength);  // copies the bytes
1458         CollationKey key2;
1459         coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCode);
1460         int32_t key1Length, key2Length;
1461         const uint8_t *key1Bytes = key1.getByteArray(key1Length);
1462         const uint8_t *key2Bytes = key2.getByteArray(key2Length);
1463         uint8_t *dest;
1464         int32_t minCapacity = key1Length + key2Length;
1465         if(key1Length > 0) { --minCapacity; }
1466         if(minCapacity <= mergedKeyCapacity) {
1467             dest = mergedKey.getAlias();
1468         } else {
1469             if(minCapacity <= 200) {
1470                 mergedKeyCapacity = 200;
1471             } else if(minCapacity <= 2 * mergedKeyCapacity) {
1472                 mergedKeyCapacity *= 2;
1473             } else {
1474                 mergedKeyCapacity = minCapacity;
1475             }
1476             dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity);
1477         }
1478         U_ASSERT(dest != NULL || mergedKeyCapacity == 0);
1479         if(key1Length == 0) {
1480             // key2 is the sort key for the first segment.
1481             uprv_memcpy(dest, key2Bytes, key2Length);
1482             mergedKeyLength = key2Length;
1483         } else {
1484             mergedKeyLength =
1485                 ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length,
1486                                    dest, mergedKeyCapacity);
1487         }
1488         if(i == sLength) { break; }
1489         segmentStart = ++i;
1490     }
1491     key = CollationKey(mergedKey.getAlias(), mergedKeyLength);
1492     return TRUE;
1493 }
1494
1495 namespace {
1496
1497 /**
1498  * Replaces unpaired surrogates with U+FFFD.
1499  * Returns s if no replacement was made, otherwise buffer.
1500  */
1501 const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buffer) {
1502     int32_t i = 0;
1503     while(i < s.length()) {
1504         UChar32 c = s.char32At(i);
1505         if(U_IS_SURROGATE(c)) {
1506             if(buffer.length() < i) {
1507                 buffer.append(s, buffer.length(), i - buffer.length());
1508             }
1509             buffer.append((UChar)0xfffd);
1510         }
1511         i += U16_LENGTH(c);
1512     }
1513     if(buffer.isEmpty()) {
1514         return s;
1515     }
1516     if(buffer.length() < i) {
1517         buffer.append(s, buffer.length(), i - buffer.length());
1518     }
1519     return buffer;
1520 }
1521
1522 int32_t getDifferenceLevel(const CollationKey &prevKey, const CollationKey &key,
1523                            UCollationResult order, UBool collHasCaseLevel) {
1524     if(order == UCOL_EQUAL) {
1525         return Collation::NO_LEVEL;
1526     }
1527     int32_t prevKeyLength;
1528     const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
1529     int32_t keyLength;
1530     const uint8_t *bytes = key.getByteArray(keyLength);
1531     int32_t level = Collation::PRIMARY_LEVEL;
1532     for(int32_t i = 0;; ++i) {
1533         uint8_t b = prevBytes[i];
1534         if(b != bytes[i]) { break; }
1535         if(b == Collation::LEVEL_SEPARATOR_BYTE) {
1536             ++level;
1537             if(level == Collation::CASE_LEVEL && !collHasCaseLevel) {
1538                 ++level;
1539             }
1540         }
1541     }
1542     return level;
1543 }
1544
1545 }
1546
1547 UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
1548                                      const UnicodeString &prevString, const UnicodeString &s,
1549                                      UCollationResult expectedOrder, Collation::Level expectedLevel,
1550                                      IcuTestErrorCode &errorCode) {
1551     if(errorCode.isFailure()) { return FALSE; }
1552
1553     // Get the sort keys first, for error debug output.
1554     CollationKey prevKey;
1555     if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.length(),
1556                         prevKey, errorCode)) {
1557         return FALSE;
1558     }
1559     CollationKey key;
1560     if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCode)) { return FALSE; }
1561
1562     UCollationResult order = coll->compare(prevString, s, errorCode);
1563     if(order != expectedOrder || errorCode.isFailure()) {
1564         infoln(fileTestName);
1565         errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)",
1566               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1567         infoln(prevFileLine);
1568         infoln(fileLine);
1569         infoln(printCollationKey(prevKey));
1570         infoln(printCollationKey(key));
1571         return FALSE;
1572     }
1573     order = coll->compare(s, prevString, errorCode);
1574     if(order != -expectedOrder || errorCode.isFailure()) {
1575         infoln(fileTestName);
1576         errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)",
1577               (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1578         infoln(prevFileLine);
1579         infoln(fileLine);
1580         infoln(printCollationKey(prevKey));
1581         infoln(printCollationKey(key));
1582         return FALSE;
1583     }
1584     // Test NUL-termination if the strings do not contain NUL characters.
1585     UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0) >= 0;
1586     if(!containNUL) {
1587         order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, errorCode);
1588         if(order != expectedOrder || errorCode.isFailure()) {
1589             infoln(fileTestName);
1590             errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1591                   (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1592             infoln(prevFileLine);
1593             infoln(fileLine);
1594             infoln(printCollationKey(prevKey));
1595             infoln(printCollationKey(key));
1596             return FALSE;
1597         }
1598         order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, errorCode);
1599         if(order != -expectedOrder || errorCode.isFailure()) {
1600             infoln(fileTestName);
1601             errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1602                   (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1603             infoln(prevFileLine);
1604             infoln(fileLine);
1605             infoln(printCollationKey(prevKey));
1606             infoln(printCollationKey(key));
1607             return FALSE;
1608         }
1609     }
1610
1611 #if U_HAVE_STD_STRING
1612     // compare(UTF-16) treats unpaired surrogates like unassigned code points.
1613     // Unpaired surrogates cannot be converted to UTF-8.
1614     // Create valid UTF-16 strings if necessary, and use those for
1615     // both the expected compare() result and for the input to compare(UTF-8).
1616     UnicodeString prevBuffer, sBuffer;
1617     const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer);
1618     const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer);
1619     std::string prevUTF8, sUTF8;
1620     UnicodeString(prevValid).toUTF8String(prevUTF8);
1621     UnicodeString(sValid).toUTF8String(sUTF8);
1622     UCollationResult expectedUTF8Order;
1623     if(&prevValid == &prevString && &sValid == &s) {
1624         expectedUTF8Order = expectedOrder;
1625     } else {
1626         expectedUTF8Order = coll->compare(prevValid, sValid, errorCode);
1627     }
1628
1629     order = coll->compareUTF8(prevUTF8, sUTF8, errorCode);
1630     if(order != expectedUTF8Order || errorCode.isFailure()) {
1631         infoln(fileTestName);
1632         errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)",
1633               (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1634         infoln(prevFileLine);
1635         infoln(fileLine);
1636         infoln(printCollationKey(prevKey));
1637         infoln(printCollationKey(key));
1638         return FALSE;
1639     }
1640     order = coll->compareUTF8(sUTF8, prevUTF8, errorCode);
1641     if(order != -expectedUTF8Order || errorCode.isFailure()) {
1642         infoln(fileTestName);
1643         errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)",
1644               (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1645         infoln(prevFileLine);
1646         infoln(fileLine);
1647         infoln(printCollationKey(prevKey));
1648         infoln(printCollationKey(key));
1649         return FALSE;
1650     }
1651     // Test NUL-termination if the strings do not contain NUL characters.
1652     if(!containNUL) {
1653         order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -1, errorCode);
1654         if(order != expectedUTF8Order || errorCode.isFailure()) {
1655             infoln(fileTestName);
1656             errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1657                   (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1658             infoln(prevFileLine);
1659             infoln(fileLine);
1660             infoln(printCollationKey(prevKey));
1661             infoln(printCollationKey(key));
1662             return FALSE;
1663         }
1664         order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -1, errorCode);
1665         if(order != -expectedUTF8Order || errorCode.isFailure()) {
1666             infoln(fileTestName);
1667             errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1668                   (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1669             infoln(prevFileLine);
1670             infoln(fileLine);
1671             infoln(printCollationKey(prevKey));
1672             infoln(printCollationKey(key));
1673             return FALSE;
1674         }
1675     }
1676 #endif
1677
1678     UCharIterator leftIter;
1679     UCharIterator rightIter;
1680     uiter_setString(&leftIter, prevString.getBuffer(), prevString.length());
1681     uiter_setString(&rightIter, s.getBuffer(), s.length());
1682     order = coll->compare(leftIter, rightIter, errorCode);
1683     if(order != expectedOrder || errorCode.isFailure()) {
1684         infoln(fileTestName);
1685         errln("line %d Collator(%s).compare(UCharIterator: previous, current) "
1686               "wrong order: %d != %d (%s)",
1687               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1688         infoln(prevFileLine);
1689         infoln(fileLine);
1690         infoln(printCollationKey(prevKey));
1691         infoln(printCollationKey(key));
1692         return FALSE;
1693     }
1694
1695     order = prevKey.compareTo(key, errorCode);
1696     if(order != expectedOrder || errorCode.isFailure()) {
1697         infoln(fileTestName);
1698         errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)",
1699               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1700         infoln(prevFileLine);
1701         infoln(fileLine);
1702         infoln(printCollationKey(prevKey));
1703         infoln(printCollationKey(key));
1704         return FALSE;
1705     }
1706     UBool collHasCaseLevel = coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON;
1707     int32_t level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1708     if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1709         if(level != expectedLevel) {
1710             infoln(fileTestName);
1711             errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
1712                   (int)fileLineNumber, norm, order, level, expectedLevel);
1713             infoln(prevFileLine);
1714             infoln(fileLine);
1715             infoln(printCollationKey(prevKey));
1716             infoln(printCollationKey(key));
1717             return FALSE;
1718         }
1719     }
1720
1721     // If either string contains U+FFFE, then their sort keys must compare the same as
1722     // the merged sort keys of each string's between-FFFE segments.
1723     //
1724     // It is not required that
1725     //   sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2))
1726     // only that those two methods yield the same order.
1727     //
1728     // Use bit-wise OR so that getMergedCollationKey() is always called for both strings.
1729     if((getMergedCollationKey(prevString.getBuffer(), prevString.length(), prevKey, errorCode) |
1730                 getMergedCollationKey(s.getBuffer(), s.length(), key, errorCode)) ||
1731             errorCode.isFailure()) {
1732         order = prevKey.compareTo(key, errorCode);
1733         if(order != expectedOrder || errorCode.isFailure()) {
1734             infoln(fileTestName);
1735             errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1736                 "(previous, current segments between U+FFFE)).compareTo() wrong order: %d != %d (%s)",
1737                 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1738             infoln(prevFileLine);
1739             infoln(fileLine);
1740             infoln(printCollationKey(prevKey));
1741             infoln(printCollationKey(key));
1742             return FALSE;
1743         }
1744         int32_t mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1745         if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1746             if(mergedLevel != level) {
1747                 infoln(fileTestName);
1748                 errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1749                     "(previous, current segments between U+FFFE)).compareTo()=%d wrong level: %d != %d",
1750                     (int)fileLineNumber, norm, order, mergedLevel, level);
1751                 infoln(prevFileLine);
1752                 infoln(fileLine);
1753                 infoln(printCollationKey(prevKey));
1754                 infoln(printCollationKey(key));
1755                 return FALSE;
1756             }
1757         }
1758     }
1759     return TRUE;
1760 }
1761
1762 void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1763     if(errorCode.isFailure()) { return; }
1764     UnicodeString prevFileLine = UNICODE_STRING("(none)", 6);
1765     UnicodeString prevString, s;
1766     prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1767     while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1768         // Parse the line even if it will be ignored (when we do not have a Collator)
1769         // in order to report syntax issues.
1770         Collation::Level relation = parseRelationAndString(s, errorCode);
1771         if(errorCode.isFailure()) {
1772             errorCode.reset();
1773             break;
1774         }
1775         if(coll == NULL) {
1776             // We were unable to create the Collator but continue with tests.
1777             // Ignore test data for this Collator.
1778             // The next Collator creation might work.
1779             continue;
1780         }
1781         UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? UCOL_EQUAL : UCOL_LESS;
1782         Collation::Level expectedLevel = relation;
1783         s.getTerminatedBuffer();  // Ensure NUL-termination.
1784         UBool isOk = TRUE;
1785         if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) {
1786             coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode);
1787             isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
1788                                    expectedOrder, expectedLevel, errorCode);
1789         }
1790         if(isOk) {
1791             coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode);
1792             isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
1793                                    expectedOrder, expectedLevel, errorCode);
1794         }
1795         if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormalized(s, errorCode))) {
1796             UnicodeString pn = nfd->normalize(prevString, errorCode);
1797             UnicodeString n = nfd->normalize(s, errorCode);
1798             pn.getTerminatedBuffer();
1799             n.getTerminatedBuffer();
1800             errorCode.assertSuccess();
1801             isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
1802                                    expectedOrder, expectedLevel, errorCode);
1803         }
1804         if(!isOk) {
1805             errorCode.reset();  // already reported
1806         }
1807         prevFileLine = fileLine;
1808         prevString = s;
1809         prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1810     }
1811 }
1812
1813 void CollationTest::TestDataDriven() {
1814     IcuTestErrorCode errorCode(*this, "TestDataDriven");
1815
1816     fcd = Normalizer2Factory::getFCDInstance(errorCode);
1817     nfd = Normalizer2::getNFDInstance(errorCode);
1818     if(errorCode.logDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) {
1819         return;
1820     }
1821
1822     CharString path(getSourceTestData(errorCode), errorCode);
1823     path.appendPathPart("collationtest.txt", errorCode);
1824     const char *codePage = "UTF-8";
1825     LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, errorCode));
1826     if(errorCode.logIfFailureAndReset("ucbuf_open(collationtest.txt)")) {
1827         return;
1828     }
1829     // Read a new line if necessary.
1830     // Sub-parsers leave the first line set that they do not handle.
1831     while(errorCode.isSuccess() && (!fileLine.isEmpty() || readNonEmptyLine(f.getAlias(), errorCode))) {
1832         if(!isSectionStarter(fileLine[0])) {
1833             errln("syntax error on line %d", (int)fileLineNumber);
1834             infoln(fileLine);
1835             return;
1836         }
1837         if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) {
1838             fileTestName = fileLine;
1839             logln(fileLine);
1840             fileLine.remove();
1841         } else if(fileLine == UNICODE_STRING("@ root", 6)) {
1842             setRootCollator(errorCode);
1843             fileLine.remove();
1844         } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) {
1845             setLocaleCollator(errorCode);
1846             fileLine.remove();
1847         } else if(fileLine == UNICODE_STRING("@ rules", 7)) {
1848             buildTailoring(f.getAlias(), errorCode);
1849         } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) {  // %
1850             parseAndSetAttribute(errorCode);
1851         } else if(fileLine == UNICODE_STRING("* compare", 9)) {
1852             checkCompareStrings(f.getAlias(), errorCode);
1853         } else {
1854             errln("syntax error on line %d", (int)fileLineNumber);
1855             infoln(fileLine);
1856             return;
1857         }
1858     }
1859 }
1860
1861 #endif  // !UCONFIG_NO_COLLATION