icuSources/test/intltest/collationtest.cpp

   1 /*
   2 *******************************************************************************
   3 * Copyright (C) 2012-2014, International Business Machines
   4 * Corporation and others.  All Rights Reserved.
   5 *******************************************************************************
   6 * collationtest.cpp
   7 *
   8 * created on: 2012apr27
   9 * created by: Markus W. Scherer
  10 */
  11
  12 #include "unicode/utypes.h"
  13
  14 #if !UCONFIG_NO_COLLATION
  15
  16 #include "unicode/coll.h"
  17 #include "unicode/errorcode.h"
  18 #include "unicode/localpointer.h"
  19 #include "unicode/normalizer2.h"
  20 #include "unicode/sortkey.h"
  21 #include "unicode/std_string.h"
  22 #include "unicode/strenum.h"
  23 #include "unicode/tblcoll.h"
  24 #include "unicode/uiter.h"
  25 #include "unicode/uniset.h"
  26 #include "unicode/unistr.h"
  27 #include "unicode/usetiter.h"
  28 #include "unicode/ustring.h"
  29 #include "charstr.h"
  30 #include "cmemory.h"
  31 #include "collation.h"
  32 #include "collationdata.h"
  33 #include "collationfcd.h"
  34 #include "collationiterator.h"
  35 #include "collationroot.h"
  36 #include "collationrootelements.h"
  37 #include "collationruleparser.h"
  38 #include "collationweights.h"
  39 #include "cstring.h"
  40 #include "intltest.h"
  41 #include "normalizer2impl.h"
  42 #include "ucbuf.h"
  43 #include "uhash.h"
  44 #include "uitercollationiterator.h"
  45 #include "utf16collationiterator.h"
  46 #include "utf8collationiterator.h"
  47 #include "uvectr32.h"
  48 #include "uvectr64.h"
  49 #include "writesrc.h"
  50
  // Element count of a true C array, as an int32_t.
  // The argument must be an array (not a pointer): the count comes from sizeof.
  #define LENGTHOF(array) ((int32_t)(sizeof(array) / sizeof((array)[0])))
  52
  53 // TODO: Move to ucbuf.h
     // Declares LocalUCHARBUFPointer for UCHARBUF handles, naming ucbuf_close as the
     // close function. See U_DEFINE_LOCAL_OPEN_POINTER (presumably from
     // unicode/localpointer.h) for the class this macro generates.
  54 U_DEFINE_LOCAL_OPEN_POINTER(LocalUCHARBUFPointer, UCHARBUF, ucbuf_close);
  55
     // Forward declaration: CollationTest::checkFCD() takes a CodePointIterator by
     // reference, but the class itself is defined further down in this file.
  56 class CodePointIterator;
  57
  58 // TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey)
  59
  60 class CollationTest : public IntlTest {
  61 public:
  62     CollationTest()
  63             : fcd(NULL), nfd(NULL),
  64               fileLineNumber(0),
  65               coll(NULL) {}
  66
  67     ~CollationTest() {
  68         delete coll;
  69     }
  70
  71     void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
  72
  73     void TestMinMax();
  74     void TestImplicits();
  75     void TestNulTerminated();
  76     void TestIllegalUTF8();
  77     void TestShortFCDData();
  78     void TestFCD();
  79     void TestCollationWeights();
  80     void TestRootElements();
  81     void TestTailoredElements();
  82     void TestDataDriven();
  83
  84 private:
  85     void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
  86     void checkAllocWeights(CollationWeights &cw,
  87                            uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
  88                            int32_t someLength, int32_t minCount);
  89
  90     static UnicodeString printSortKey(const uint8_t *p, int32_t length);
  91     static UnicodeString printCollationKey(const CollationKey &key);
  92
  93     // Helpers & fields for data-driven test.
  94     static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; }
  95     static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; }
  96     static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c == 0x40; }  // %*@
  97     int32_t skipSpaces(int32_t i) {
  98         while(isSpace(fileLine[i])) { ++i; }
  99         return i;
 100     }
 101
 102     UBool readLine(UCHARBUF *f, IcuTestErrorCode &errorCode);
 103     void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode);
 104     Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode);
 105     void parseAndSetAttribute(IcuTestErrorCode &errorCode);
 106     void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode);
 107     void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode);
 108     void setRootCollator(IcuTestErrorCode &errorCode);
 109     void setLocaleCollator(IcuTestErrorCode &errorCode);
 110
 111     UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const;
 112
 113     UBool getSortKeyParts(const UChar *s, int32_t length,
 114                           CharString &dest, int32_t partSize,
 115                           IcuTestErrorCode &errorCode);
 116     UBool getCollationKey(const char *norm, const UnicodeString &line,
 117                           const UChar *s, int32_t length,
 118                           CollationKey &key, IcuTestErrorCode &errorCode);
 119     UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
 120                           const UnicodeString &prevString, const UnicodeString &s,
 121                           UCollationResult expectedOrder, Collation::Level expectedLevel,
 122                           IcuTestErrorCode &errorCode);
 123     void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode);
 124
 125     const Normalizer2 *fcd, *nfd;
 126     UnicodeString fileLine;
 127     int32_t fileLineNumber;
 128     UnicodeString fileTestName;
 129     Collator *coll;
 130 };
 131
 132 extern IntlTest *createCollationTest() {
 133     return new CollationTest();
 134 }
 135
 136 void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
 137     if(exec) {
 138         logln("TestSuite CollationTest: ");
 139     }
 140     TESTCASE_AUTO_BEGIN;
 141     TESTCASE_AUTO(TestMinMax);
 142     TESTCASE_AUTO(TestImplicits);
 143     TESTCASE_AUTO(TestNulTerminated);
 144     TESTCASE_AUTO(TestIllegalUTF8);
 145     TESTCASE_AUTO(TestShortFCDData);
 146     TESTCASE_AUTO(TestFCD);
 147     TESTCASE_AUTO(TestCollationWeights);
 148     TESTCASE_AUTO(TestRootElements);
 149     TESTCASE_AUTO(TestTailoredElements);
 150     TESTCASE_AUTO(TestDataDriven);
 151     TESTCASE_AUTO_END;
 152 }
 153
 154 void CollationTest::TestMinMax() {
 155     IcuTestErrorCode errorCode(*this, "TestMinMax");
 156
 157     setRootCollator(errorCode);
 158     if(errorCode.isFailure()) {
 159         errorCode.reset();
 160         return;
 161     }
 162     RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll);
 163     if(rbc == NULL) {
 164         errln("the root collator is not a RuleBasedCollator");
 165         return;
 166     }
 167
 168     static const UChar s[2] = { 0xfffe, 0xffff };
 169     UVector64 ces(errorCode);
 170     rbc->internalGetCEs(UnicodeString(FALSE, s, 2), ces, errorCode);
 171     errorCode.assertSuccess();
 172     if(ces.size() != 2) {
 173         errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size());
 174         return;
 175     }
 176     int64_t ce = ces.elementAti(0);
 177     int64_t expected =
 178         ((int64_t)Collation::MERGE_SEPARATOR_PRIMARY << 32) |
 179         Collation::MERGE_SEPARATOR_LOWER32;
 180     if(ce != expected) {
 181         errln("CE(U+fffe)=%04lx != 02.02.02", (long)ce);
 182     }
 183
 184     ce = ces.elementAti(1);
 185     expected = Collation::makeCE(Collation::MAX_PRIMARY);
 186     if(ce != expected) {
 187         errln("CE(U+ffff)=%04lx != max..", (long)ce);
 188     }
 189 }
 190
 191 void CollationTest::TestImplicits() {
 192     IcuTestErrorCode errorCode(*this, "TestImplicits");
 193
 194     const CollationData *cd = CollationRoot::getData(errorCode);
 195     if(errorCode.logDataIfFailureAndReset("CollationRoot::getBaseData()")) {
 196         return;
 197     }
 198
 199     // Implicit primary weights should be assigned for the following sets,
 200     // and sort in ascending order by set and then code point.
 201     // See http://www.unicode.org/reports/tr10/#Implicit_Weights
 202     // core Han Unified Ideographs
 203     UnicodeSet coreHan("[\\p{unified_ideograph}&"
 204                             "[\\p{Block=CJK_Unified_Ideographs}"
 205                             "\\p{Block=CJK_Compatibility_Ideographs}]]",
 206                        errorCode);
 207     // all other Unified Han ideographs
 208     UnicodeSet otherHan("[\\p{unified ideograph}-"
 209                             "[\\p{Block=CJK_Unified_Ideographs}"
 210                             "\\p{Block=CJK_Compatibility_Ideographs}]]",
 211                         errorCode);
 212     UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode);
 213     unassigned.remove(0xfffe, 0xffff);  // These have special CLDR root mappings.
 214     if(errorCode.logIfFailureAndReset("UnicodeSet")) {
 215         return;
 216     }
 217     const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned };
 218     UChar32 prev = 0;
 219     uint32_t prevPrimary = 0;
 220     UTF16CollationIterator ci(cd, FALSE, NULL, NULL, NULL);
 221     for(int32_t i = 0; i < LENGTHOF(sets); ++i) {
 222         LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i]));
 223         while(iter->next()) {
 224             UChar32 c = iter->getCodepoint();
 225             UnicodeString s(c);
 226             ci.setText(s.getBuffer(), s.getBuffer() + s.length());
 227             int64_t ce = ci.nextCE(errorCode);
 228             int64_t ce2 = ci.nextCE(errorCode);
 229             if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
 230                 return;
 231             }
 232             if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) {
 233                 errln("CollationIterator.nextCE(U+%04lx) did not yield exactly one CE", (long)c);
 234                 continue;
 235             }
 236             if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) {
 237                 errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx",
 238                       (long)c, (long)(ce & 0xffffffff));
 239                 continue;
 240             }
 241             uint32_t primary = (uint32_t)(ce >> 32);
 242             if(!(primary > prevPrimary)) {
 243                 errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..",
 244                       (long)c, (long)primary, (long)prev, (long)prevPrimary);
 245             }
 246             prev = c;
 247             prevPrimary = primary;
 248         }
 249     }
 250 }
 251
 252 void CollationTest::TestNulTerminated() {
 253     IcuTestErrorCode errorCode(*this, "TestNulTerminated");
 254     const CollationData *data = CollationRoot::getData(errorCode);
 255     if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
 256         return;
 257     }
 258
 259     static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 };
 260
 261     UTF16CollationIterator ci1(data, FALSE, s, s, s + 2);
 262     UTF16CollationIterator ci2(data, FALSE, s + 2, s + 2, NULL);
 263     for(int32_t i = 0;; ++i) {
 264         int64_t ce1 = ci1.nextCE(errorCode);
 265         int64_t ce2 = ci2.nextCE(errorCode);
 266         if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
 267             return;
 268         }
 269         if(ce1 != ce2) {
 270             errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i);
 271             break;
 272         }
 273         if(ce1 == Collation::NO_CE) { break; }
 274     }
 275 }
 276
 277 void CollationTest::TestIllegalUTF8() {
 278     IcuTestErrorCode errorCode(*this, "TestIllegalUTF8");
 279
 280     setRootCollator(errorCode);
 281     if(errorCode.isFailure()) {
 282         errorCode.reset();
 283         return;
 284     }
 285     coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
 286
 287     static const char *strings[] = {
 288         // U+FFFD
 289         "a\xef\xbf\xbdz",
 290         // illegal byte sequences
 291         "a\x80z",  // trail byte
 292         "a\xc1\x81z",  // non-shortest form
 293         "a\xe0\x82\x83z",  // non-shortest form
 294         "a\xed\xa0\x80z",  // lead surrogate: would be U+D800
 295         "a\xed\xbf\xbfz",  // trail surrogate: would be U+DFFF
 296         "a\xf0\x8f\xbf\xbfz",  // non-shortest form
 297         "a\xf4\x90\x80\x80z"  // out of range: would be U+110000
 298     };
 299
 300     StringPiece fffd(strings[0]);
 301     for(int32_t i = 1; i < LENGTHOF(strings); ++i) {
 302         StringPiece illegal(strings[i]);
 303         UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
 304         if(order != UCOL_EQUAL) {
 305             errln("compareUTF8(U+FFFD, string %d with illegal UTF-8)=%d != UCOL_EQUAL",
 306                   (int)i, order);
 307         }
 308     }
 309 }
 310
 311 namespace {
 312
 313 void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) {
 314     for(UChar32 c = 0x10000; c < 0x110000;) {
 315         UChar32 next = c + 0x400;
 316         if(src.containsSome(c, next - 1)) {
 317             dest.add(U16_LEAD(c));
 318         }
 319         c = next;
 320     }
 321 }
 322
 323 }  // namespace
 324
 325 void CollationTest::TestShortFCDData() {
 326     // See CollationFCD class comments.
 327     IcuTestErrorCode errorCode(*this, "TestShortFCDData");
 328     UnicodeSet expectedLccc("[:^lccc=0:]", errorCode);
 329     errorCode.assertSuccess();
 330     expectedLccc.add(0xdc00, 0xdfff);  // add all trail surrogates
 331     addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
 332     UnicodeSet lccc;  // actual
 333     for(UChar32 c = 0; c <= 0xffff; ++c) {
 334         if(CollationFCD::hasLccc(c)) { lccc.add(c); }
 335     }
 336     UnicodeSet diff(expectedLccc);
 337     diff.removeAll(lccc);
 338     diff.remove(0x10000, 0x10ffff);  // hasLccc() only works for the BMP
 339     UnicodeString empty("[]");
 340     UnicodeString diffString;
 341     diff.toPattern(diffString, TRUE);
 342     assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
 343     diff = lccc;
 344     diff.removeAll(expectedLccc);
 345     diff.toPattern(diffString, TRUE);
 346     assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, TRUE);
 347
 348     UnicodeSet expectedTccc("[:^tccc=0:]", errorCode);
 349     if (errorCode.isSuccess()) {
 350         addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
 351         addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
 352         UnicodeSet tccc;  // actual
 353         for(UChar32 c = 0; c <= 0xffff; ++c) {
 354             if(CollationFCD::hasTccc(c)) { tccc.add(c); }
 355         }
 356         diff = expectedTccc;
 357         diff.removeAll(tccc);
 358         diff.remove(0x10000, 0x10ffff);  // hasTccc() only works for the BMP
 359         assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
 360         diff = tccc;
 361         diff.removeAll(expectedTccc);
 362         diff.toPattern(diffString, TRUE);
 363         assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
 364     }
 365 }
 366
 /**
  * Minimal bidirectional cursor over a caller-owned array of code points.
  * next() and previous() return U_SENTINEL once the cursor is at the
  * end or the start, respectively, and then leave the cursor where it is.
  */
 class CodePointIterator {
 public:
     /** Iterates over cp[0..length); the array is not copied. */
     CodePointIterator(const UChar32 *cp, int32_t length)
             : codePoints(cp), count(length), cursor(0) {}

     /** Moves the cursor back to the first code point. */
     void resetToStart() { cursor = 0; }

     /** Returns the code point at the cursor and advances, or U_SENTINEL at the end. */
     UChar32 next() {
         if(cursor >= count) { return U_SENTINEL; }
         return codePoints[cursor++];
     }

     /** Steps back and returns that code point, or U_SENTINEL at the start. */
     UChar32 previous() {
         if(cursor <= 0) { return U_SENTINEL; }
         return codePoints[--cursor];
     }

     int32_t getLength() const { return count; }
     int getIndex() const { return (int)cursor; }

 private:
     const UChar32 *codePoints;
     int32_t count;
     int32_t cursor;
 };
 380
 381 void CollationTest::checkFCD(const char *name,
 382                              CollationIterator &ci, CodePointIterator &cpi) {
 383     IcuTestErrorCode errorCode(*this, "checkFCD");
 384
 385     // Iterate forward to the limit.
 386     for(;;) {
 387         UChar32 c1 = ci.nextCodePoint(errorCode);
 388         UChar32 c2 = cpi.next();
 389         if(c1 != c2) {
 390             errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d",
 391                   name, (long)c1, (long)c2, cpi.getIndex());
 392             return;
 393         }
 394         if(c1 < 0) { break; }
 395     }
 396
 397     // Iterate backward most of the way.
 398     for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) {
 399         UChar32 c1 = ci.previousCodePoint(errorCode);
 400         UChar32 c2 = cpi.previous();
 401         if(c1 != c2) {
 402             errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d",
 403                   name, (long)c1, (long)c2, cpi.getIndex());
 404             return;
 405         }
 406     }
 407
 408     // Forward again.
 409     for(;;) {
 410         UChar32 c1 = ci.nextCodePoint(errorCode);
 411         UChar32 c2 = cpi.next();
 412         if(c1 != c2) {
 413             errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d",
 414                   name, (long)c1, (long)c2, cpi.getIndex());
 415             return;
 416         }
 417         if(c1 < 0) { break; }
 418     }
 419
 420     // Iterate backward to the start.
 421     for(;;) {
 422         UChar32 c1 = ci.previousCodePoint(errorCode);
 423         UChar32 c2 = cpi.previous();
 424         if(c1 != c2) {
 425             errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d",
 426                   name, (long)c1, (long)c2, cpi.getIndex());
 427             return;
 428         }
 429         if(c1 < 0) { break; }
 430     }
 431 }
 432
 433 void CollationTest::TestFCD() {
 434     IcuTestErrorCode errorCode(*this, "TestFCD");
 435     const CollationData *data = CollationRoot::getData(errorCode);
 436     if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
 437         return;
 438     }
 439
 440     // Input string, not FCD, NUL-terminated.
 441     static const UChar s[] = {
 442         0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62,
 443         U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),  // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
 444         0x327, 0x308,  // ccc=202, 230
 445         U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),  // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
 446         U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),
 447         U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),
 448         0xac01,
 449         0xe7,  // Character with tccc!=0 decomposed together with mis-ordered sequence.
 450         U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165),
 451         0xe1,  // Character with tccc!=0 decomposed together with decomposed sequence.
 452         0xf73, 0xf75,  // Tibetan composite vowels must be decomposed.
 453         0x4e00, 0xf81,
 454         0
 455     };
 456     // Expected code points.
 457     static const UChar32 cp[] = {
 458         0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
 459         0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
 460         0x1D15F, 0x1D16D,
 461         0xac01,
 462         0x63, 0x327, 0x1D165, 0x1D16D,
 463         0x61,
 464         0xf71, 0xf71, 0xf72, 0xf74, 0x301,
 465         0x4e00, 0xf71, 0xf80
 466     };
 467
 468     FCDUTF16CollationIterator u16ci(data, FALSE, s, s, NULL);
 469     if(errorCode.logIfFailureAndReset("FCDUTF16CollationIterator constructor")) {
 470         return;
 471     }
 472     CodePointIterator cpi(cp, LENGTHOF(cp));
 473     checkFCD("FCDUTF16CollationIterator", u16ci, cpi);
 474
 475 #if U_HAVE_STD_STRING
 476     cpi.resetToStart();
 477     std::string utf8;
 478     UnicodeString(s).toUTF8String(utf8);
 479     FCDUTF8CollationIterator u8ci(data, FALSE,
 480                                   reinterpret_cast<const uint8_t *>(utf8.c_str()), 0, -1);
 481     if(errorCode.logIfFailureAndReset("FCDUTF8CollationIterator constructor")) {
 482         return;
 483     }
 484     checkFCD("FCDUTF8CollationIterator", u8ci, cpi);
 485 #endif
 486
 487     cpi.resetToStart();
 488     UCharIterator iter;
 489     uiter_setString(&iter, s, LENGTHOF(s) - 1);  // -1: without the terminating NUL
 490     FCDUIterCollationIterator uici(data, FALSE, iter, 0);
 491     if(errorCode.logIfFailureAndReset("FCDUIterCollationIterator constructor")) {
 492         return;
 493     }
 494     checkFCD("FCDUIterCollationIterator", uici, cpi);
 495 }
 496
 497 void CollationTest::checkAllocWeights(CollationWeights &cw,
 498                                       uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
 499                                       int32_t someLength, int32_t minCount) {
 500     if(!cw.allocWeights(lowerLimit, upperLimit, n)) {
 501         errln("CollationWeights::allocWeights(%lx, %lx, %ld) = FALSE",
 502               (long)lowerLimit, (long)upperLimit, (long)n);
 503         return;
 504     }
 505     uint32_t previous = lowerLimit;
 506     int32_t count = 0;  // number of weights that have someLength
 507     for(int32_t i = 0; i < n; ++i) {
 508         uint32_t w = cw.nextWeight();
 509         if(w == 0xffffffff) {
 510             errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
 511                   "returns only %ld weights",
 512                   (long)lowerLimit, (long)upperLimit, (long)n, (long)i);
 513             return;
 514         }
 515         if(!(previous < w && w < upperLimit)) {
 516             errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
 517                   "number %ld -> %lx not between %lx and %lx",
 518                   (long)lowerLimit, (long)upperLimit, (long)n,
 519                   (long)(i + 1), (long)w, (long)previous, (long)upperLimit);
 520             return;
 521         }
 522         if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; }
 523     }
 524     if(count < minCount) {
 525         errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
 526               "returns only %ld < %ld weights of length %d",
 527               (long)lowerLimit, (long)upperLimit, (long)n,
 528               (long)count, (long)minCount, (int)someLength);
 529     }
 530 }
 531
 532 void CollationTest::TestCollationWeights() {
 533     CollationWeights cw;
 534
 535     // Non-compressible primaries use 254 second bytes 02..FF.
 536     logln("CollationWeights.initForPrimary(non-compressible)");
 537     cw.initForPrimary(FALSE);
 538     // Expect 1 weight 11 and 254 weights 12xx.
 539     checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1);
 540     checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254);
 541     // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
 542     checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255);
 543     // Expect 254 two-byte weights from the ranges 10ff and 11xx.
 544     checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254);
 545     // Expect 254^2=64516 three-byte weights.
 546     // During computation, there should be 3 three-byte ranges
 547     // 10ffff, 11xxxx, 120202.
 548     // The middle one should be split 64515:1,
 549     // and the newly-split-off range and the last ranged lengthened.
 550     checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516);
 551     // Expect weights 1102 & 1103.
 552     checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2);
 553     // Expect weights 102102 & 102103.
 554     checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
 555
 556     // Compressible primaries use 251 second bytes 04..FE.
 557     logln("CollationWeights.initForPrimary(compressible)");
 558     cw.initForPrimary(TRUE);
 559     // Expect 1 weight 11 and 251 weights 12xx.
 560     checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1);
 561     checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251);
 562     // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
 563     checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252);
 564     // Expect weights 1104 & 1105.
 565     checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2);
 566     // Expect weights 102102 & 102103.
 567     checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
 568
 569     // Secondary and tertiary weights use only bytes 3 & 4.
 570     logln("CollationWeights.initForSecondary()");
 571     cw.initForSecondary();
 572     // Expect weights fbxx and all four fc..ff.
 573     checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4);
 574
 575     logln("CollationWeights.initForTertiary()");
 576     cw.initForTertiary();
 577     // Expect weights 3dxx and both 3e & 3f.
 578     checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2);
 579 }
 580
 581 namespace {
 582
 583 UBool isValidCE(const CollationRootElements &re, const CollationData &data,
 584                 uint32_t p, uint32_t s, uint32_t ctq) {
 585     uint32_t p1 = p >> 24;
 586     uint32_t p2 = (p >> 16) & 0xff;
 587     uint32_t p3 = (p >> 8) & 0xff;
 588     uint32_t p4 = p & 0xff;
 589     uint32_t s1 = s >> 8;
 590     uint32_t s2 = s & 0xff;
 591     // ctq = Case, Tertiary, Quaternary
 592     uint32_t c = (ctq & Collation::CASE_MASK) >> 14;
 593     uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK;
 594     uint32_t t1 = t >> 8;
 595     uint32_t t2 = t & 0xff;
 596     uint32_t q = ctq & Collation::QUATERNARY_MASK;
 597     // No leading zero bytes.
 598     if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
 599         return FALSE;
 600     }
 601     // No intermediate zero bytes.
 602     if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
 603         return FALSE;
 604     }
 605     if(p2 != 0 && p3 == 0 && p4 != 0) {
 606         return FALSE;
 607     }
 608     // Minimum & maximum lead bytes.
 609     if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
 610             (s1 != 0 && s1 <= Collation::MERGE_SEPARATOR_BYTE) ||
 611             (t1 != 0 && t1 <= Collation::MERGE_SEPARATOR_BYTE)) {
 612         return FALSE;
 613     }
 614     if(t1 != 0 && t1 > 0x3f) {
 615         return FALSE;
 616     }
 617     if(c > 2) {
 618         return FALSE;
 619     }
 620     // The valid byte range for the second primary byte depends on compressibility.
 621     if(p2 != 0) {
 622         if(data.isCompressibleLeadByte(p1)) {
 623             if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE ||
 624                     Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
 625                 return FALSE;
 626             }
 627         } else {
 628             if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) {
 629                 return FALSE;
 630             }
 631         }
 632     }
 633     // Other bytes just need to avoid the level separator.
 634     // Trailing zeros are ok.
 635     U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1);
 636     if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR_BYTE ||
 637             s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPARATOR_BYTE) {
 638         return FALSE;
 639     }
 640     // Well-formed CEs.
 641     if(p == 0) {
 642         if(s == 0) {
 643             if(t == 0) {
 644                 // Completely ignorable CE.
 645                 // Quaternary CEs are not supported.
 646                 if(c != 0 || q != 0) {
 647                     return FALSE;
 648                 }
 649             } else {
 650                 // Tertiary CE.
 651                 if(t < re.getTertiaryBoundary() || c != 2) {
 652                     return FALSE;
 653                 }
 654             }
 655         } else {
 656             // Secondary CE.
 657             if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
 658                 return FALSE;
 659             }
 660         }
 661     } else {
 662         // Primary CE.
 663         if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) ||
 664                 s >= re.getSecondaryBoundary()) {
 665             return FALSE;
 666         }
 667         if(t == 0 || t >= re.getTertiaryBoundary()) {
 668             return FALSE;
 669         }
 670     }
 671     return TRUE;
 672 }
 673
 674 UBool isValidCE(const CollationRootElements &re, const CollationData &data, int64_t ce) {
 675     uint32_t p = (uint32_t)(ce >> 32);
 676     uint32_t secTer = (uint32_t)ce;
 677     return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff);
 678 }
 679
 680 class RootElementsIterator {
 681 public:
 682     RootElementsIterator(const CollationData &root)
 683             : data(root),
 684               elements(root.rootElements), length(root.rootElementsLength),
 685               pri(0), secTer(0),
 686               index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX]) {}
 687
 688     UBool next() {
 689         if(index >= length) { return FALSE; }
 690         uint32_t p = elements[index];
 691         if(p == CollationRootElements::PRIMARY_SENTINEL) { return FALSE; }
 692         if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
 693             ++index;
 694             secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG;
 695             return TRUE;
 696         }
 697         if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) {
 698             // End of a range, enumerate the primaries in the range.
 699             int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK;
 700             p &= 0xffffff00;
 701             if(pri == p) {
 702                 // Finished the range, return the next CE after it.
 703                 ++index;
 704                 return next();
 705             }
 706             U_ASSERT(pri < p);
 707             // Return the next primary in this range.
 708             UBool isCompressible = data.isCompressiblePrimary(pri);
 709             if((pri & 0xffff) == 0) {
 710                 pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step);
 711             } else {
 712                 pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible, step);
 713             }
 714             return TRUE;
 715         }
 716         // Simple primary CE.
 717         ++index;
 718         pri = p;
 719         secTer = Collation::COMMON_SEC_AND_TER_CE;
 720         return TRUE;
 721     }
 722
 723     uint32_t getPrimary() const { return pri; }
 724     uint32_t getSecTer() const { return secTer; }
 725
 726 private:
 727     const CollationData &data;
 728     const uint32_t *elements;
 729     int32_t length;
 730
 731     uint32_t pri;
 732     uint32_t secTer;
 733     int32_t index;
 734 };
 735
 736 }  // namespace
 737
 738 void CollationTest::TestRootElements() {
 739     IcuTestErrorCode errorCode(*this, "TestRootElements");
 740     const CollationData *root = CollationRoot::getData(errorCode);
 741     if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
 742         return;
 743     }
 744     CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
 745     RootElementsIterator iter(*root);
 746
 747     // We check each root CE for validity,
 748     // and we also verify that there is a tailoring gap between each two CEs.
 749     CollationWeights cw1c;  // compressible primary weights
 750     CollationWeights cw1u;  // uncompressible primary weights
 751     CollationWeights cw2;
 752     CollationWeights cw3;
 753
 754     cw1c.initForPrimary(TRUE);
 755     cw1u.initForPrimary(FALSE);
 756     cw2.initForSecondary();
 757     cw3.initForTertiary();
 758
 759     // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
 760     // nor the special merge-separator CE for U+FFFE.
 761     uint32_t prevPri = 0;
 762     uint32_t prevSec = 0;
 763     uint32_t prevTer = 0;
 764     while(iter.next()) {
 765         uint32_t pri = iter.getPrimary();
 766         uint32_t secTer = iter.getSecTer();
 767         // CollationRootElements CEs must have 0 case and quaternary bits.
 768         if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) {
 769             errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx",
 770                   (long)pri, (long)secTer);
 771         }
 772         uint32_t sec = secTer >> 16;
 773         uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK;
 774         uint32_t ctq = ter;
 775         if(pri == 0 && sec == 0 && ter != 0) {
 776             // Tertiary CEs must have uppercase bits,
 777             // but they are not stored in the CollationRootElements.
 778             ctq |= 0x8000;
 779         }
 780         if(!isValidCE(rootElements, *root, pri, sec, ctq)) {
 781             errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer);
 782         } else {
 783             if(pri != prevPri) {
 784                 uint32_t newWeight = 0;
 785                 if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) {
 786                     // There is currently no tailoring gap after primary ignorables,
 787                     // and we forbid tailoring after U+FFFD and U+FFFF.
 788                 } else if(root->isCompressiblePrimary(prevPri)) {
 789                     if(!cw1c.allocWeights(prevPri, pri, 1)) {
 790                         errln("no primary/compressible tailoring gap between %08lx and %08lx",
 791                               (long)prevPri, (long)pri);
 792                     } else {
 793                         newWeight = cw1c.nextWeight();
 794                     }
 795                 } else {
 796                     if(!cw1u.allocWeights(prevPri, pri, 1)) {
 797                         errln("no primary/uncompressible tailoring gap between %08lx and %08lx",
 798                               (long)prevPri, (long)pri);
 799                     } else {
 800                         newWeight = cw1u.nextWeight();
 801                     }
 802                 }
 803                 if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
 804                     errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx",
 805                           (long)prevPri, (long)newWeight, (long)pri);
 806                 }
 807             } else if(sec != prevSec) {
 808                 uint32_t lowerLimit =
 809                     prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec;
 810                 if(!cw2.allocWeights(lowerLimit, sec, 1)) {
 811                     errln("no secondary tailoring gap between %04x and %04x", lowerLimit, sec);
 812                 } else {
 813                     uint32_t newWeight = cw2.nextWeight();
 814                     if(!(prevSec < newWeight && newWeight < sec)) {
 815                         errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
 816                               (long)lowerLimit, (long)newWeight, (long)sec);
 817                     }
 818                 }
 819             } else if(ter != prevTer) {
 820                 uint32_t lowerLimit =
 821                     prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer;
 822                 if(!cw3.allocWeights(lowerLimit, ter, 1)) {
 823                     errln("no teriary tailoring gap between %04x and %04x", lowerLimit, ter);
 824                 } else {
 825                     uint32_t newWeight = cw3.nextWeight();
 826                     if(!(prevTer < newWeight && newWeight < ter)) {
 827                         errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
 828                               (long)lowerLimit, (long)newWeight, (long)ter);
 829                     }
 830                 }
 831             } else {
 832                 errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer);
 833             }
 834         }
 835         prevPri = pri;
 836         prevSec = sec;
 837         prevTer = ter;
 838     }
 839 }
 840
 841 void CollationTest::TestTailoredElements() {
 842     IcuTestErrorCode errorCode(*this, "TestTailoredElements");
 843     const CollationData *root = CollationRoot::getData(errorCode);
 844     if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
 845         return;
 846     }
 847     CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
 848
 849     UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NULL, errorCode);
 850     if(errorCode.logIfFailureAndReset("failed to create a hash table")) {
 851         return;
 852     }
 853     uhash_setKeyDeleter(prevLocales, uprv_free);
 854     // TestRootElements() tests the root collator which does not have tailorings.
 855     uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode);
 856     uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode);
 857     uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode);
 858
 859     UVector64 ces(errorCode);
 860     LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales());
 861     U_ASSERT(locales.isValid());
 862     const char *localeID = "root";
 863     do {
 864         Locale locale(localeID);
 865         LocalPointer<StringEnumeration> types(
 866                 Collator::getKeywordValuesForLocale("collation", locale, FALSE, errorCode));
 867         errorCode.assertSuccess();
 868         const char *type = NULL;  // default type
 869         do {
 870             Locale localeWithType(locale);
 871             if(type != NULL) {
 872                 localeWithType.setKeywordValue("collation", type, errorCode);
 873             }
 874             errorCode.assertSuccess();
 875             LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode));
 876             if(errorCode.logIfFailureAndReset("Collator::createInstance(%s)",
 877                                               localeWithType.getName())) {
 878                 continue;
 879             }
 880             Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode);
 881             if(uhash_geti(prevLocales, actual.getName()) != 0) {
 882                 continue;
 883             }
 884             uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode);
 885             errorCode.assertSuccess();
 886             logln("TestTailoredElements(): requested %s -> actual %s",
 887                   localeWithType.getName(), actual.getName());
 888             RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getAlias());
 889             if(rbc == NULL) {
 890                 continue;
 891             }
 892             // Note: It would be better to get tailored strings such that we can
 893             // identify the prefix, and only get the CEs for the prefix+string,
 894             // not also for the prefix.
 895             // There is currently no API for that.
 896             // It would help in an unusual case where a contraction starting in the prefix
 897             // extends past its end, and we do not see the intended mapping.
 898             // For example, for a mapping p|st, if there is also a contraction ps,
 899             // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
 900             LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode));
 901             errorCode.assertSuccess();
 902             UnicodeSetIterator iter(*tailored);
 903             while(iter.next()) {
 904                 const UnicodeString &s = iter.getString();
 905                 ces.removeAllElements();
 906                 rbc->internalGetCEs(s, ces, errorCode);
 907                 errorCode.assertSuccess();
 908                 for(int32_t i = 0; i < ces.size(); ++i) {
 909                     int64_t ce = ces.elementAti(i);
 910                     if(!isValidCE(rootElements, *root, ce)) {
 911                         errln("invalid tailored CE %016llx at CE index %d from string:",
 912                               (long long)ce, (int)i);
 913                         infoln(prettify(s));
 914                     }
 915                 }
 916             }
 917         } while((type = types->next(NULL, errorCode)) != NULL);
 918     } while((localeID = locales->next(NULL, errorCode)) != NULL);
 919     uhash_close(prevLocales);
 920 }
 921
 922 UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) {
 923     UnicodeString s;
 924     for(int32_t i = 0; i < length; ++i) {
 925         if(i > 0) { s.append((UChar)0x20); }
 926         uint8_t b = p[i];
 927         if(b == 0) {
 928             s.append((UChar)0x2e);  // period
 929         } else if(b == 1) {
 930             s.append((UChar)0x7c);  // vertical bar
 931         } else {
 932             appendHex(b, 2, s);
 933         }
 934     }
 935     return s;
 936 }
 937
 938 UnicodeString CollationTest::printCollationKey(const CollationKey &key) {
 939     int32_t length;
 940     const uint8_t *p = key.getByteArray(length);
 941     return printSortKey(p, length);
 942 }
 943
 944 UBool CollationTest::readLine(UCHARBUF *f, IcuTestErrorCode &errorCode) {
 945     int32_t lineLength;
 946     const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
 947     if(line == NULL || errorCode.isFailure()) {
 948         fileLine.remove();
 949         return FALSE;
 950     }
 951     ++fileLineNumber;
 952     // Strip trailing CR/LF, comments, and spaces.
 953     const UChar *comment = u_memchr(line, 0x23, lineLength);  // '#'
 954     if(comment != NULL) {
 955         lineLength = (int32_t)(comment - line);
 956     } else {
 957         while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; }
 958     }
 959     while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; }
 960     fileLine.setTo(FALSE, line, lineLength);
 961     return TRUE;
 962 }
 963
 964 void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s,
 965                                 UErrorCode &errorCode) {
 966     int32_t length = fileLine.length();
 967     int32_t i;
 968     for(i = start; i < length && !isSpace(fileLine[i]); ++i) {}
 969     int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start);  // '|'
 970     if(pipeIndex >= 0) {
 971         prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape();
 972         if(prefix.isEmpty()) {
 973             errln("empty prefix on line %d", (int)fileLineNumber);
 974             infoln(fileLine);
 975             errorCode = U_PARSE_ERROR;
 976             return;
 977         }
 978         start = pipeIndex + 1;
 979     } else {
 980         prefix.remove();
 981     }
 982     s = fileLine.tempSubStringBetween(start, i).unescape();
 983     if(s.isEmpty()) {
 984         errln("empty string on line %d", (int)fileLineNumber);
 985         infoln(fileLine);
 986         errorCode = U_PARSE_ERROR;
 987         return;
 988     }
 989     start = i;
 990 }
 991
 992 Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode) {
 993     Collation::Level relation;
 994     int32_t start;
 995     if(fileLine[0] == 0x3c) {  // <
 996         UChar second = fileLine[1];
 997         start = 2;
 998         switch(second) {
 999         case 0x31:  // <1
1000             relation = Collation::PRIMARY_LEVEL;
1001             break;
1002         case 0x32:  // <2
1003             relation = Collation::SECONDARY_LEVEL;
1004             break;
1005         case 0x33:  // <3
1006             relation = Collation::TERTIARY_LEVEL;
1007             break;
1008         case 0x34:  // <4
1009             relation = Collation::QUATERNARY_LEVEL;
1010             break;
1011         case 0x63:  // <c
1012             relation = Collation::CASE_LEVEL;
1013             break;
1014         case 0x69:  // <i
1015             relation = Collation::IDENTICAL_LEVEL;
1016             break;
1017         default:  // just <
1018             relation = Collation::NO_LEVEL;
1019             start = 1;
1020             break;
1021         }
1022     } else if(fileLine[0] == 0x3d) {  // =
1023         relation = Collation::ZERO_LEVEL;
1024         start = 1;
1025     } else {
1026         start = 0;
1027     }
1028     if(start == 0 || !isSpace(fileLine[start])) {
1029         errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber);
1030         infoln(fileLine);
1031         errorCode.set(U_PARSE_ERROR);
1032         return Collation::NO_LEVEL;
1033     }
1034     start = skipSpaces(start);
1035     UnicodeString prefix;
1036     parseString(start, prefix, s, errorCode);
1037     if(errorCode.isSuccess() && !prefix.isEmpty()) {
1038         errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber);
1039         infoln(fileLine);
1040         errorCode.set(U_PARSE_ERROR);
1041         return Collation::NO_LEVEL;
1042     }
1043     if(start < fileLine.length()) {
1044         errln("unexpected line contents after test string on line %d", (int)fileLineNumber);
1045         infoln(fileLine);
1046         errorCode.set(U_PARSE_ERROR);
1047         return Collation::NO_LEVEL;
1048     }
1049     return relation;
1050 }
1051
1052 static const struct {
1053     const char *name;
1054     UColAttribute attr;
1055 } attributes[] = {
1056     { "backwards", UCOL_FRENCH_COLLATION },
1057     { "alternate", UCOL_ALTERNATE_HANDLING },
1058     { "caseFirst", UCOL_CASE_FIRST },
1059     { "caseLevel", UCOL_CASE_LEVEL },
1060     // UCOL_NORMALIZATION_MODE is turned on and off automatically.
1061     { "strength", UCOL_STRENGTH },
1062     // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated.
1063     { "numeric", UCOL_NUMERIC_COLLATION }
1064 };
1065
1066 static const struct {
1067     const char *name;
1068     UColAttributeValue value;
1069 } attributeValues[] = {
1070     { "default", UCOL_DEFAULT },
1071     { "primary", UCOL_PRIMARY },
1072     { "secondary", UCOL_SECONDARY },
1073     { "tertiary", UCOL_TERTIARY },
1074     { "quaternary", UCOL_QUATERNARY },
1075     { "identical", UCOL_IDENTICAL },
1076     { "off", UCOL_OFF },
1077     { "on", UCOL_ON },
1078     { "shifted", UCOL_SHIFTED },
1079     { "non-ignorable", UCOL_NON_IGNORABLE },
1080     { "lower", UCOL_LOWER_FIRST },
1081     { "upper", UCOL_UPPER_FIRST }
1082 };
1083
1084 void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) {
1085     int32_t start = skipSpaces(1);
1086     int32_t equalPos = fileLine.indexOf(0x3d);
1087     if(equalPos < 0) {
1088         if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) {
1089             parseAndSetReorderCodes(start + 7, errorCode);
1090             return;
1091         }
1092         errln("missing '=' on line %d", (int)fileLineNumber);
1093         infoln(fileLine);
1094         errorCode.set(U_PARSE_ERROR);
1095         return;
1096     }
1097
1098     UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos);
1099     UnicodeString valueString = fileLine.tempSubString(equalPos+1);
1100     if(attrString == UNICODE_STRING("maxVariable", 11)) {
1101         UColReorderCode max;
1102         if(valueString == UNICODE_STRING("space", 5)) {
1103             max = UCOL_REORDER_CODE_SPACE;
1104         } else if(valueString == UNICODE_STRING("punct", 5)) {
1105             max = UCOL_REORDER_CODE_PUNCTUATION;
1106         } else if(valueString == UNICODE_STRING("symbol", 6)) {
1107             max = UCOL_REORDER_CODE_SYMBOL;
1108         } else if(valueString == UNICODE_STRING("currency", 8)) {
1109             max = UCOL_REORDER_CODE_CURRENCY;
1110         } else {
1111             errln("invalid attribute value name on line %d", (int)fileLineNumber);
1112             infoln(fileLine);
1113             errorCode.set(U_PARSE_ERROR);
1114             return;
1115         }
1116         coll->setMaxVariable(max, errorCode);
1117         if(errorCode.isFailure()) {
1118             errln("setMaxVariable() failed on line %d: %s",
1119                   (int)fileLineNumber, errorCode.errorName());
1120             infoln(fileLine);
1121             return;
1122         }
1123         fileLine.remove();
1124         return;
1125     }
1126
1127     UColAttribute attr;
1128     for(int32_t i = 0;; ++i) {
1129         if(i == LENGTHOF(attributes)) {
1130             errln("invalid attribute name on line %d", (int)fileLineNumber);
1131             infoln(fileLine);
1132             errorCode.set(U_PARSE_ERROR);
1133             return;
1134         }
1135         if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) {
1136             attr = attributes[i].attr;
1137             break;
1138         }
1139     }
1140
1141     UColAttributeValue value;
1142     for(int32_t i = 0;; ++i) {
1143         if(i == LENGTHOF(attributeValues)) {
1144             errln("invalid attribute value name on line %d", (int)fileLineNumber);
1145             infoln(fileLine);
1146             errorCode.set(U_PARSE_ERROR);
1147             return;
1148         }
1149         if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) {
1150             value = attributeValues[i].value;
1151             break;
1152         }
1153     }
1154
1155     coll->setAttribute(attr, value, errorCode);
1156     if(errorCode.isFailure()) {
1157         errln("illegal attribute=value combination on line %d: %s",
1158               (int)fileLineNumber, errorCode.errorName());
1159         infoln(fileLine);
1160         return;
1161     }
1162     fileLine.remove();
1163 }
1164
1165 void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode) {
1166     UVector32 reorderCodes(errorCode);
1167     while(start < fileLine.length()) {
1168         start = skipSpaces(start);
1169         int32_t limit = start;
1170         while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; }
1171         CharString name;
1172         name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), errorCode);
1173         int32_t code = CollationRuleParser::getReorderCode(name.data());
1174         if(code < -1) {
1175             errln("invalid reorder code '%s' on line %d", name.data(), (int)fileLineNumber);
1176             infoln(fileLine);
1177             errorCode.set(U_PARSE_ERROR);
1178             return;
1179         }
1180         reorderCodes.addElement(code, errorCode);
1181         start = limit;
1182     }
1183     coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
1184     if(errorCode.isFailure()) {
1185         errln("setReorderCodes() failed on line %d: %s", (int)fileLineNumber, errorCode.errorName());
1186         infoln(fileLine);
1187         return;
1188     }
1189     fileLine.remove();
1190 }
1191
1192 void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1193     UnicodeString rules;
1194     while(readLine(f, errorCode)) {
1195         if(fileLine.isEmpty()) { continue; }
1196         if(isSectionStarter(fileLine[0])) { break; }
1197         rules.append(fileLine.unescape());
1198     }
1199     if(errorCode.isFailure()) { return; }
1200     logln(rules);
1201
1202     UParseError parseError;
1203     UnicodeString reason;
1204     delete coll;
1205     coll = new RuleBasedCollator(rules, parseError, reason, errorCode);
1206     if(coll == NULL) {
1207         errln("unable to allocate a new collator");
1208         errorCode.set(U_MEMORY_ALLOCATION_ERROR);
1209         return;
1210     }
1211     if(errorCode.isFailure()) {
1212         errln("RuleBasedCollator(rules) failed - %s", errorCode.errorName());
1213         infoln(UnicodeString("  reason: ") + reason);
1214         if(parseError.offset >= 0) { infoln("  rules offset: %d", (int)parseError.offset); }
1215         if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1216             infoln(UnicodeString("  snippet: ...") +
1217                 parseError.preContext + "(!)" + parseError.postContext + "...");
1218         }
1219     } else {
1220         assertEquals("no error reason when RuleBasedCollator(rules) succeeds",
1221                      UnicodeString(), reason);
1222     }
1223 }
1224
1225 void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) {
1226     if(errorCode.isFailure()) { return; }
1227     delete coll;
1228     coll = Collator::createInstance(Locale::getRoot(), errorCode);
1229     if(errorCode.isFailure()) {
1230         dataerrln("unable to create a root collator");
1231         return;
1232     }
1233 }
1234
1235 void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) {
1236     if(errorCode.isFailure()) { return; }
1237     CharString langTag;
1238     langTag.appendInvariantChars(fileLine.tempSubString(9), errorCode);
1239     char localeID[ULOC_FULLNAME_CAPACITY];
1240     int32_t parsedLength;
1241     (void)uloc_forLanguageTag(
1242         langTag.data(), localeID, LENGTHOF(localeID), &parsedLength, errorCode);
1243     Locale locale(localeID);
1244     if(fileLine.length() == 9 ||
1245             errorCode.isFailure() || errorCode.get() == U_STRING_NOT_TERMINATED_WARNING ||
1246             parsedLength != langTag.length() || locale.isBogus()) {
1247         errln("invalid language tag on line %d", (int)fileLineNumber);
1248         infoln(fileLine);
1249         if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); }
1250         return;
1251     }
1252
1253     logln("creating a collator for locale ID %s", locale.getName());
1254     Collator *newColl = Collator::createInstance(locale, errorCode);
1255     if(errorCode.isFailure()) {
1256         dataerrln("unable to create a collator for locale %s on line %d",
1257                   locale.getName(), (int)fileLineNumber);
1258         infoln(fileLine);
1259         return;
1260     }
1261     delete coll;
1262     coll = newColl;
1263 }
1264
1265 UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const {
1266     if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return TRUE; }
1267     // In some sequences with Tibetan composite vowel signs,
1268     // even if the string passes the FCD check,
1269     // those composites must be decomposed.
1270     // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1271     int32_t index = 0;
1272     while((index = s.indexOf((UChar)0xf71, index)) >= 0) {
1273         if(++index < s.length()) {
1274             UChar c = s[index];
1275             if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return TRUE; }
1276         }
1277     }
1278     return FALSE;
1279 }
1280
1281 UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length,
1282                                      CharString &dest, int32_t partSize,
1283                                      IcuTestErrorCode &errorCode) {
1284     if(errorCode.isFailure()) { return FALSE; }
1285     uint8_t part[32];
1286     U_ASSERT(partSize <= LENGTHOF(part));
1287     UCharIterator iter;
1288     uiter_setString(&iter, s, length);
1289     uint32_t state[2] = { 0, 0 };
1290     for(;;) {
1291         int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, partSize, errorCode);
1292         UBool done = partLength < partSize;
1293         if(done) {
1294             // At the end, append the next byte as well which should be 00.
1295             ++partLength;
1296         }
1297         dest.append(reinterpret_cast<char *>(part), partLength, errorCode);
1298         if(done) {
1299             return errorCode.isSuccess();
1300         }
1301     }
1302 }
1303
1304 UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line,
1305                                      const UChar *s, int32_t length,
1306                                      CollationKey &key, IcuTestErrorCode &errorCode) {
1307     if(errorCode.isFailure()) { return FALSE; }
1308     coll->getCollationKey(s, length, key, errorCode);
1309     if(errorCode.isFailure()) {
1310         infoln(fileTestName);
1311         errln("Collator(%s).getCollationKey() failed: %s",
1312               norm, errorCode.errorName());
1313         infoln(line);
1314         return FALSE;
1315     }
1316     int32_t keyLength;
1317     const uint8_t *keyBytes = key.getByteArray(keyLength);
1318     if(keyLength == 0 || keyBytes[keyLength - 1] != 0) {
1319         infoln(fileTestName);
1320         errln("Collator(%s).getCollationKey() wrote an empty or unterminated key",
1321               norm);
1322         infoln(line);
1323         infoln(printCollationKey(key));
1324         return FALSE;
1325     }
1326
1327     int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode);
1328     if(numLevels < UCOL_IDENTICAL) {
1329         ++numLevels;
1330     } else {
1331         numLevels = 5;
1332     }
1333     if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) {
1334         ++numLevels;
1335     }
1336     errorCode.assertSuccess();
1337     int32_t numLevelSeparators = 0;
1338     for(int32_t i = 0; i < (keyLength - 1); ++i) {
1339         uint8_t b = keyBytes[i];
1340         if(b == 0) {
1341             infoln(fileTestName);
1342             errln("Collator(%s).getCollationKey() contains a 00 byte", norm);
1343             infoln(line);
1344             infoln(printCollationKey(key));
1345             return FALSE;
1346         }
1347         if(b == 1) { ++numLevelSeparators; }
1348     }
1349     if(numLevelSeparators != (numLevels - 1)) {
1350         infoln(fileTestName);
1351         errln("Collator(%s).getCollationKey() has %d level separators for %d levels",
1352               norm, (int)numLevelSeparators, (int)numLevels);
1353         infoln(line);
1354         infoln(printCollationKey(key));
1355         return FALSE;
1356     }
1357
1358     // If s contains U+FFFE, check that merged segments make the same key.
1359     LocalMemory<uint8_t> mergedKey;
1360     int32_t mergedKeyLength = 0;
1361     int32_t mergedKeyCapacity = 0;
1362     int32_t sLength = (length >= 0) ? length : u_strlen(s);
1363     int32_t segmentStart = 0;
1364     for(int32_t i = 0;;) {
1365         if(i == sLength) {
1366             if(segmentStart == 0) {
1367                 // s does not contain any U+FFFE.
1368                 break;
1369             }
1370         } else if(s[i] != 0xfffe) {
1371             ++i;
1372             continue;
1373         }
1374         // Get the sort key for another segment and merge it into mergedKey.
1375         CollationKey key1(mergedKey.getAlias(), mergedKeyLength);  // copies the bytes
1376         CollationKey key2;
1377         coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCode);
1378         int32_t key1Length, key2Length;
1379         const uint8_t *key1Bytes = key1.getByteArray(key1Length);
1380         const uint8_t *key2Bytes = key2.getByteArray(key2Length);
1381         uint8_t *dest;
1382         int32_t minCapacity = key1Length + key2Length;
1383         if(key1Length > 0) { --minCapacity; }
1384         if(minCapacity <= mergedKeyCapacity) {
1385             dest = mergedKey.getAlias();
1386         } else {
1387             if(minCapacity <= 200) {
1388                 mergedKeyCapacity = 200;
1389             } else if(minCapacity <= 2 * mergedKeyCapacity) {
1390                 mergedKeyCapacity *= 2;
1391             } else {
1392                 mergedKeyCapacity = minCapacity;
1393             }
1394             dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity);
1395         }
1396         U_ASSERT(dest != NULL || mergedKeyCapacity == 0);
1397         if(key1Length == 0) {
1398             // key2 is the sort key for the first segment.
1399             uprv_memcpy(dest, key2Bytes, key2Length);
1400             mergedKeyLength = key2Length;
1401         } else {
1402             mergedKeyLength =
1403                 ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length,
1404                                    dest, mergedKeyCapacity);
1405         }
1406         if(i == sLength) { break; }
1407         segmentStart = ++i;
1408     }
1409     if(segmentStart != 0 &&
1410             (mergedKeyLength != keyLength ||
1411             uprv_memcmp(mergedKey.getAlias(), keyBytes, keyLength) != 0)) {
1412         infoln(fileTestName);
1413         errln("Collator(%s).getCollationKey(with U+FFFE) != "
1414               "ucol_mergeSortkeys(segments)",
1415               norm);
1416         infoln(line);
1417         infoln(printCollationKey(key));
1418         infoln(printSortKey(mergedKey.getAlias(), mergedKeyLength));
1419         return FALSE;
1420     }
1421
1422     // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
1423     static const int32_t partSizes[] = { 32, 3, 1 };
1424     for(int32_t psi = 0; psi < LENGTHOF(partSizes); ++psi) {
1425         int32_t partSize = partSizes[psi];
1426         CharString parts;
1427         if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
1428             infoln(fileTestName);
1429             errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
1430                   norm, (int)partSize, errorCode.errorName());
1431             infoln(line);
1432             return FALSE;
1433         }
1434         if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
1435             infoln(fileTestName);
1436             errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
1437                   norm, (int)partSize);
1438             infoln(line);
1439             infoln(printCollationKey(key));
1440             infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
1441             return FALSE;
1442         }
1443     }
1444     return TRUE;
1445 }
1446
1447 namespace {
1448
1449 /**
1450  * Replaces unpaired surrogates with U+FFFD.
1451  * Returns s if no replacement was made, otherwise buffer.
1452  */
1453 const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buffer) {
1454     int32_t i = 0;
1455     while(i < s.length()) {
1456         UChar32 c = s.char32At(i);
1457         if(U_IS_SURROGATE(c)) {
1458             if(buffer.length() < i) {
1459                 buffer.append(s, buffer.length(), i - buffer.length());
1460             }
1461             buffer.append((UChar)0xfffd);
1462         }
1463         i += U16_LENGTH(c);
1464     }
1465     if(buffer.isEmpty()) {
1466         return s;
1467     }
1468     if(buffer.length() < i) {
1469         buffer.append(s, buffer.length(), i - buffer.length());
1470     }
1471     return buffer;
1472 }
1473
1474 }
1475
1476 UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
1477                                      const UnicodeString &prevString, const UnicodeString &s,
1478                                      UCollationResult expectedOrder, Collation::Level expectedLevel,
1479                                      IcuTestErrorCode &errorCode) {
1480     if(errorCode.isFailure()) { return FALSE; }
1481
1482     // Get the sort keys first, for error debug output.
1483     CollationKey prevKey;
1484     if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.length(),
1485                         prevKey, errorCode)) {
1486         return FALSE;
1487     }
1488     CollationKey key;
1489     if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCode)) { return FALSE; }
1490
1491     UCollationResult order = coll->compare(prevString, s, errorCode);
1492     if(order != expectedOrder || errorCode.isFailure()) {
1493         infoln(fileTestName);
1494         errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)",
1495               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1496         infoln(prevFileLine);
1497         infoln(fileLine);
1498         infoln(printCollationKey(prevKey));
1499         infoln(printCollationKey(key));
1500         return FALSE;
1501     }
1502     order = coll->compare(s, prevString, errorCode);
1503     if(order != -expectedOrder || errorCode.isFailure()) {
1504         infoln(fileTestName);
1505         errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)",
1506               (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1507         infoln(prevFileLine);
1508         infoln(fileLine);
1509         infoln(printCollationKey(prevKey));
1510         infoln(printCollationKey(key));
1511         return FALSE;
1512     }
1513     // Test NUL-termination if the strings do not contain NUL characters.
1514     UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0) >= 0;
1515     if(!containNUL) {
1516         order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, errorCode);
1517         if(order != expectedOrder || errorCode.isFailure()) {
1518             infoln(fileTestName);
1519             errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1520                   (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1521             infoln(prevFileLine);
1522             infoln(fileLine);
1523             infoln(printCollationKey(prevKey));
1524             infoln(printCollationKey(key));
1525             return FALSE;
1526         }
1527         order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, errorCode);
1528         if(order != -expectedOrder || errorCode.isFailure()) {
1529             infoln(fileTestName);
1530             errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1531                   (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1532             infoln(prevFileLine);
1533             infoln(fileLine);
1534             infoln(printCollationKey(prevKey));
1535             infoln(printCollationKey(key));
1536             return FALSE;
1537         }
1538     }
1539
1540 #if U_HAVE_STD_STRING
1541     // compare(UTF-16) treats unpaired surrogates like unassigned code points.
1542     // Unpaired surrogates cannot be converted to UTF-8.
1543     // Create valid UTF-16 strings if necessary, and use those for
1544     // both the expected compare() result and for the input to compare(UTF-8).
1545     UnicodeString prevBuffer, sBuffer;
1546     const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer);
1547     const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer);
1548     std::string prevUTF8, sUTF8;
1549     UnicodeString(prevValid).toUTF8String(prevUTF8);
1550     UnicodeString(sValid).toUTF8String(sUTF8);
1551     UCollationResult expectedUTF8Order;
1552     if(&prevValid == &prevString && &sValid == &s) {
1553         expectedUTF8Order = expectedOrder;
1554     } else {
1555         expectedUTF8Order = coll->compare(prevValid, sValid, errorCode);
1556     }
1557
1558     order = coll->compareUTF8(prevUTF8, sUTF8, errorCode);
1559     if(order != expectedUTF8Order || errorCode.isFailure()) {
1560         infoln(fileTestName);
1561         errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)",
1562               (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1563         infoln(prevFileLine);
1564         infoln(fileLine);
1565         infoln(printCollationKey(prevKey));
1566         infoln(printCollationKey(key));
1567         return FALSE;
1568     }
1569     order = coll->compareUTF8(sUTF8, prevUTF8, errorCode);
1570     if(order != -expectedUTF8Order || errorCode.isFailure()) {
1571         infoln(fileTestName);
1572         errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)",
1573               (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1574         infoln(prevFileLine);
1575         infoln(fileLine);
1576         infoln(printCollationKey(prevKey));
1577         infoln(printCollationKey(key));
1578         return FALSE;
1579     }
1580     // Test NUL-termination if the strings do not contain NUL characters.
1581     if(!containNUL) {
1582         order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -1, errorCode);
1583         if(order != expectedUTF8Order || errorCode.isFailure()) {
1584             infoln(fileTestName);
1585             errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1586                   (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1587             infoln(prevFileLine);
1588             infoln(fileLine);
1589             infoln(printCollationKey(prevKey));
1590             infoln(printCollationKey(key));
1591             return FALSE;
1592         }
1593         order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -1, errorCode);
1594         if(order != -expectedUTF8Order || errorCode.isFailure()) {
1595             infoln(fileTestName);
1596             errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1597                   (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1598             infoln(prevFileLine);
1599             infoln(fileLine);
1600             infoln(printCollationKey(prevKey));
1601             infoln(printCollationKey(key));
1602             return FALSE;
1603         }
1604     }
1605 #endif
1606
1607     UCharIterator leftIter;
1608     UCharIterator rightIter;
1609     uiter_setString(&leftIter, prevString.getBuffer(), prevString.length());
1610     uiter_setString(&rightIter, s.getBuffer(), s.length());
1611     order = coll->compare(leftIter, rightIter, errorCode);
1612     if(order != expectedOrder || errorCode.isFailure()) {
1613         infoln(fileTestName);
1614         errln("line %d Collator(%s).compare(UCharIterator: previous, current) "
1615               "wrong order: %d != %d (%s)",
1616               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1617         infoln(prevFileLine);
1618         infoln(fileLine);
1619         infoln(printCollationKey(prevKey));
1620         infoln(printCollationKey(key));
1621         return FALSE;
1622     }
1623
1624     order = prevKey.compareTo(key, errorCode);
1625     if(order != expectedOrder || errorCode.isFailure()) {
1626         infoln(fileTestName);
1627         errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)",
1628               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1629         infoln(prevFileLine);
1630         infoln(fileLine);
1631         infoln(printCollationKey(prevKey));
1632         infoln(printCollationKey(key));
1633         return FALSE;
1634     }
1635     if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1636         int32_t prevKeyLength;
1637         const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
1638         int32_t keyLength;
1639         const uint8_t *bytes = key.getByteArray(keyLength);
1640         int32_t level = Collation::PRIMARY_LEVEL;
1641         for(int32_t i = 0;; ++i) {
1642             uint8_t b = prevBytes[i];
1643             if(b != bytes[i]) { break; }
1644             if(b == Collation::LEVEL_SEPARATOR_BYTE) {
1645                 ++level;
1646                 if(level == Collation::CASE_LEVEL &&
1647                         coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_OFF) {
1648                     ++level;
1649                 }
1650             }
1651         }
1652         if(level != expectedLevel) {
1653             infoln(fileTestName);
1654             errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
1655                   (int)fileLineNumber, norm, order, level, expectedLevel);
1656             infoln(prevFileLine);
1657             infoln(fileLine);
1658             infoln(printCollationKey(prevKey));
1659             infoln(printCollationKey(key));
1660             return FALSE;
1661         }
1662     }
1663     return TRUE;
1664 }
1665
1666 void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1667     if(errorCode.isFailure()) { return; }
1668     UnicodeString prevFileLine = UNICODE_STRING("(none)", 6);
1669     UnicodeString prevString, s;
1670     prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1671     while(readLine(f, errorCode)) {
1672         if(fileLine.isEmpty()) { continue; }
1673         if(isSectionStarter(fileLine[0])) { break; }
1674         Collation::Level relation = parseRelationAndString(s, errorCode);
1675         if(errorCode.isFailure()) {
1676             errorCode.reset();
1677             break;
1678         }
1679         UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? UCOL_EQUAL : UCOL_LESS;
1680         Collation::Level expectedLevel = relation;
1681         s.getTerminatedBuffer();  // Ensure NUL-termination.
1682         UBool isOk = TRUE;
1683         if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) {
1684             coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode);
1685             isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
1686                                    expectedOrder, expectedLevel, errorCode);
1687         }
1688         if(isOk) {
1689             coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode);
1690             isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
1691                                    expectedOrder, expectedLevel, errorCode);
1692         }
1693         if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormalized(s, errorCode))) {
1694             UnicodeString pn = nfd->normalize(prevString, errorCode);
1695             UnicodeString n = nfd->normalize(s, errorCode);
1696             pn.getTerminatedBuffer();
1697             n.getTerminatedBuffer();
1698             errorCode.assertSuccess();
1699             isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
1700                                    expectedOrder, expectedLevel, errorCode);
1701         }
1702         if(!isOk) {
1703             errorCode.reset();  // already reported
1704         }
1705         prevFileLine = fileLine;
1706         prevString = s;
1707         prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1708     }
1709 }
1710
1711 void CollationTest::TestDataDriven() {
1712     IcuTestErrorCode errorCode(*this, "TestDataDriven");
1713
1714     fcd = Normalizer2Factory::getFCDInstance(errorCode);
1715     nfd = Normalizer2Factory::getNFDInstance(errorCode);
1716     if(errorCode.logDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) {
1717         return;
1718     }
1719
1720     CharString path(getSourceTestData(errorCode), errorCode);
1721     path.appendPathPart("collationtest.txt", errorCode);
1722     const char *codePage = "UTF-8";
1723     LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, errorCode));
1724     if(errorCode.logIfFailureAndReset("ucbuf_open(collationtest.txt)")) {
1725         return;
1726     }
1727     while(errorCode.isSuccess()) {
1728         // Read a new line if necessary.
1729         // Sub-parsers leave the first line set that they do not handle.
1730         if(fileLine.isEmpty()) {
1731             if(!readLine(f.getAlias(), errorCode)) { break; }
1732             continue;
1733         }
1734         if(!isSectionStarter(fileLine[0])) {
1735             errln("syntax error on line %d", (int)fileLineNumber);
1736             infoln(fileLine);
1737             return;
1738         }
1739         if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) {
1740             fileTestName = fileLine;
1741             logln(fileLine);
1742             fileLine.remove();
1743         } else if(fileLine == UNICODE_STRING("@ root", 6)) {
1744             setRootCollator(errorCode);
1745             fileLine.remove();
1746         } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) {
1747             setLocaleCollator(errorCode);
1748             fileLine.remove();
1749         } else if(fileLine == UNICODE_STRING("@ rules", 7)) {
1750             buildTailoring(f.getAlias(), errorCode);
1751         } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) {  // %
1752             parseAndSetAttribute(errorCode);
1753         } else if(fileLine == UNICODE_STRING("* compare", 9)) {
1754             checkCompareStrings(f.getAlias(), errorCode);
1755         } else {
1756             errln("syntax error on line %d", (int)fileLineNumber);
1757             infoln(fileLine);
1758             return;
1759         }
1760     }
1761 }
1762
1763 #endif  // !UCONFIG_NO_COLLATION