]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/intltest/collationtest.cpp
ICU-59180.0.1.tar.gz
[apple/icu.git] / icuSources / test / intltest / collationtest.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
57a6839d
A
3/*
4*******************************************************************************
b331163b 5* Copyright (C) 2012-2015, International Business Machines
57a6839d
A
6* Corporation and others. All Rights Reserved.
7*******************************************************************************
8* collationtest.cpp
9*
10* created on: 2012apr27
11* created by: Markus W. Scherer
12*/
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_COLLATION
17
18#include "unicode/coll.h"
19#include "unicode/errorcode.h"
20#include "unicode/localpointer.h"
21#include "unicode/normalizer2.h"
22#include "unicode/sortkey.h"
23#include "unicode/std_string.h"
24#include "unicode/strenum.h"
25#include "unicode/tblcoll.h"
26#include "unicode/uiter.h"
27#include "unicode/uniset.h"
28#include "unicode/unistr.h"
29#include "unicode/usetiter.h"
30#include "unicode/ustring.h"
31#include "charstr.h"
32#include "cmemory.h"
33#include "collation.h"
34#include "collationdata.h"
35#include "collationfcd.h"
36#include "collationiterator.h"
37#include "collationroot.h"
38#include "collationrootelements.h"
39#include "collationruleparser.h"
40#include "collationweights.h"
41#include "cstring.h"
42#include "intltest.h"
43#include "normalizer2impl.h"
44#include "ucbuf.h"
45#include "uhash.h"
46#include "uitercollationiterator.h"
47#include "utf16collationiterator.h"
48#include "utf8collationiterator.h"
49#include "uvectr32.h"
50#include "uvectr64.h"
51#include "writesrc.h"
52
57a6839d
A
53class CodePointIterator;
54
55// TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey)
56
57class CollationTest : public IntlTest {
58public:
59 CollationTest()
60 : fcd(NULL), nfd(NULL),
61 fileLineNumber(0),
62 coll(NULL) {}
63
64 ~CollationTest() {
65 delete coll;
66 }
67
68 void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
69
70 void TestMinMax();
71 void TestImplicits();
72 void TestNulTerminated();
73 void TestIllegalUTF8();
74 void TestShortFCDData();
75 void TestFCD();
76 void TestCollationWeights();
77 void TestRootElements();
78 void TestTailoredElements();
79 void TestDataDriven();
80
81private:
82 void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
83 void checkAllocWeights(CollationWeights &cw,
84 uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
85 int32_t someLength, int32_t minCount);
86
87 static UnicodeString printSortKey(const uint8_t *p, int32_t length);
88 static UnicodeString printCollationKey(const CollationKey &key);
89
90 // Helpers & fields for data-driven test.
91 static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; }
92 static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; }
93 static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c == 0x40; } // %*@
94 int32_t skipSpaces(int32_t i) {
95 while(isSpace(fileLine[i])) { ++i; }
96 return i;
97 }
98
b331163b 99 UBool readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode);
57a6839d
A
100 void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode);
101 Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode);
102 void parseAndSetAttribute(IcuTestErrorCode &errorCode);
103 void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode);
104 void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode);
105 void setRootCollator(IcuTestErrorCode &errorCode);
106 void setLocaleCollator(IcuTestErrorCode &errorCode);
107
108 UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const;
109
110 UBool getSortKeyParts(const UChar *s, int32_t length,
111 CharString &dest, int32_t partSize,
112 IcuTestErrorCode &errorCode);
113 UBool getCollationKey(const char *norm, const UnicodeString &line,
114 const UChar *s, int32_t length,
115 CollationKey &key, IcuTestErrorCode &errorCode);
b331163b
A
116 UBool getMergedCollationKey(const UChar *s, int32_t length,
117 CollationKey &key, IcuTestErrorCode &errorCode);
57a6839d
A
118 UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
119 const UnicodeString &prevString, const UnicodeString &s,
120 UCollationResult expectedOrder, Collation::Level expectedLevel,
121 IcuTestErrorCode &errorCode);
122 void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode);
123
124 const Normalizer2 *fcd, *nfd;
125 UnicodeString fileLine;
126 int32_t fileLineNumber;
127 UnicodeString fileTestName;
128 Collator *coll;
129};
130
131extern IntlTest *createCollationTest() {
132 return new CollationTest();
133}
134
135void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
136 if(exec) {
137 logln("TestSuite CollationTest: ");
138 }
139 TESTCASE_AUTO_BEGIN;
140 TESTCASE_AUTO(TestMinMax);
141 TESTCASE_AUTO(TestImplicits);
142 TESTCASE_AUTO(TestNulTerminated);
143 TESTCASE_AUTO(TestIllegalUTF8);
144 TESTCASE_AUTO(TestShortFCDData);
145 TESTCASE_AUTO(TestFCD);
146 TESTCASE_AUTO(TestCollationWeights);
147 TESTCASE_AUTO(TestRootElements);
148 TESTCASE_AUTO(TestTailoredElements);
149 TESTCASE_AUTO(TestDataDriven);
150 TESTCASE_AUTO_END;
151}
152
153void CollationTest::TestMinMax() {
154 IcuTestErrorCode errorCode(*this, "TestMinMax");
155
156 setRootCollator(errorCode);
157 if(errorCode.isFailure()) {
158 errorCode.reset();
159 return;
160 }
161 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll);
162 if(rbc == NULL) {
163 errln("the root collator is not a RuleBasedCollator");
164 return;
165 }
166
167 static const UChar s[2] = { 0xfffe, 0xffff };
168 UVector64 ces(errorCode);
169 rbc->internalGetCEs(UnicodeString(FALSE, s, 2), ces, errorCode);
170 errorCode.assertSuccess();
171 if(ces.size() != 2) {
172 errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size());
173 return;
174 }
175 int64_t ce = ces.elementAti(0);
b331163b 176 int64_t expected = Collation::makeCE(Collation::MERGE_SEPARATOR_PRIMARY);
57a6839d 177 if(ce != expected) {
b331163b 178 errln("CE(U+fffe)=%04lx != 02..", (long)ce);
57a6839d
A
179 }
180
181 ce = ces.elementAti(1);
182 expected = Collation::makeCE(Collation::MAX_PRIMARY);
183 if(ce != expected) {
184 errln("CE(U+ffff)=%04lx != max..", (long)ce);
185 }
186}
187
188void CollationTest::TestImplicits() {
189 IcuTestErrorCode errorCode(*this, "TestImplicits");
190
191 const CollationData *cd = CollationRoot::getData(errorCode);
b331163b 192 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
57a6839d
A
193 return;
194 }
195
196 // Implicit primary weights should be assigned for the following sets,
197 // and sort in ascending order by set and then code point.
198 // See http://www.unicode.org/reports/tr10/#Implicit_Weights
b331163b 199
57a6839d
A
200 // core Han Unified Ideographs
201 UnicodeSet coreHan("[\\p{unified_ideograph}&"
202 "[\\p{Block=CJK_Unified_Ideographs}"
203 "\\p{Block=CJK_Compatibility_Ideographs}]]",
204 errorCode);
205 // all other Unified Han ideographs
206 UnicodeSet otherHan("[\\p{unified ideograph}-"
207 "[\\p{Block=CJK_Unified_Ideographs}"
208 "\\p{Block=CJK_Compatibility_Ideographs}]]",
209 errorCode);
210 UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode);
211 unassigned.remove(0xfffe, 0xffff); // These have special CLDR root mappings.
b331163b
A
212
213 // Starting with CLDR 26/ICU 54, the root Han order may instead be
214 // the Unihan radical-stroke order.
215 // The tests should pass either way, so we only test the order of a small set of Han characters
216 // whose radical-stroke order is the same as their code point order.
217 UnicodeSet someHanInCPOrder(
218 "[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48"
219 "\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3-\\u50F6]",
220 errorCode);
221 UnicodeSet inOrder(someHanInCPOrder);
222 inOrder.addAll(unassigned).freeze();
57a6839d
A
223 if(errorCode.logIfFailureAndReset("UnicodeSet")) {
224 return;
225 }
226 const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned };
227 UChar32 prev = 0;
228 uint32_t prevPrimary = 0;
229 UTF16CollationIterator ci(cd, FALSE, NULL, NULL, NULL);
b331163b 230 for(int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
57a6839d
A
231 LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i]));
232 while(iter->next()) {
233 UChar32 c = iter->getCodepoint();
234 UnicodeString s(c);
235 ci.setText(s.getBuffer(), s.getBuffer() + s.length());
236 int64_t ce = ci.nextCE(errorCode);
237 int64_t ce2 = ci.nextCE(errorCode);
238 if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
239 return;
240 }
241 if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) {
242 errln("CollationIterator.nextCE(U+%04lx) did not yield exactly one CE", (long)c);
243 continue;
244 }
245 if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) {
246 errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx",
247 (long)c, (long)(ce & 0xffffffff));
248 continue;
249 }
250 uint32_t primary = (uint32_t)(ce >> 32);
b331163b 251 if(!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) {
57a6839d
A
252 errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..",
253 (long)c, (long)primary, (long)prev, (long)prevPrimary);
254 }
255 prev = c;
256 prevPrimary = primary;
257 }
258 }
259}
260
261void CollationTest::TestNulTerminated() {
262 IcuTestErrorCode errorCode(*this, "TestNulTerminated");
263 const CollationData *data = CollationRoot::getData(errorCode);
264 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
265 return;
266 }
267
268 static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 };
269
270 UTF16CollationIterator ci1(data, FALSE, s, s, s + 2);
271 UTF16CollationIterator ci2(data, FALSE, s + 2, s + 2, NULL);
272 for(int32_t i = 0;; ++i) {
273 int64_t ce1 = ci1.nextCE(errorCode);
274 int64_t ce2 = ci2.nextCE(errorCode);
275 if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
276 return;
277 }
278 if(ce1 != ce2) {
279 errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i);
280 break;
281 }
282 if(ce1 == Collation::NO_CE) { break; }
283 }
284}
285
286void CollationTest::TestIllegalUTF8() {
287 IcuTestErrorCode errorCode(*this, "TestIllegalUTF8");
288
289 setRootCollator(errorCode);
290 if(errorCode.isFailure()) {
291 errorCode.reset();
292 return;
293 }
294 coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
295
296 static const char *strings[] = {
297 // U+FFFD
298 "a\xef\xbf\xbdz",
299 // illegal byte sequences
300 "a\x80z", // trail byte
301 "a\xc1\x81z", // non-shortest form
302 "a\xe0\x82\x83z", // non-shortest form
303 "a\xed\xa0\x80z", // lead surrogate: would be U+D800
304 "a\xed\xbf\xbfz", // trail surrogate: would be U+DFFF
305 "a\xf0\x8f\xbf\xbfz", // non-shortest form
306 "a\xf4\x90\x80\x80z" // out of range: would be U+110000
307 };
308
309 StringPiece fffd(strings[0]);
b331163b 310 for(int32_t i = 1; i < UPRV_LENGTHOF(strings); ++i) {
57a6839d
A
311 StringPiece illegal(strings[i]);
312 UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
313 if(order != UCOL_EQUAL) {
314 errln("compareUTF8(U+FFFD, string %d with illegal UTF-8)=%d != UCOL_EQUAL",
315 (int)i, order);
316 }
317 }
318}
319
320namespace {
321
322void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) {
323 for(UChar32 c = 0x10000; c < 0x110000;) {
324 UChar32 next = c + 0x400;
325 if(src.containsSome(c, next - 1)) {
326 dest.add(U16_LEAD(c));
327 }
328 c = next;
329 }
330}
331
332} // namespace
333
334void CollationTest::TestShortFCDData() {
335 // See CollationFCD class comments.
336 IcuTestErrorCode errorCode(*this, "TestShortFCDData");
337 UnicodeSet expectedLccc("[:^lccc=0:]", errorCode);
338 errorCode.assertSuccess();
339 expectedLccc.add(0xdc00, 0xdfff); // add all trail surrogates
340 addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
341 UnicodeSet lccc; // actual
342 for(UChar32 c = 0; c <= 0xffff; ++c) {
343 if(CollationFCD::hasLccc(c)) { lccc.add(c); }
344 }
345 UnicodeSet diff(expectedLccc);
346 diff.removeAll(lccc);
347 diff.remove(0x10000, 0x10ffff); // hasLccc() only works for the BMP
348 UnicodeString empty("[]");
349 UnicodeString diffString;
350 diff.toPattern(diffString, TRUE);
351 assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
352 diff = lccc;
353 diff.removeAll(expectedLccc);
354 diff.toPattern(diffString, TRUE);
355 assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, TRUE);
356
357 UnicodeSet expectedTccc("[:^tccc=0:]", errorCode);
358 if (errorCode.isSuccess()) {
359 addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
360 addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
361 UnicodeSet tccc; // actual
362 for(UChar32 c = 0; c <= 0xffff; ++c) {
363 if(CollationFCD::hasTccc(c)) { tccc.add(c); }
364 }
365 diff = expectedTccc;
366 diff.removeAll(tccc);
367 diff.remove(0x10000, 0x10ffff); // hasTccc() only works for the BMP
368 assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
369 diff = tccc;
370 diff.removeAll(expectedTccc);
371 diff.toPattern(diffString, TRUE);
372 assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
373 }
374}
375
376class CodePointIterator {
377public:
378 CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length), pos(0) {}
379 void resetToStart() { pos = 0; }
380 UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; }
381 UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; }
382 int32_t getLength() const { return length; }
383 int getIndex() const { return (int)pos; }
384private:
385 const UChar32 *cp;
386 int32_t length;
387 int32_t pos;
388};
389
390void CollationTest::checkFCD(const char *name,
391 CollationIterator &ci, CodePointIterator &cpi) {
392 IcuTestErrorCode errorCode(*this, "checkFCD");
393
394 // Iterate forward to the limit.
395 for(;;) {
396 UChar32 c1 = ci.nextCodePoint(errorCode);
397 UChar32 c2 = cpi.next();
398 if(c1 != c2) {
399 errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d",
400 name, (long)c1, (long)c2, cpi.getIndex());
401 return;
402 }
403 if(c1 < 0) { break; }
404 }
405
406 // Iterate backward most of the way.
407 for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) {
408 UChar32 c1 = ci.previousCodePoint(errorCode);
409 UChar32 c2 = cpi.previous();
410 if(c1 != c2) {
411 errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d",
412 name, (long)c1, (long)c2, cpi.getIndex());
413 return;
414 }
415 }
416
417 // Forward again.
418 for(;;) {
419 UChar32 c1 = ci.nextCodePoint(errorCode);
420 UChar32 c2 = cpi.next();
421 if(c1 != c2) {
422 errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d",
423 name, (long)c1, (long)c2, cpi.getIndex());
424 return;
425 }
426 if(c1 < 0) { break; }
427 }
428
429 // Iterate backward to the start.
430 for(;;) {
431 UChar32 c1 = ci.previousCodePoint(errorCode);
432 UChar32 c2 = cpi.previous();
433 if(c1 != c2) {
434 errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d",
435 name, (long)c1, (long)c2, cpi.getIndex());
436 return;
437 }
438 if(c1 < 0) { break; }
439 }
440}
441
442void CollationTest::TestFCD() {
443 IcuTestErrorCode errorCode(*this, "TestFCD");
444 const CollationData *data = CollationRoot::getData(errorCode);
445 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
446 return;
447 }
448
449 // Input string, not FCD, NUL-terminated.
450 static const UChar s[] = {
451 0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62,
452 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F), // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
453 0x327, 0x308, // ccc=202, 230
454 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
455 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),
456 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),
457 0xac01,
458 0xe7, // Character with tccc!=0 decomposed together with mis-ordered sequence.
459 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165),
460 0xe1, // Character with tccc!=0 decomposed together with decomposed sequence.
461 0xf73, 0xf75, // Tibetan composite vowels must be decomposed.
462 0x4e00, 0xf81,
463 0
464 };
465 // Expected code points.
466 static const UChar32 cp[] = {
467 0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
468 0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
469 0x1D15F, 0x1D16D,
470 0xac01,
471 0x63, 0x327, 0x1D165, 0x1D16D,
472 0x61,
473 0xf71, 0xf71, 0xf72, 0xf74, 0x301,
474 0x4e00, 0xf71, 0xf80
475 };
476
477 FCDUTF16CollationIterator u16ci(data, FALSE, s, s, NULL);
478 if(errorCode.logIfFailureAndReset("FCDUTF16CollationIterator constructor")) {
479 return;
480 }
b331163b 481 CodePointIterator cpi(cp, UPRV_LENGTHOF(cp));
57a6839d
A
482 checkFCD("FCDUTF16CollationIterator", u16ci, cpi);
483
57a6839d
A
484 cpi.resetToStart();
485 std::string utf8;
486 UnicodeString(s).toUTF8String(utf8);
487 FCDUTF8CollationIterator u8ci(data, FALSE,
488 reinterpret_cast<const uint8_t *>(utf8.c_str()), 0, -1);
489 if(errorCode.logIfFailureAndReset("FCDUTF8CollationIterator constructor")) {
490 return;
491 }
492 checkFCD("FCDUTF8CollationIterator", u8ci, cpi);
57a6839d
A
493
494 cpi.resetToStart();
495 UCharIterator iter;
b331163b 496 uiter_setString(&iter, s, UPRV_LENGTHOF(s) - 1); // -1: without the terminating NUL
57a6839d
A
497 FCDUIterCollationIterator uici(data, FALSE, iter, 0);
498 if(errorCode.logIfFailureAndReset("FCDUIterCollationIterator constructor")) {
499 return;
500 }
501 checkFCD("FCDUIterCollationIterator", uici, cpi);
502}
503
504void CollationTest::checkAllocWeights(CollationWeights &cw,
505 uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
506 int32_t someLength, int32_t minCount) {
507 if(!cw.allocWeights(lowerLimit, upperLimit, n)) {
508 errln("CollationWeights::allocWeights(%lx, %lx, %ld) = FALSE",
509 (long)lowerLimit, (long)upperLimit, (long)n);
510 return;
511 }
512 uint32_t previous = lowerLimit;
513 int32_t count = 0; // number of weights that have someLength
514 for(int32_t i = 0; i < n; ++i) {
515 uint32_t w = cw.nextWeight();
516 if(w == 0xffffffff) {
517 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
518 "returns only %ld weights",
519 (long)lowerLimit, (long)upperLimit, (long)n, (long)i);
520 return;
521 }
522 if(!(previous < w && w < upperLimit)) {
523 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
524 "number %ld -> %lx not between %lx and %lx",
525 (long)lowerLimit, (long)upperLimit, (long)n,
526 (long)(i + 1), (long)w, (long)previous, (long)upperLimit);
527 return;
528 }
529 if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; }
530 }
531 if(count < minCount) {
532 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
533 "returns only %ld < %ld weights of length %d",
534 (long)lowerLimit, (long)upperLimit, (long)n,
535 (long)count, (long)minCount, (int)someLength);
536 }
537}
538
539void CollationTest::TestCollationWeights() {
540 CollationWeights cw;
541
542 // Non-compressible primaries use 254 second bytes 02..FF.
543 logln("CollationWeights.initForPrimary(non-compressible)");
544 cw.initForPrimary(FALSE);
545 // Expect 1 weight 11 and 254 weights 12xx.
546 checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1);
547 checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254);
548 // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
549 checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255);
550 // Expect 254 two-byte weights from the ranges 10ff and 11xx.
551 checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254);
552 // Expect 254^2=64516 three-byte weights.
553 // During computation, there should be 3 three-byte ranges
554 // 10ffff, 11xxxx, 120202.
555 // The middle one should be split 64515:1,
556 // and the newly-split-off range and the last ranged lengthened.
557 checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516);
558 // Expect weights 1102 & 1103.
559 checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2);
560 // Expect weights 102102 & 102103.
561 checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
562
563 // Compressible primaries use 251 second bytes 04..FE.
564 logln("CollationWeights.initForPrimary(compressible)");
565 cw.initForPrimary(TRUE);
566 // Expect 1 weight 11 and 251 weights 12xx.
567 checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1);
568 checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251);
569 // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
570 checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252);
571 // Expect weights 1104 & 1105.
572 checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2);
573 // Expect weights 102102 & 102103.
574 checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
575
576 // Secondary and tertiary weights use only bytes 3 & 4.
577 logln("CollationWeights.initForSecondary()");
578 cw.initForSecondary();
579 // Expect weights fbxx and all four fc..ff.
580 checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4);
581
582 logln("CollationWeights.initForTertiary()");
583 cw.initForTertiary();
584 // Expect weights 3dxx and both 3e & 3f.
585 checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2);
586}
587
588namespace {
589
590UBool isValidCE(const CollationRootElements &re, const CollationData &data,
591 uint32_t p, uint32_t s, uint32_t ctq) {
592 uint32_t p1 = p >> 24;
593 uint32_t p2 = (p >> 16) & 0xff;
594 uint32_t p3 = (p >> 8) & 0xff;
595 uint32_t p4 = p & 0xff;
596 uint32_t s1 = s >> 8;
597 uint32_t s2 = s & 0xff;
598 // ctq = Case, Tertiary, Quaternary
599 uint32_t c = (ctq & Collation::CASE_MASK) >> 14;
600 uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK;
601 uint32_t t1 = t >> 8;
602 uint32_t t2 = t & 0xff;
603 uint32_t q = ctq & Collation::QUATERNARY_MASK;
604 // No leading zero bytes.
605 if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
606 return FALSE;
607 }
608 // No intermediate zero bytes.
609 if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
610 return FALSE;
611 }
612 if(p2 != 0 && p3 == 0 && p4 != 0) {
613 return FALSE;
614 }
615 // Minimum & maximum lead bytes.
616 if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
b331163b
A
617 s1 == Collation::LEVEL_SEPARATOR_BYTE ||
618 t1 == Collation::LEVEL_SEPARATOR_BYTE || t1 > 0x3f) {
57a6839d
A
619 return FALSE;
620 }
621 if(c > 2) {
622 return FALSE;
623 }
624 // The valid byte range for the second primary byte depends on compressibility.
625 if(p2 != 0) {
626 if(data.isCompressibleLeadByte(p1)) {
627 if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE ||
628 Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
629 return FALSE;
630 }
631 } else {
632 if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) {
633 return FALSE;
634 }
635 }
636 }
637 // Other bytes just need to avoid the level separator.
638 // Trailing zeros are ok.
639 U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1);
640 if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR_BYTE ||
641 s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPARATOR_BYTE) {
642 return FALSE;
643 }
644 // Well-formed CEs.
645 if(p == 0) {
646 if(s == 0) {
647 if(t == 0) {
648 // Completely ignorable CE.
649 // Quaternary CEs are not supported.
650 if(c != 0 || q != 0) {
651 return FALSE;
652 }
653 } else {
654 // Tertiary CE.
655 if(t < re.getTertiaryBoundary() || c != 2) {
656 return FALSE;
657 }
658 }
659 } else {
660 // Secondary CE.
661 if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
662 return FALSE;
663 }
664 }
665 } else {
666 // Primary CE.
667 if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) ||
668 s >= re.getSecondaryBoundary()) {
669 return FALSE;
670 }
671 if(t == 0 || t >= re.getTertiaryBoundary()) {
672 return FALSE;
673 }
674 }
675 return TRUE;
676}
677
678UBool isValidCE(const CollationRootElements &re, const CollationData &data, int64_t ce) {
679 uint32_t p = (uint32_t)(ce >> 32);
680 uint32_t secTer = (uint32_t)ce;
681 return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff);
682}
683
684class RootElementsIterator {
685public:
686 RootElementsIterator(const CollationData &root)
687 : data(root),
688 elements(root.rootElements), length(root.rootElementsLength),
689 pri(0), secTer(0),
690 index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX]) {}
691
692 UBool next() {
693 if(index >= length) { return FALSE; }
694 uint32_t p = elements[index];
695 if(p == CollationRootElements::PRIMARY_SENTINEL) { return FALSE; }
696 if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
697 ++index;
698 secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG;
699 return TRUE;
700 }
701 if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) {
702 // End of a range, enumerate the primaries in the range.
703 int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK;
704 p &= 0xffffff00;
705 if(pri == p) {
706 // Finished the range, return the next CE after it.
707 ++index;
708 return next();
709 }
710 U_ASSERT(pri < p);
711 // Return the next primary in this range.
712 UBool isCompressible = data.isCompressiblePrimary(pri);
713 if((pri & 0xffff) == 0) {
714 pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step);
715 } else {
716 pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible, step);
717 }
718 return TRUE;
719 }
720 // Simple primary CE.
721 ++index;
722 pri = p;
b331163b
A
723 // Does this have an explicit below-common sec/ter unit,
724 // or does it imply a common one?
725 if(index == length) {
726 secTer = Collation::COMMON_SEC_AND_TER_CE;
727 } else {
728 secTer = elements[index];
729 if((secTer & CollationRootElements::SEC_TER_DELTA_FLAG) == 0) {
730 // No sec/ter delta.
731 secTer = Collation::COMMON_SEC_AND_TER_CE;
732 } else {
733 secTer &= ~CollationRootElements::SEC_TER_DELTA_FLAG;
734 if(secTer > Collation::COMMON_SEC_AND_TER_CE) {
735 // Implied sec/ter.
736 secTer = Collation::COMMON_SEC_AND_TER_CE;
737 } else {
738 // Explicit sec/ter below common/common.
739 ++index;
740 }
741 }
742 }
57a6839d
A
743 return TRUE;
744 }
745
746 uint32_t getPrimary() const { return pri; }
747 uint32_t getSecTer() const { return secTer; }
748
749private:
750 const CollationData &data;
751 const uint32_t *elements;
752 int32_t length;
753
754 uint32_t pri;
755 uint32_t secTer;
756 int32_t index;
757};
758
759} // namespace
760
761void CollationTest::TestRootElements() {
762 IcuTestErrorCode errorCode(*this, "TestRootElements");
763 const CollationData *root = CollationRoot::getData(errorCode);
764 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
765 return;
766 }
767 CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
768 RootElementsIterator iter(*root);
769
770 // We check each root CE for validity,
771 // and we also verify that there is a tailoring gap between each two CEs.
772 CollationWeights cw1c; // compressible primary weights
773 CollationWeights cw1u; // uncompressible primary weights
774 CollationWeights cw2;
775 CollationWeights cw3;
776
777 cw1c.initForPrimary(TRUE);
778 cw1u.initForPrimary(FALSE);
779 cw2.initForSecondary();
780 cw3.initForTertiary();
781
782 // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
783 // nor the special merge-separator CE for U+FFFE.
784 uint32_t prevPri = 0;
785 uint32_t prevSec = 0;
786 uint32_t prevTer = 0;
787 while(iter.next()) {
788 uint32_t pri = iter.getPrimary();
789 uint32_t secTer = iter.getSecTer();
790 // CollationRootElements CEs must have 0 case and quaternary bits.
791 if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) {
792 errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx",
793 (long)pri, (long)secTer);
794 }
795 uint32_t sec = secTer >> 16;
796 uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK;
797 uint32_t ctq = ter;
798 if(pri == 0 && sec == 0 && ter != 0) {
799 // Tertiary CEs must have uppercase bits,
800 // but they are not stored in the CollationRootElements.
801 ctq |= 0x8000;
802 }
803 if(!isValidCE(rootElements, *root, pri, sec, ctq)) {
804 errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer);
805 } else {
806 if(pri != prevPri) {
807 uint32_t newWeight = 0;
808 if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) {
809 // There is currently no tailoring gap after primary ignorables,
810 // and we forbid tailoring after U+FFFD and U+FFFF.
811 } else if(root->isCompressiblePrimary(prevPri)) {
812 if(!cw1c.allocWeights(prevPri, pri, 1)) {
813 errln("no primary/compressible tailoring gap between %08lx and %08lx",
814 (long)prevPri, (long)pri);
815 } else {
816 newWeight = cw1c.nextWeight();
817 }
818 } else {
819 if(!cw1u.allocWeights(prevPri, pri, 1)) {
820 errln("no primary/uncompressible tailoring gap between %08lx and %08lx",
821 (long)prevPri, (long)pri);
822 } else {
823 newWeight = cw1u.nextWeight();
824 }
825 }
826 if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
827 errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx",
828 (long)prevPri, (long)newWeight, (long)pri);
829 }
830 } else if(sec != prevSec) {
831 uint32_t lowerLimit =
832 prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec;
833 if(!cw2.allocWeights(lowerLimit, sec, 1)) {
834 errln("no secondary tailoring gap between %04x and %04x", lowerLimit, sec);
835 } else {
836 uint32_t newWeight = cw2.nextWeight();
837 if(!(prevSec < newWeight && newWeight < sec)) {
838 errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
839 (long)lowerLimit, (long)newWeight, (long)sec);
840 }
841 }
842 } else if(ter != prevTer) {
843 uint32_t lowerLimit =
844 prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer;
845 if(!cw3.allocWeights(lowerLimit, ter, 1)) {
846 errln("no teriary tailoring gap between %04x and %04x", lowerLimit, ter);
847 } else {
848 uint32_t newWeight = cw3.nextWeight();
849 if(!(prevTer < newWeight && newWeight < ter)) {
850 errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
851 (long)lowerLimit, (long)newWeight, (long)ter);
852 }
853 }
854 } else {
855 errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer);
856 }
857 }
858 prevPri = pri;
859 prevSec = sec;
860 prevTer = ter;
861 }
862}
863
864void CollationTest::TestTailoredElements() {
865 IcuTestErrorCode errorCode(*this, "TestTailoredElements");
866 const CollationData *root = CollationRoot::getData(errorCode);
867 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
868 return;
869 }
870 CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
871
872 UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NULL, errorCode);
873 if(errorCode.logIfFailureAndReset("failed to create a hash table")) {
874 return;
875 }
876 uhash_setKeyDeleter(prevLocales, uprv_free);
877 // TestRootElements() tests the root collator which does not have tailorings.
878 uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode);
879 uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode);
880 uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode);
881
882 UVector64 ces(errorCode);
883 LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales());
884 U_ASSERT(locales.isValid());
885 const char *localeID = "root";
886 do {
887 Locale locale(localeID);
888 LocalPointer<StringEnumeration> types(
889 Collator::getKeywordValuesForLocale("collation", locale, FALSE, errorCode));
890 errorCode.assertSuccess();
b331163b
A
891 const char *type; // first: default type
892 while((type = types->next(NULL, errorCode)) != NULL) {
893 if(strncmp(type, "private-", 8) == 0) {
894 errln("Collator::getKeywordValuesForLocale(%s) returns private collation keyword: %s",
895 localeID, type);
57a6839d 896 }
b331163b
A
897 Locale localeWithType(locale);
898 localeWithType.setKeywordValue("collation", type, errorCode);
57a6839d
A
899 errorCode.assertSuccess();
900 LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode));
901 if(errorCode.logIfFailureAndReset("Collator::createInstance(%s)",
902 localeWithType.getName())) {
903 continue;
904 }
905 Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode);
906 if(uhash_geti(prevLocales, actual.getName()) != 0) {
907 continue;
908 }
909 uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode);
910 errorCode.assertSuccess();
911 logln("TestTailoredElements(): requested %s -> actual %s",
912 localeWithType.getName(), actual.getName());
913 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getAlias());
914 if(rbc == NULL) {
915 continue;
916 }
917 // Note: It would be better to get tailored strings such that we can
918 // identify the prefix, and only get the CEs for the prefix+string,
919 // not also for the prefix.
920 // There is currently no API for that.
921 // It would help in an unusual case where a contraction starting in the prefix
922 // extends past its end, and we do not see the intended mapping.
923 // For example, for a mapping p|st, if there is also a contraction ps,
924 // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
925 LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode));
926 errorCode.assertSuccess();
927 UnicodeSetIterator iter(*tailored);
928 while(iter.next()) {
929 const UnicodeString &s = iter.getString();
930 ces.removeAllElements();
931 rbc->internalGetCEs(s, ces, errorCode);
932 errorCode.assertSuccess();
933 for(int32_t i = 0; i < ces.size(); ++i) {
934 int64_t ce = ces.elementAti(i);
935 if(!isValidCE(rootElements, *root, ce)) {
936 errln("invalid tailored CE %016llx at CE index %d from string:",
937 (long long)ce, (int)i);
938 infoln(prettify(s));
939 }
940 }
941 }
b331163b 942 }
57a6839d
A
943 } while((localeID = locales->next(NULL, errorCode)) != NULL);
944 uhash_close(prevLocales);
945}
946
947UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) {
948 UnicodeString s;
949 for(int32_t i = 0; i < length; ++i) {
950 if(i > 0) { s.append((UChar)0x20); }
951 uint8_t b = p[i];
952 if(b == 0) {
953 s.append((UChar)0x2e); // period
954 } else if(b == 1) {
955 s.append((UChar)0x7c); // vertical bar
956 } else {
957 appendHex(b, 2, s);
958 }
959 }
960 return s;
961}
962
963UnicodeString CollationTest::printCollationKey(const CollationKey &key) {
964 int32_t length;
965 const uint8_t *p = key.getByteArray(length);
966 return printSortKey(p, length);
967}
968
b331163b
A
969UBool CollationTest::readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode) {
970 for(;;) {
971 int32_t lineLength;
972 const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
973 if(line == NULL || errorCode.isFailure()) {
974 fileLine.remove();
975 return FALSE;
976 }
977 ++fileLineNumber;
978 // Strip trailing CR/LF, comments, and spaces.
979 const UChar *comment = u_memchr(line, 0x23, lineLength); // '#'
980 if(comment != NULL) {
981 lineLength = (int32_t)(comment - line);
982 } else {
983 while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; }
984 }
985 while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; }
986 if(lineLength != 0) {
987 fileLine.setTo(FALSE, line, lineLength);
988 return TRUE;
989 }
990 // Empty line, continue.
57a6839d 991 }
57a6839d
A
992}
993
994void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s,
995 UErrorCode &errorCode) {
996 int32_t length = fileLine.length();
997 int32_t i;
998 for(i = start; i < length && !isSpace(fileLine[i]); ++i) {}
999 int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start); // '|'
1000 if(pipeIndex >= 0) {
1001 prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape();
1002 if(prefix.isEmpty()) {
1003 errln("empty prefix on line %d", (int)fileLineNumber);
1004 infoln(fileLine);
1005 errorCode = U_PARSE_ERROR;
1006 return;
1007 }
1008 start = pipeIndex + 1;
1009 } else {
1010 prefix.remove();
1011 }
1012 s = fileLine.tempSubStringBetween(start, i).unescape();
1013 if(s.isEmpty()) {
1014 errln("empty string on line %d", (int)fileLineNumber);
1015 infoln(fileLine);
1016 errorCode = U_PARSE_ERROR;
1017 return;
1018 }
1019 start = i;
1020}
1021
1022Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode) {
1023 Collation::Level relation;
1024 int32_t start;
1025 if(fileLine[0] == 0x3c) { // <
1026 UChar second = fileLine[1];
1027 start = 2;
1028 switch(second) {
1029 case 0x31: // <1
1030 relation = Collation::PRIMARY_LEVEL;
1031 break;
1032 case 0x32: // <2
1033 relation = Collation::SECONDARY_LEVEL;
1034 break;
1035 case 0x33: // <3
1036 relation = Collation::TERTIARY_LEVEL;
1037 break;
1038 case 0x34: // <4
1039 relation = Collation::QUATERNARY_LEVEL;
1040 break;
1041 case 0x63: // <c
1042 relation = Collation::CASE_LEVEL;
1043 break;
1044 case 0x69: // <i
1045 relation = Collation::IDENTICAL_LEVEL;
1046 break;
1047 default: // just <
1048 relation = Collation::NO_LEVEL;
1049 start = 1;
1050 break;
1051 }
1052 } else if(fileLine[0] == 0x3d) { // =
1053 relation = Collation::ZERO_LEVEL;
1054 start = 1;
1055 } else {
1056 start = 0;
1057 }
1058 if(start == 0 || !isSpace(fileLine[start])) {
1059 errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber);
1060 infoln(fileLine);
1061 errorCode.set(U_PARSE_ERROR);
1062 return Collation::NO_LEVEL;
1063 }
1064 start = skipSpaces(start);
1065 UnicodeString prefix;
1066 parseString(start, prefix, s, errorCode);
1067 if(errorCode.isSuccess() && !prefix.isEmpty()) {
1068 errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber);
1069 infoln(fileLine);
1070 errorCode.set(U_PARSE_ERROR);
1071 return Collation::NO_LEVEL;
1072 }
1073 if(start < fileLine.length()) {
1074 errln("unexpected line contents after test string on line %d", (int)fileLineNumber);
1075 infoln(fileLine);
1076 errorCode.set(U_PARSE_ERROR);
1077 return Collation::NO_LEVEL;
1078 }
1079 return relation;
1080}
1081
1082static const struct {
1083 const char *name;
1084 UColAttribute attr;
1085} attributes[] = {
1086 { "backwards", UCOL_FRENCH_COLLATION },
1087 { "alternate", UCOL_ALTERNATE_HANDLING },
1088 { "caseFirst", UCOL_CASE_FIRST },
1089 { "caseLevel", UCOL_CASE_LEVEL },
1090 // UCOL_NORMALIZATION_MODE is turned on and off automatically.
1091 { "strength", UCOL_STRENGTH },
1092 // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated.
1093 { "numeric", UCOL_NUMERIC_COLLATION }
1094};
1095
1096static const struct {
1097 const char *name;
1098 UColAttributeValue value;
1099} attributeValues[] = {
1100 { "default", UCOL_DEFAULT },
1101 { "primary", UCOL_PRIMARY },
1102 { "secondary", UCOL_SECONDARY },
1103 { "tertiary", UCOL_TERTIARY },
1104 { "quaternary", UCOL_QUATERNARY },
1105 { "identical", UCOL_IDENTICAL },
1106 { "off", UCOL_OFF },
1107 { "on", UCOL_ON },
1108 { "shifted", UCOL_SHIFTED },
1109 { "non-ignorable", UCOL_NON_IGNORABLE },
1110 { "lower", UCOL_LOWER_FIRST },
1111 { "upper", UCOL_UPPER_FIRST }
1112};
1113
1114void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) {
b331163b
A
1115 // Parse attributes even if the Collator could not be created,
1116 // in order to report syntax errors.
57a6839d 1117 int32_t start = skipSpaces(1);
f3c0d7a5 1118 int32_t equalPos = fileLine.indexOf((UChar)0x3d);
57a6839d
A
1119 if(equalPos < 0) {
1120 if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) {
1121 parseAndSetReorderCodes(start + 7, errorCode);
1122 return;
1123 }
1124 errln("missing '=' on line %d", (int)fileLineNumber);
1125 infoln(fileLine);
1126 errorCode.set(U_PARSE_ERROR);
1127 return;
1128 }
1129
1130 UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos);
1131 UnicodeString valueString = fileLine.tempSubString(equalPos+1);
1132 if(attrString == UNICODE_STRING("maxVariable", 11)) {
1133 UColReorderCode max;
1134 if(valueString == UNICODE_STRING("space", 5)) {
1135 max = UCOL_REORDER_CODE_SPACE;
1136 } else if(valueString == UNICODE_STRING("punct", 5)) {
1137 max = UCOL_REORDER_CODE_PUNCTUATION;
1138 } else if(valueString == UNICODE_STRING("symbol", 6)) {
1139 max = UCOL_REORDER_CODE_SYMBOL;
1140 } else if(valueString == UNICODE_STRING("currency", 8)) {
1141 max = UCOL_REORDER_CODE_CURRENCY;
1142 } else {
1143 errln("invalid attribute value name on line %d", (int)fileLineNumber);
1144 infoln(fileLine);
1145 errorCode.set(U_PARSE_ERROR);
1146 return;
1147 }
b331163b
A
1148 if(coll != NULL) {
1149 coll->setMaxVariable(max, errorCode);
1150 if(errorCode.isFailure()) {
1151 errln("setMaxVariable() failed on line %d: %s",
1152 (int)fileLineNumber, errorCode.errorName());
1153 infoln(fileLine);
1154 return;
1155 }
57a6839d
A
1156 }
1157 fileLine.remove();
1158 return;
1159 }
1160
1161 UColAttribute attr;
1162 for(int32_t i = 0;; ++i) {
b331163b 1163 if(i == UPRV_LENGTHOF(attributes)) {
57a6839d
A
1164 errln("invalid attribute name on line %d", (int)fileLineNumber);
1165 infoln(fileLine);
1166 errorCode.set(U_PARSE_ERROR);
1167 return;
1168 }
1169 if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) {
1170 attr = attributes[i].attr;
1171 break;
1172 }
1173 }
1174
1175 UColAttributeValue value;
1176 for(int32_t i = 0;; ++i) {
b331163b 1177 if(i == UPRV_LENGTHOF(attributeValues)) {
57a6839d
A
1178 errln("invalid attribute value name on line %d", (int)fileLineNumber);
1179 infoln(fileLine);
1180 errorCode.set(U_PARSE_ERROR);
1181 return;
1182 }
1183 if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) {
1184 value = attributeValues[i].value;
1185 break;
1186 }
1187 }
1188
b331163b
A
1189 if(coll != NULL) {
1190 coll->setAttribute(attr, value, errorCode);
1191 if(errorCode.isFailure()) {
1192 errln("illegal attribute=value combination on line %d: %s",
1193 (int)fileLineNumber, errorCode.errorName());
1194 infoln(fileLine);
1195 return;
1196 }
57a6839d
A
1197 }
1198 fileLine.remove();
1199}
1200
1201void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode) {
1202 UVector32 reorderCodes(errorCode);
1203 while(start < fileLine.length()) {
1204 start = skipSpaces(start);
1205 int32_t limit = start;
1206 while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; }
1207 CharString name;
1208 name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), errorCode);
1209 int32_t code = CollationRuleParser::getReorderCode(name.data());
b331163b
A
1210 if(code < 0) {
1211 if(uprv_stricmp(name.data(), "default") == 0) {
1212 code = UCOL_REORDER_CODE_DEFAULT; // -1
1213 } else {
1214 errln("invalid reorder code '%s' on line %d", name.data(), (int)fileLineNumber);
1215 infoln(fileLine);
1216 errorCode.set(U_PARSE_ERROR);
1217 return;
1218 }
57a6839d
A
1219 }
1220 reorderCodes.addElement(code, errorCode);
1221 start = limit;
1222 }
b331163b
A
1223 if(coll != NULL) {
1224 coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
1225 if(errorCode.isFailure()) {
1226 errln("setReorderCodes() failed on line %d: %s",
1227 (int)fileLineNumber, errorCode.errorName());
1228 infoln(fileLine);
1229 return;
1230 }
57a6839d
A
1231 }
1232 fileLine.remove();
1233}
1234
1235void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1236 UnicodeString rules;
b331163b 1237 while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
57a6839d
A
1238 rules.append(fileLine.unescape());
1239 }
1240 if(errorCode.isFailure()) { return; }
1241 logln(rules);
1242
1243 UParseError parseError;
1244 UnicodeString reason;
1245 delete coll;
1246 coll = new RuleBasedCollator(rules, parseError, reason, errorCode);
1247 if(coll == NULL) {
1248 errln("unable to allocate a new collator");
1249 errorCode.set(U_MEMORY_ALLOCATION_ERROR);
1250 return;
1251 }
1252 if(errorCode.isFailure()) {
b331163b 1253 dataerrln("RuleBasedCollator(rules) failed - %s", errorCode.errorName());
57a6839d
A
1254 infoln(UnicodeString(" reason: ") + reason);
1255 if(parseError.offset >= 0) { infoln(" rules offset: %d", (int)parseError.offset); }
1256 if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1257 infoln(UnicodeString(" snippet: ...") +
1258 parseError.preContext + "(!)" + parseError.postContext + "...");
1259 }
b331163b
A
1260 delete coll;
1261 coll = NULL;
1262 errorCode.reset();
57a6839d
A
1263 } else {
1264 assertEquals("no error reason when RuleBasedCollator(rules) succeeds",
1265 UnicodeString(), reason);
1266 }
1267}
1268
1269void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) {
1270 if(errorCode.isFailure()) { return; }
1271 delete coll;
1272 coll = Collator::createInstance(Locale::getRoot(), errorCode);
1273 if(errorCode.isFailure()) {
1274 dataerrln("unable to create a root collator");
1275 return;
1276 }
1277}
1278
1279void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) {
1280 if(errorCode.isFailure()) { return; }
b331163b
A
1281 delete coll;
1282 coll = NULL;
1283 int32_t at = fileLine.indexOf((UChar)0x40, 9); // @ is not invariant
1284 if(at >= 0) {
1285 fileLine.setCharAt(at, (UChar)0x2a); // *
1286 }
1287 CharString localeID;
1288 localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode);
1289 if(at >= 0) {
1290 localeID.data()[at - 9] = '@';
1291 }
1292 Locale locale(localeID.data());
1293 if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) {
57a6839d
A
1294 errln("invalid language tag on line %d", (int)fileLineNumber);
1295 infoln(fileLine);
1296 if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); }
1297 return;
1298 }
1299
1300 logln("creating a collator for locale ID %s", locale.getName());
b331163b 1301 coll = Collator::createInstance(locale, errorCode);
57a6839d
A
1302 if(errorCode.isFailure()) {
1303 dataerrln("unable to create a collator for locale %s on line %d",
1304 locale.getName(), (int)fileLineNumber);
1305 infoln(fileLine);
b331163b
A
1306 delete coll;
1307 coll = NULL;
1308 errorCode.reset();
57a6839d 1309 }
57a6839d
A
1310}
1311
1312UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const {
1313 if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return TRUE; }
1314 // In some sequences with Tibetan composite vowel signs,
1315 // even if the string passes the FCD check,
1316 // those composites must be decomposed.
1317 // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1318 int32_t index = 0;
1319 while((index = s.indexOf((UChar)0xf71, index)) >= 0) {
1320 if(++index < s.length()) {
1321 UChar c = s[index];
1322 if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return TRUE; }
1323 }
1324 }
1325 return FALSE;
1326}
1327
1328UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length,
1329 CharString &dest, int32_t partSize,
1330 IcuTestErrorCode &errorCode) {
1331 if(errorCode.isFailure()) { return FALSE; }
1332 uint8_t part[32];
b331163b 1333 U_ASSERT(partSize <= UPRV_LENGTHOF(part));
57a6839d
A
1334 UCharIterator iter;
1335 uiter_setString(&iter, s, length);
1336 uint32_t state[2] = { 0, 0 };
1337 for(;;) {
1338 int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, partSize, errorCode);
1339 UBool done = partLength < partSize;
1340 if(done) {
1341 // At the end, append the next byte as well which should be 00.
1342 ++partLength;
1343 }
1344 dest.append(reinterpret_cast<char *>(part), partLength, errorCode);
1345 if(done) {
1346 return errorCode.isSuccess();
1347 }
1348 }
1349}
1350
1351UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line,
1352 const UChar *s, int32_t length,
1353 CollationKey &key, IcuTestErrorCode &errorCode) {
1354 if(errorCode.isFailure()) { return FALSE; }
1355 coll->getCollationKey(s, length, key, errorCode);
1356 if(errorCode.isFailure()) {
1357 infoln(fileTestName);
1358 errln("Collator(%s).getCollationKey() failed: %s",
1359 norm, errorCode.errorName());
1360 infoln(line);
1361 return FALSE;
1362 }
1363 int32_t keyLength;
1364 const uint8_t *keyBytes = key.getByteArray(keyLength);
1365 if(keyLength == 0 || keyBytes[keyLength - 1] != 0) {
1366 infoln(fileTestName);
1367 errln("Collator(%s).getCollationKey() wrote an empty or unterminated key",
1368 norm);
1369 infoln(line);
1370 infoln(printCollationKey(key));
1371 return FALSE;
1372 }
1373
1374 int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode);
1375 if(numLevels < UCOL_IDENTICAL) {
1376 ++numLevels;
1377 } else {
1378 numLevels = 5;
1379 }
1380 if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) {
1381 ++numLevels;
1382 }
1383 errorCode.assertSuccess();
1384 int32_t numLevelSeparators = 0;
1385 for(int32_t i = 0; i < (keyLength - 1); ++i) {
1386 uint8_t b = keyBytes[i];
1387 if(b == 0) {
1388 infoln(fileTestName);
1389 errln("Collator(%s).getCollationKey() contains a 00 byte", norm);
1390 infoln(line);
1391 infoln(printCollationKey(key));
1392 return FALSE;
1393 }
1394 if(b == 1) { ++numLevelSeparators; }
1395 }
1396 if(numLevelSeparators != (numLevels - 1)) {
1397 infoln(fileTestName);
1398 errln("Collator(%s).getCollationKey() has %d level separators for %d levels",
1399 norm, (int)numLevelSeparators, (int)numLevels);
1400 infoln(line);
1401 infoln(printCollationKey(key));
1402 return FALSE;
1403 }
1404
b331163b
A
1405 // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
1406 static const int32_t partSizes[] = { 32, 3, 1 };
1407 for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) {
1408 int32_t partSize = partSizes[psi];
1409 CharString parts;
1410 if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
1411 infoln(fileTestName);
1412 errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
1413 norm, (int)partSize, errorCode.errorName());
1414 infoln(line);
1415 return FALSE;
1416 }
1417 if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
1418 infoln(fileTestName);
1419 errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
1420 norm, (int)partSize);
1421 infoln(line);
1422 infoln(printCollationKey(key));
1423 infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
1424 return FALSE;
1425 }
1426 }
1427 return TRUE;
1428}
1429
1430/**
1431 * Changes the key to the merged segments of the U+FFFE-separated substrings of s.
1432 * Leaves key unchanged if s does not contain U+FFFE.
1433 * @return TRUE if the key was successfully changed
1434 */
1435UBool CollationTest::getMergedCollationKey(const UChar *s, int32_t length,
1436 CollationKey &key, IcuTestErrorCode &errorCode) {
1437 if(errorCode.isFailure()) { return FALSE; }
57a6839d
A
1438 LocalMemory<uint8_t> mergedKey;
1439 int32_t mergedKeyLength = 0;
1440 int32_t mergedKeyCapacity = 0;
1441 int32_t sLength = (length >= 0) ? length : u_strlen(s);
1442 int32_t segmentStart = 0;
1443 for(int32_t i = 0;;) {
1444 if(i == sLength) {
1445 if(segmentStart == 0) {
1446 // s does not contain any U+FFFE.
b331163b 1447 return FALSE;
57a6839d
A
1448 }
1449 } else if(s[i] != 0xfffe) {
1450 ++i;
1451 continue;
1452 }
1453 // Get the sort key for another segment and merge it into mergedKey.
1454 CollationKey key1(mergedKey.getAlias(), mergedKeyLength); // copies the bytes
1455 CollationKey key2;
1456 coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCode);
1457 int32_t key1Length, key2Length;
1458 const uint8_t *key1Bytes = key1.getByteArray(key1Length);
1459 const uint8_t *key2Bytes = key2.getByteArray(key2Length);
1460 uint8_t *dest;
1461 int32_t minCapacity = key1Length + key2Length;
1462 if(key1Length > 0) { --minCapacity; }
1463 if(minCapacity <= mergedKeyCapacity) {
1464 dest = mergedKey.getAlias();
1465 } else {
1466 if(minCapacity <= 200) {
1467 mergedKeyCapacity = 200;
1468 } else if(minCapacity <= 2 * mergedKeyCapacity) {
1469 mergedKeyCapacity *= 2;
1470 } else {
1471 mergedKeyCapacity = minCapacity;
1472 }
1473 dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity);
1474 }
1475 U_ASSERT(dest != NULL || mergedKeyCapacity == 0);
1476 if(key1Length == 0) {
1477 // key2 is the sort key for the first segment.
1478 uprv_memcpy(dest, key2Bytes, key2Length);
1479 mergedKeyLength = key2Length;
1480 } else {
1481 mergedKeyLength =
1482 ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length,
1483 dest, mergedKeyCapacity);
1484 }
1485 if(i == sLength) { break; }
1486 segmentStart = ++i;
1487 }
b331163b 1488 key = CollationKey(mergedKey.getAlias(), mergedKeyLength);
57a6839d
A
1489 return TRUE;
1490}
1491
1492namespace {
1493
1494/**
1495 * Replaces unpaired surrogates with U+FFFD.
1496 * Returns s if no replacement was made, otherwise buffer.
1497 */
1498const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buffer) {
1499 int32_t i = 0;
1500 while(i < s.length()) {
1501 UChar32 c = s.char32At(i);
1502 if(U_IS_SURROGATE(c)) {
1503 if(buffer.length() < i) {
1504 buffer.append(s, buffer.length(), i - buffer.length());
1505 }
1506 buffer.append((UChar)0xfffd);
1507 }
1508 i += U16_LENGTH(c);
1509 }
1510 if(buffer.isEmpty()) {
1511 return s;
1512 }
1513 if(buffer.length() < i) {
1514 buffer.append(s, buffer.length(), i - buffer.length());
1515 }
1516 return buffer;
1517}
1518
b331163b
A
1519int32_t getDifferenceLevel(const CollationKey &prevKey, const CollationKey &key,
1520 UCollationResult order, UBool collHasCaseLevel) {
1521 if(order == UCOL_EQUAL) {
1522 return Collation::NO_LEVEL;
1523 }
1524 int32_t prevKeyLength;
1525 const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
1526 int32_t keyLength;
1527 const uint8_t *bytes = key.getByteArray(keyLength);
1528 int32_t level = Collation::PRIMARY_LEVEL;
1529 for(int32_t i = 0;; ++i) {
1530 uint8_t b = prevBytes[i];
1531 if(b != bytes[i]) { break; }
1532 if(b == Collation::LEVEL_SEPARATOR_BYTE) {
1533 ++level;
1534 if(level == Collation::CASE_LEVEL && !collHasCaseLevel) {
1535 ++level;
1536 }
1537 }
1538 }
1539 return level;
1540}
1541
57a6839d
A
1542}
1543
1544UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
1545 const UnicodeString &prevString, const UnicodeString &s,
1546 UCollationResult expectedOrder, Collation::Level expectedLevel,
1547 IcuTestErrorCode &errorCode) {
1548 if(errorCode.isFailure()) { return FALSE; }
1549
1550 // Get the sort keys first, for error debug output.
1551 CollationKey prevKey;
1552 if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.length(),
1553 prevKey, errorCode)) {
1554 return FALSE;
1555 }
1556 CollationKey key;
1557 if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCode)) { return FALSE; }
1558
1559 UCollationResult order = coll->compare(prevString, s, errorCode);
1560 if(order != expectedOrder || errorCode.isFailure()) {
1561 infoln(fileTestName);
1562 errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)",
1563 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1564 infoln(prevFileLine);
1565 infoln(fileLine);
1566 infoln(printCollationKey(prevKey));
1567 infoln(printCollationKey(key));
1568 return FALSE;
1569 }
1570 order = coll->compare(s, prevString, errorCode);
1571 if(order != -expectedOrder || errorCode.isFailure()) {
1572 infoln(fileTestName);
1573 errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)",
1574 (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1575 infoln(prevFileLine);
1576 infoln(fileLine);
1577 infoln(printCollationKey(prevKey));
1578 infoln(printCollationKey(key));
1579 return FALSE;
1580 }
1581 // Test NUL-termination if the strings do not contain NUL characters.
1582 UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0) >= 0;
1583 if(!containNUL) {
1584 order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, errorCode);
1585 if(order != expectedOrder || errorCode.isFailure()) {
1586 infoln(fileTestName);
1587 errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1588 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1589 infoln(prevFileLine);
1590 infoln(fileLine);
1591 infoln(printCollationKey(prevKey));
1592 infoln(printCollationKey(key));
1593 return FALSE;
1594 }
1595 order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, errorCode);
1596 if(order != -expectedOrder || errorCode.isFailure()) {
1597 infoln(fileTestName);
1598 errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1599 (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1600 infoln(prevFileLine);
1601 infoln(fileLine);
1602 infoln(printCollationKey(prevKey));
1603 infoln(printCollationKey(key));
1604 return FALSE;
1605 }
1606 }
1607
57a6839d
A
1608 // compare(UTF-16) treats unpaired surrogates like unassigned code points.
1609 // Unpaired surrogates cannot be converted to UTF-8.
1610 // Create valid UTF-16 strings if necessary, and use those for
1611 // both the expected compare() result and for the input to compare(UTF-8).
1612 UnicodeString prevBuffer, sBuffer;
1613 const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer);
1614 const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer);
1615 std::string prevUTF8, sUTF8;
1616 UnicodeString(prevValid).toUTF8String(prevUTF8);
1617 UnicodeString(sValid).toUTF8String(sUTF8);
1618 UCollationResult expectedUTF8Order;
1619 if(&prevValid == &prevString && &sValid == &s) {
1620 expectedUTF8Order = expectedOrder;
1621 } else {
1622 expectedUTF8Order = coll->compare(prevValid, sValid, errorCode);
1623 }
1624
1625 order = coll->compareUTF8(prevUTF8, sUTF8, errorCode);
1626 if(order != expectedUTF8Order || errorCode.isFailure()) {
1627 infoln(fileTestName);
1628 errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)",
1629 (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1630 infoln(prevFileLine);
1631 infoln(fileLine);
1632 infoln(printCollationKey(prevKey));
1633 infoln(printCollationKey(key));
1634 return FALSE;
1635 }
1636 order = coll->compareUTF8(sUTF8, prevUTF8, errorCode);
1637 if(order != -expectedUTF8Order || errorCode.isFailure()) {
1638 infoln(fileTestName);
1639 errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)",
1640 (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1641 infoln(prevFileLine);
1642 infoln(fileLine);
1643 infoln(printCollationKey(prevKey));
1644 infoln(printCollationKey(key));
1645 return FALSE;
1646 }
1647 // Test NUL-termination if the strings do not contain NUL characters.
1648 if(!containNUL) {
1649 order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -1, errorCode);
1650 if(order != expectedUTF8Order || errorCode.isFailure()) {
1651 infoln(fileTestName);
1652 errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1653 (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1654 infoln(prevFileLine);
1655 infoln(fileLine);
1656 infoln(printCollationKey(prevKey));
1657 infoln(printCollationKey(key));
1658 return FALSE;
1659 }
1660 order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -1, errorCode);
1661 if(order != -expectedUTF8Order || errorCode.isFailure()) {
1662 infoln(fileTestName);
1663 errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1664 (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1665 infoln(prevFileLine);
1666 infoln(fileLine);
1667 infoln(printCollationKey(prevKey));
1668 infoln(printCollationKey(key));
1669 return FALSE;
1670 }
1671 }
57a6839d
A
1672
1673 UCharIterator leftIter;
1674 UCharIterator rightIter;
1675 uiter_setString(&leftIter, prevString.getBuffer(), prevString.length());
1676 uiter_setString(&rightIter, s.getBuffer(), s.length());
1677 order = coll->compare(leftIter, rightIter, errorCode);
1678 if(order != expectedOrder || errorCode.isFailure()) {
1679 infoln(fileTestName);
1680 errln("line %d Collator(%s).compare(UCharIterator: previous, current) "
1681 "wrong order: %d != %d (%s)",
1682 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1683 infoln(prevFileLine);
1684 infoln(fileLine);
1685 infoln(printCollationKey(prevKey));
1686 infoln(printCollationKey(key));
1687 return FALSE;
1688 }
1689
1690 order = prevKey.compareTo(key, errorCode);
1691 if(order != expectedOrder || errorCode.isFailure()) {
1692 infoln(fileTestName);
1693 errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)",
1694 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1695 infoln(prevFileLine);
1696 infoln(fileLine);
1697 infoln(printCollationKey(prevKey));
1698 infoln(printCollationKey(key));
1699 return FALSE;
1700 }
b331163b
A
1701 UBool collHasCaseLevel = coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON;
1702 int32_t level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
57a6839d 1703 if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
57a6839d
A
1704 if(level != expectedLevel) {
1705 infoln(fileTestName);
1706 errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
1707 (int)fileLineNumber, norm, order, level, expectedLevel);
1708 infoln(prevFileLine);
1709 infoln(fileLine);
1710 infoln(printCollationKey(prevKey));
1711 infoln(printCollationKey(key));
1712 return FALSE;
1713 }
1714 }
b331163b
A
1715
1716 // If either string contains U+FFFE, then their sort keys must compare the same as
1717 // the merged sort keys of each string's between-FFFE segments.
1718 //
1719 // It is not required that
1720 // sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2))
1721 // only that those two methods yield the same order.
1722 //
1723 // Use bit-wise OR so that getMergedCollationKey() is always called for both strings.
1724 if((getMergedCollationKey(prevString.getBuffer(), prevString.length(), prevKey, errorCode) |
1725 getMergedCollationKey(s.getBuffer(), s.length(), key, errorCode)) ||
1726 errorCode.isFailure()) {
1727 order = prevKey.compareTo(key, errorCode);
1728 if(order != expectedOrder || errorCode.isFailure()) {
1729 infoln(fileTestName);
1730 errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1731 "(previous, current segments between U+FFFE)).compareTo() wrong order: %d != %d (%s)",
1732 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1733 infoln(prevFileLine);
1734 infoln(fileLine);
1735 infoln(printCollationKey(prevKey));
1736 infoln(printCollationKey(key));
1737 return FALSE;
1738 }
1739 int32_t mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1740 if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1741 if(mergedLevel != level) {
1742 infoln(fileTestName);
1743 errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1744 "(previous, current segments between U+FFFE)).compareTo()=%d wrong level: %d != %d",
1745 (int)fileLineNumber, norm, order, mergedLevel, level);
1746 infoln(prevFileLine);
1747 infoln(fileLine);
1748 infoln(printCollationKey(prevKey));
1749 infoln(printCollationKey(key));
1750 return FALSE;
1751 }
1752 }
1753 }
57a6839d
A
1754 return TRUE;
1755}
1756
1757void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1758 if(errorCode.isFailure()) { return; }
1759 UnicodeString prevFileLine = UNICODE_STRING("(none)", 6);
1760 UnicodeString prevString, s;
1761 prevString.getTerminatedBuffer(); // Ensure NUL-termination.
b331163b
A
1762 while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1763 // Parse the line even if it will be ignored (when we do not have a Collator)
1764 // in order to report syntax issues.
57a6839d
A
1765 Collation::Level relation = parseRelationAndString(s, errorCode);
1766 if(errorCode.isFailure()) {
1767 errorCode.reset();
1768 break;
1769 }
b331163b
A
1770 if(coll == NULL) {
1771 // We were unable to create the Collator but continue with tests.
1772 // Ignore test data for this Collator.
1773 // The next Collator creation might work.
1774 continue;
1775 }
57a6839d
A
1776 UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? UCOL_EQUAL : UCOL_LESS;
1777 Collation::Level expectedLevel = relation;
1778 s.getTerminatedBuffer(); // Ensure NUL-termination.
1779 UBool isOk = TRUE;
1780 if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) {
1781 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode);
1782 isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
1783 expectedOrder, expectedLevel, errorCode);
1784 }
1785 if(isOk) {
1786 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode);
1787 isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
1788 expectedOrder, expectedLevel, errorCode);
1789 }
1790 if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormalized(s, errorCode))) {
1791 UnicodeString pn = nfd->normalize(prevString, errorCode);
1792 UnicodeString n = nfd->normalize(s, errorCode);
1793 pn.getTerminatedBuffer();
1794 n.getTerminatedBuffer();
1795 errorCode.assertSuccess();
1796 isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
1797 expectedOrder, expectedLevel, errorCode);
1798 }
1799 if(!isOk) {
1800 errorCode.reset(); // already reported
1801 }
1802 prevFileLine = fileLine;
1803 prevString = s;
1804 prevString.getTerminatedBuffer(); // Ensure NUL-termination.
1805 }
1806}
1807
1808void CollationTest::TestDataDriven() {
1809 IcuTestErrorCode errorCode(*this, "TestDataDriven");
1810
1811 fcd = Normalizer2Factory::getFCDInstance(errorCode);
b331163b 1812 nfd = Normalizer2::getNFDInstance(errorCode);
57a6839d
A
1813 if(errorCode.logDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) {
1814 return;
1815 }
1816
1817 CharString path(getSourceTestData(errorCode), errorCode);
1818 path.appendPathPart("collationtest.txt", errorCode);
1819 const char *codePage = "UTF-8";
1820 LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, errorCode));
1821 if(errorCode.logIfFailureAndReset("ucbuf_open(collationtest.txt)")) {
1822 return;
1823 }
b331163b
A
1824 // Read a new line if necessary.
1825 // Sub-parsers leave the first line set that they do not handle.
1826 while(errorCode.isSuccess() && (!fileLine.isEmpty() || readNonEmptyLine(f.getAlias(), errorCode))) {
57a6839d
A
1827 if(!isSectionStarter(fileLine[0])) {
1828 errln("syntax error on line %d", (int)fileLineNumber);
1829 infoln(fileLine);
1830 return;
1831 }
1832 if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) {
1833 fileTestName = fileLine;
1834 logln(fileLine);
1835 fileLine.remove();
1836 } else if(fileLine == UNICODE_STRING("@ root", 6)) {
1837 setRootCollator(errorCode);
1838 fileLine.remove();
1839 } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) {
1840 setLocaleCollator(errorCode);
1841 fileLine.remove();
1842 } else if(fileLine == UNICODE_STRING("@ rules", 7)) {
1843 buildTailoring(f.getAlias(), errorCode);
1844 } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) { // %
1845 parseAndSetAttribute(errorCode);
1846 } else if(fileLine == UNICODE_STRING("* compare", 9)) {
1847 checkCompareStrings(f.getAlias(), errorCode);
1848 } else {
1849 errln("syntax error on line %d", (int)fileLineNumber);
1850 infoln(fileLine);
1851 return;
1852 }
1853 }
1854}
1855
1856#endif // !UCONFIG_NO_COLLATION