]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/collationtest.cpp
ICU-551.51.4.tar.gz
[apple/icu.git] / icuSources / test / intltest / collationtest.cpp
1 /*
2 *******************************************************************************
3 * Copyright (C) 2012-2015, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * collationtest.cpp
7 *
8 * created on: 2012apr27
9 * created by: Markus W. Scherer
10 */
11
12 #include "unicode/utypes.h"
13
14 #if !UCONFIG_NO_COLLATION
15
16 #include "unicode/coll.h"
17 #include "unicode/errorcode.h"
18 #include "unicode/localpointer.h"
19 #include "unicode/normalizer2.h"
20 #include "unicode/sortkey.h"
21 #include "unicode/std_string.h"
22 #include "unicode/strenum.h"
23 #include "unicode/tblcoll.h"
24 #include "unicode/uiter.h"
25 #include "unicode/uniset.h"
26 #include "unicode/unistr.h"
27 #include "unicode/usetiter.h"
28 #include "unicode/ustring.h"
29 #include "charstr.h"
30 #include "cmemory.h"
31 #include "collation.h"
32 #include "collationdata.h"
33 #include "collationfcd.h"
34 #include "collationiterator.h"
35 #include "collationroot.h"
36 #include "collationrootelements.h"
37 #include "collationruleparser.h"
38 #include "collationweights.h"
39 #include "cstring.h"
40 #include "intltest.h"
41 #include "normalizer2impl.h"
42 #include "ucbuf.h"
43 #include "uhash.h"
44 #include "uitercollationiterator.h"
45 #include "utf16collationiterator.h"
46 #include "utf8collationiterator.h"
47 #include "uvectr32.h"
48 #include "uvectr64.h"
49 #include "writesrc.h"
50
51 // TODO: Move to ucbuf.h
// Declares LocalUCHARBUFPointer through ICU's U_DEFINE_LOCAL_OPEN_POINTER macro, pairing
// the UCHARBUF handle type with ucbuf_close.
// NOTE(review): presumably ucbuf_close is invoked when the wrapper goes out of scope --
// confirm against the macro definition in unicode/localpointer.h.
52 U_DEFINE_LOCAL_OPEN_POINTER(LocalUCHARBUFPointer, UCHARBUF, ucbuf_close);
53
54 class CodePointIterator;
55
56 // TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey)
57
58 class CollationTest : public IntlTest {
59 public:
60 CollationTest()
61 : fcd(NULL), nfd(NULL),
62 fileLineNumber(0),
63 coll(NULL) {}
64
65 ~CollationTest() {
66 delete coll;
67 }
68
69 void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
70
71 void TestMinMax();
72 void TestImplicits();
73 void TestNulTerminated();
74 void TestIllegalUTF8();
75 void TestShortFCDData();
76 void TestFCD();
77 void TestCollationWeights();
78 void TestRootElements();
79 void TestTailoredElements();
80 void TestDataDriven();
81
82 private:
83 void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
84 void checkAllocWeights(CollationWeights &cw,
85 uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
86 int32_t someLength, int32_t minCount);
87
88 static UnicodeString printSortKey(const uint8_t *p, int32_t length);
89 static UnicodeString printCollationKey(const CollationKey &key);
90
91 // Helpers & fields for data-driven test.
92 static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; }
93 static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; }
94 static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c == 0x40; } // %*@
95 int32_t skipSpaces(int32_t i) {
96 while(isSpace(fileLine[i])) { ++i; }
97 return i;
98 }
99
100 UBool readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode);
101 void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode);
102 Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode);
103 void parseAndSetAttribute(IcuTestErrorCode &errorCode);
104 void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode);
105 void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode);
106 void setRootCollator(IcuTestErrorCode &errorCode);
107 void setLocaleCollator(IcuTestErrorCode &errorCode);
108
109 UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const;
110
111 UBool getSortKeyParts(const UChar *s, int32_t length,
112 CharString &dest, int32_t partSize,
113 IcuTestErrorCode &errorCode);
114 UBool getCollationKey(const char *norm, const UnicodeString &line,
115 const UChar *s, int32_t length,
116 CollationKey &key, IcuTestErrorCode &errorCode);
117 UBool getMergedCollationKey(const UChar *s, int32_t length,
118 CollationKey &key, IcuTestErrorCode &errorCode);
119 UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
120 const UnicodeString &prevString, const UnicodeString &s,
121 UCollationResult expectedOrder, Collation::Level expectedLevel,
122 IcuTestErrorCode &errorCode);
123 void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode);
124
125 const Normalizer2 *fcd, *nfd;
126 UnicodeString fileLine;
127 int32_t fileLineNumber;
128 UnicodeString fileTestName;
129 Collator *coll;
130 };
131
132 extern IntlTest *createCollationTest() {
133 return new CollationTest();
134 }
135
// Entry point that registers the test cases of this suite, in the order listed below.
// The fourth parameter (par) is unused.
// NOTE(review): the TESTCASE_AUTO* macros are defined outside this file; presumably they
// use `index` to select a case, report its name through `name`, and run it when `exec`
// is true -- confirm against intltest.h.
136 void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
137 if(exec) {
// Log a suite banner only when tests are actually being executed.
138 logln("TestSuite CollationTest: ");
139 }
140 TESTCASE_AUTO_BEGIN;
141 TESTCASE_AUTO(TestMinMax);
142 TESTCASE_AUTO(TestImplicits);
143 TESTCASE_AUTO(TestNulTerminated);
144 TESTCASE_AUTO(TestIllegalUTF8);
145 TESTCASE_AUTO(TestShortFCDData);
146 TESTCASE_AUTO(TestFCD);
147 TESTCASE_AUTO(TestCollationWeights);
148 TESTCASE_AUTO(TestRootElements);
149 TESTCASE_AUTO(TestTailoredElements);
150 TESTCASE_AUTO(TestDataDriven);
151 TESTCASE_AUTO_END;
152 }
153
154 void CollationTest::TestMinMax() {
155 IcuTestErrorCode errorCode(*this, "TestMinMax");
156
157 setRootCollator(errorCode);
158 if(errorCode.isFailure()) {
159 errorCode.reset();
160 return;
161 }
162 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll);
163 if(rbc == NULL) {
164 errln("the root collator is not a RuleBasedCollator");
165 return;
166 }
167
168 static const UChar s[2] = { 0xfffe, 0xffff };
169 UVector64 ces(errorCode);
170 rbc->internalGetCEs(UnicodeString(FALSE, s, 2), ces, errorCode);
171 errorCode.assertSuccess();
172 if(ces.size() != 2) {
173 errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size());
174 return;
175 }
176 int64_t ce = ces.elementAti(0);
177 int64_t expected = Collation::makeCE(Collation::MERGE_SEPARATOR_PRIMARY);
178 if(ce != expected) {
179 errln("CE(U+fffe)=%04lx != 02..", (long)ce);
180 }
181
182 ce = ces.elementAti(1);
183 expected = Collation::makeCE(Collation::MAX_PRIMARY);
184 if(ce != expected) {
185 errln("CE(U+ffff)=%04lx != max..", (long)ce);
186 }
187 }
188
189 void CollationTest::TestImplicits() {
190 IcuTestErrorCode errorCode(*this, "TestImplicits");
191
192 const CollationData *cd = CollationRoot::getData(errorCode);
193 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
194 return;
195 }
196
197 // Implicit primary weights should be assigned for the following sets,
198 // and sort in ascending order by set and then code point.
199 // See http://www.unicode.org/reports/tr10/#Implicit_Weights
200
201 // core Han Unified Ideographs
202 UnicodeSet coreHan("[\\p{unified_ideograph}&"
203 "[\\p{Block=CJK_Unified_Ideographs}"
204 "\\p{Block=CJK_Compatibility_Ideographs}]]",
205 errorCode);
206 // all other Unified Han ideographs
207 UnicodeSet otherHan("[\\p{unified ideograph}-"
208 "[\\p{Block=CJK_Unified_Ideographs}"
209 "\\p{Block=CJK_Compatibility_Ideographs}]]",
210 errorCode);
211 UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode);
212 unassigned.remove(0xfffe, 0xffff); // These have special CLDR root mappings.
213
214 // Starting with CLDR 26/ICU 54, the root Han order may instead be
215 // the Unihan radical-stroke order.
216 // The tests should pass either way, so we only test the order of a small set of Han characters
217 // whose radical-stroke order is the same as their code point order.
218 UnicodeSet someHanInCPOrder(
219 "[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48"
220 "\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3-\\u50F6]",
221 errorCode);
222 UnicodeSet inOrder(someHanInCPOrder);
223 inOrder.addAll(unassigned).freeze();
224 if(errorCode.logIfFailureAndReset("UnicodeSet")) {
225 return;
226 }
227 const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned };
228 UChar32 prev = 0;
229 uint32_t prevPrimary = 0;
230 UTF16CollationIterator ci(cd, FALSE, NULL, NULL, NULL);
231 for(int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
232 LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i]));
233 while(iter->next()) {
234 UChar32 c = iter->getCodepoint();
235 UnicodeString s(c);
236 ci.setText(s.getBuffer(), s.getBuffer() + s.length());
237 int64_t ce = ci.nextCE(errorCode);
238 int64_t ce2 = ci.nextCE(errorCode);
239 if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
240 return;
241 }
242 if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) {
243 errln("CollationIterator.nextCE(U+%04lx) did not yield exactly one CE", (long)c);
244 continue;
245 }
246 if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) {
247 errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx",
248 (long)c, (long)(ce & 0xffffffff));
249 continue;
250 }
251 uint32_t primary = (uint32_t)(ce >> 32);
252 if(!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) {
253 errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..",
254 (long)c, (long)primary, (long)prev, (long)prevPrimary);
255 }
256 prev = c;
257 prevPrimary = primary;
258 }
259 }
260 }
261
262 void CollationTest::TestNulTerminated() {
263 IcuTestErrorCode errorCode(*this, "TestNulTerminated");
264 const CollationData *data = CollationRoot::getData(errorCode);
265 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
266 return;
267 }
268
269 static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 };
270
271 UTF16CollationIterator ci1(data, FALSE, s, s, s + 2);
272 UTF16CollationIterator ci2(data, FALSE, s + 2, s + 2, NULL);
273 for(int32_t i = 0;; ++i) {
274 int64_t ce1 = ci1.nextCE(errorCode);
275 int64_t ce2 = ci2.nextCE(errorCode);
276 if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
277 return;
278 }
279 if(ce1 != ce2) {
280 errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i);
281 break;
282 }
283 if(ce1 == Collation::NO_CE) { break; }
284 }
285 }
286
287 void CollationTest::TestIllegalUTF8() {
288 IcuTestErrorCode errorCode(*this, "TestIllegalUTF8");
289
290 setRootCollator(errorCode);
291 if(errorCode.isFailure()) {
292 errorCode.reset();
293 return;
294 }
295 coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
296
297 static const char *strings[] = {
298 // U+FFFD
299 "a\xef\xbf\xbdz",
300 // illegal byte sequences
301 "a\x80z", // trail byte
302 "a\xc1\x81z", // non-shortest form
303 "a\xe0\x82\x83z", // non-shortest form
304 "a\xed\xa0\x80z", // lead surrogate: would be U+D800
305 "a\xed\xbf\xbfz", // trail surrogate: would be U+DFFF
306 "a\xf0\x8f\xbf\xbfz", // non-shortest form
307 "a\xf4\x90\x80\x80z" // out of range: would be U+110000
308 };
309
310 StringPiece fffd(strings[0]);
311 for(int32_t i = 1; i < UPRV_LENGTHOF(strings); ++i) {
312 StringPiece illegal(strings[i]);
313 UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
314 if(order != UCOL_EQUAL) {
315 errln("compareUTF8(U+FFFD, string %d with illegal UTF-8)=%d != UCOL_EQUAL",
316 (int)i, order);
317 }
318 }
319 }
320
321 namespace {
322
323 void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) {
324 for(UChar32 c = 0x10000; c < 0x110000;) {
325 UChar32 next = c + 0x400;
326 if(src.containsSome(c, next - 1)) {
327 dest.add(U16_LEAD(c));
328 }
329 c = next;
330 }
331 }
332
333 } // namespace
334
335 void CollationTest::TestShortFCDData() {
336 // See CollationFCD class comments.
337 IcuTestErrorCode errorCode(*this, "TestShortFCDData");
338 UnicodeSet expectedLccc("[:^lccc=0:]", errorCode);
339 errorCode.assertSuccess();
340 expectedLccc.add(0xdc00, 0xdfff); // add all trail surrogates
341 addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
342 UnicodeSet lccc; // actual
343 for(UChar32 c = 0; c <= 0xffff; ++c) {
344 if(CollationFCD::hasLccc(c)) { lccc.add(c); }
345 }
346 UnicodeSet diff(expectedLccc);
347 diff.removeAll(lccc);
348 diff.remove(0x10000, 0x10ffff); // hasLccc() only works for the BMP
349 UnicodeString empty("[]");
350 UnicodeString diffString;
351 diff.toPattern(diffString, TRUE);
352 assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
353 diff = lccc;
354 diff.removeAll(expectedLccc);
355 diff.toPattern(diffString, TRUE);
356 assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, TRUE);
357
358 UnicodeSet expectedTccc("[:^tccc=0:]", errorCode);
359 if (errorCode.isSuccess()) {
360 addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
361 addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
362 UnicodeSet tccc; // actual
363 for(UChar32 c = 0; c <= 0xffff; ++c) {
364 if(CollationFCD::hasTccc(c)) { tccc.add(c); }
365 }
366 diff = expectedTccc;
367 diff.removeAll(tccc);
368 diff.remove(0x10000, 0x10ffff); // hasTccc() only works for the BMP
369 assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
370 diff = tccc;
371 diff.removeAll(expectedTccc);
372 diff.toPattern(diffString, TRUE);
373 assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
374 }
375 }
376
377 class CodePointIterator {
378 public:
379 CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length), pos(0) {}
380 void resetToStart() { pos = 0; }
381 UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; }
382 UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; }
383 int32_t getLength() const { return length; }
384 int getIndex() const { return (int)pos; }
385 private:
386 const UChar32 *cp;
387 int32_t length;
388 int32_t pos;
389 };
390
391 void CollationTest::checkFCD(const char *name,
392 CollationIterator &ci, CodePointIterator &cpi) {
393 IcuTestErrorCode errorCode(*this, "checkFCD");
394
395 // Iterate forward to the limit.
396 for(;;) {
397 UChar32 c1 = ci.nextCodePoint(errorCode);
398 UChar32 c2 = cpi.next();
399 if(c1 != c2) {
400 errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d",
401 name, (long)c1, (long)c2, cpi.getIndex());
402 return;
403 }
404 if(c1 < 0) { break; }
405 }
406
407 // Iterate backward most of the way.
408 for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) {
409 UChar32 c1 = ci.previousCodePoint(errorCode);
410 UChar32 c2 = cpi.previous();
411 if(c1 != c2) {
412 errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d",
413 name, (long)c1, (long)c2, cpi.getIndex());
414 return;
415 }
416 }
417
418 // Forward again.
419 for(;;) {
420 UChar32 c1 = ci.nextCodePoint(errorCode);
421 UChar32 c2 = cpi.next();
422 if(c1 != c2) {
423 errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d",
424 name, (long)c1, (long)c2, cpi.getIndex());
425 return;
426 }
427 if(c1 < 0) { break; }
428 }
429
430 // Iterate backward to the start.
431 for(;;) {
432 UChar32 c1 = ci.previousCodePoint(errorCode);
433 UChar32 c2 = cpi.previous();
434 if(c1 != c2) {
435 errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d",
436 name, (long)c1, (long)c2, cpi.getIndex());
437 return;
438 }
439 if(c1 < 0) { break; }
440 }
441 }
442
443 void CollationTest::TestFCD() {
444 IcuTestErrorCode errorCode(*this, "TestFCD");
445 const CollationData *data = CollationRoot::getData(errorCode);
446 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
447 return;
448 }
449
450 // Input string, not FCD, NUL-terminated.
451 static const UChar s[] = {
452 0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62,
453 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F), // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
454 0x327, 0x308, // ccc=202, 230
455 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
456 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),
457 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),
458 0xac01,
459 0xe7, // Character with tccc!=0 decomposed together with mis-ordered sequence.
460 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165),
461 0xe1, // Character with tccc!=0 decomposed together with decomposed sequence.
462 0xf73, 0xf75, // Tibetan composite vowels must be decomposed.
463 0x4e00, 0xf81,
464 0
465 };
466 // Expected code points.
467 static const UChar32 cp[] = {
468 0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
469 0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
470 0x1D15F, 0x1D16D,
471 0xac01,
472 0x63, 0x327, 0x1D165, 0x1D16D,
473 0x61,
474 0xf71, 0xf71, 0xf72, 0xf74, 0x301,
475 0x4e00, 0xf71, 0xf80
476 };
477
478 FCDUTF16CollationIterator u16ci(data, FALSE, s, s, NULL);
479 if(errorCode.logIfFailureAndReset("FCDUTF16CollationIterator constructor")) {
480 return;
481 }
482 CodePointIterator cpi(cp, UPRV_LENGTHOF(cp));
483 checkFCD("FCDUTF16CollationIterator", u16ci, cpi);
484
485 #if U_HAVE_STD_STRING
486 cpi.resetToStart();
487 std::string utf8;
488 UnicodeString(s).toUTF8String(utf8);
489 FCDUTF8CollationIterator u8ci(data, FALSE,
490 reinterpret_cast<const uint8_t *>(utf8.c_str()), 0, -1);
491 if(errorCode.logIfFailureAndReset("FCDUTF8CollationIterator constructor")) {
492 return;
493 }
494 checkFCD("FCDUTF8CollationIterator", u8ci, cpi);
495 #endif
496
497 cpi.resetToStart();
498 UCharIterator iter;
499 uiter_setString(&iter, s, UPRV_LENGTHOF(s) - 1); // -1: without the terminating NUL
500 FCDUIterCollationIterator uici(data, FALSE, iter, 0);
501 if(errorCode.logIfFailureAndReset("FCDUIterCollationIterator constructor")) {
502 return;
503 }
504 checkFCD("FCDUIterCollationIterator", uici, cpi);
505 }
506
507 void CollationTest::checkAllocWeights(CollationWeights &cw,
508 uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
509 int32_t someLength, int32_t minCount) {
510 if(!cw.allocWeights(lowerLimit, upperLimit, n)) {
511 errln("CollationWeights::allocWeights(%lx, %lx, %ld) = FALSE",
512 (long)lowerLimit, (long)upperLimit, (long)n);
513 return;
514 }
515 uint32_t previous = lowerLimit;
516 int32_t count = 0; // number of weights that have someLength
517 for(int32_t i = 0; i < n; ++i) {
518 uint32_t w = cw.nextWeight();
519 if(w == 0xffffffff) {
520 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
521 "returns only %ld weights",
522 (long)lowerLimit, (long)upperLimit, (long)n, (long)i);
523 return;
524 }
525 if(!(previous < w && w < upperLimit)) {
526 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
527 "number %ld -> %lx not between %lx and %lx",
528 (long)lowerLimit, (long)upperLimit, (long)n,
529 (long)(i + 1), (long)w, (long)previous, (long)upperLimit);
530 return;
531 }
532 if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; }
533 }
534 if(count < minCount) {
535 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
536 "returns only %ld < %ld weights of length %d",
537 (long)lowerLimit, (long)upperLimit, (long)n,
538 (long)count, (long)minCount, (int)someLength);
539 }
540 }
541
542 void CollationTest::TestCollationWeights() {
543 CollationWeights cw;
544
545 // Non-compressible primaries use 254 second bytes 02..FF.
546 logln("CollationWeights.initForPrimary(non-compressible)");
547 cw.initForPrimary(FALSE);
548 // Expect 1 weight 11 and 254 weights 12xx.
549 checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1);
550 checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254);
551 // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
552 checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255);
553 // Expect 254 two-byte weights from the ranges 10ff and 11xx.
554 checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254);
555 // Expect 254^2=64516 three-byte weights.
556 // During computation, there should be 3 three-byte ranges
557 // 10ffff, 11xxxx, 120202.
558 // The middle one should be split 64515:1,
559 // and the newly-split-off range and the last ranged lengthened.
560 checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516);
561 // Expect weights 1102 & 1103.
562 checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2);
563 // Expect weights 102102 & 102103.
564 checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
565
566 // Compressible primaries use 251 second bytes 04..FE.
567 logln("CollationWeights.initForPrimary(compressible)");
568 cw.initForPrimary(TRUE);
569 // Expect 1 weight 11 and 251 weights 12xx.
570 checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1);
571 checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251);
572 // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
573 checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252);
574 // Expect weights 1104 & 1105.
575 checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2);
576 // Expect weights 102102 & 102103.
577 checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
578
579 // Secondary and tertiary weights use only bytes 3 & 4.
580 logln("CollationWeights.initForSecondary()");
581 cw.initForSecondary();
582 // Expect weights fbxx and all four fc..ff.
583 checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4);
584
585 logln("CollationWeights.initForTertiary()");
586 cw.initForTertiary();
587 // Expect weights 3dxx and both 3e & 3f.
588 checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2);
589 }
590
591 namespace {
592
593 UBool isValidCE(const CollationRootElements &re, const CollationData &data,
594 uint32_t p, uint32_t s, uint32_t ctq) {
595 uint32_t p1 = p >> 24;
596 uint32_t p2 = (p >> 16) & 0xff;
597 uint32_t p3 = (p >> 8) & 0xff;
598 uint32_t p4 = p & 0xff;
599 uint32_t s1 = s >> 8;
600 uint32_t s2 = s & 0xff;
601 // ctq = Case, Tertiary, Quaternary
602 uint32_t c = (ctq & Collation::CASE_MASK) >> 14;
603 uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK;
604 uint32_t t1 = t >> 8;
605 uint32_t t2 = t & 0xff;
606 uint32_t q = ctq & Collation::QUATERNARY_MASK;
607 // No leading zero bytes.
608 if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
609 return FALSE;
610 }
611 // No intermediate zero bytes.
612 if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
613 return FALSE;
614 }
615 if(p2 != 0 && p3 == 0 && p4 != 0) {
616 return FALSE;
617 }
618 // Minimum & maximum lead bytes.
619 if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
620 s1 == Collation::LEVEL_SEPARATOR_BYTE ||
621 t1 == Collation::LEVEL_SEPARATOR_BYTE || t1 > 0x3f) {
622 return FALSE;
623 }
624 if(c > 2) {
625 return FALSE;
626 }
627 // The valid byte range for the second primary byte depends on compressibility.
628 if(p2 != 0) {
629 if(data.isCompressibleLeadByte(p1)) {
630 if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE ||
631 Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
632 return FALSE;
633 }
634 } else {
635 if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) {
636 return FALSE;
637 }
638 }
639 }
640 // Other bytes just need to avoid the level separator.
641 // Trailing zeros are ok.
642 U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1);
643 if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR_BYTE ||
644 s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPARATOR_BYTE) {
645 return FALSE;
646 }
647 // Well-formed CEs.
648 if(p == 0) {
649 if(s == 0) {
650 if(t == 0) {
651 // Completely ignorable CE.
652 // Quaternary CEs are not supported.
653 if(c != 0 || q != 0) {
654 return FALSE;
655 }
656 } else {
657 // Tertiary CE.
658 if(t < re.getTertiaryBoundary() || c != 2) {
659 return FALSE;
660 }
661 }
662 } else {
663 // Secondary CE.
664 if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
665 return FALSE;
666 }
667 }
668 } else {
669 // Primary CE.
670 if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) ||
671 s >= re.getSecondaryBoundary()) {
672 return FALSE;
673 }
674 if(t == 0 || t >= re.getTertiaryBoundary()) {
675 return FALSE;
676 }
677 }
678 return TRUE;
679 }
680
681 UBool isValidCE(const CollationRootElements &re, const CollationData &data, int64_t ce) {
682 uint32_t p = (uint32_t)(ce >> 32);
683 uint32_t secTer = (uint32_t)ce;
684 return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff);
685 }
686
687 class RootElementsIterator {
688 public:
689 RootElementsIterator(const CollationData &root)
690 : data(root),
691 elements(root.rootElements), length(root.rootElementsLength),
692 pri(0), secTer(0),
693 index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX]) {}
694
695 UBool next() {
696 if(index >= length) { return FALSE; }
697 uint32_t p = elements[index];
698 if(p == CollationRootElements::PRIMARY_SENTINEL) { return FALSE; }
699 if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
700 ++index;
701 secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG;
702 return TRUE;
703 }
704 if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) {
705 // End of a range, enumerate the primaries in the range.
706 int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK;
707 p &= 0xffffff00;
708 if(pri == p) {
709 // Finished the range, return the next CE after it.
710 ++index;
711 return next();
712 }
713 U_ASSERT(pri < p);
714 // Return the next primary in this range.
715 UBool isCompressible = data.isCompressiblePrimary(pri);
716 if((pri & 0xffff) == 0) {
717 pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step);
718 } else {
719 pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible, step);
720 }
721 return TRUE;
722 }
723 // Simple primary CE.
724 ++index;
725 pri = p;
726 // Does this have an explicit below-common sec/ter unit,
727 // or does it imply a common one?
728 if(index == length) {
729 secTer = Collation::COMMON_SEC_AND_TER_CE;
730 } else {
731 secTer = elements[index];
732 if((secTer & CollationRootElements::SEC_TER_DELTA_FLAG) == 0) {
733 // No sec/ter delta.
734 secTer = Collation::COMMON_SEC_AND_TER_CE;
735 } else {
736 secTer &= ~CollationRootElements::SEC_TER_DELTA_FLAG;
737 if(secTer > Collation::COMMON_SEC_AND_TER_CE) {
738 // Implied sec/ter.
739 secTer = Collation::COMMON_SEC_AND_TER_CE;
740 } else {
741 // Explicit sec/ter below common/common.
742 ++index;
743 }
744 }
745 }
746 return TRUE;
747 }
748
749 uint32_t getPrimary() const { return pri; }
750 uint32_t getSecTer() const { return secTer; }
751
752 private:
753 const CollationData &data;
754 const uint32_t *elements;
755 int32_t length;
756
757 uint32_t pri;
758 uint32_t secTer;
759 int32_t index;
760 };
761
762 } // namespace
763
764 void CollationTest::TestRootElements() {
765 IcuTestErrorCode errorCode(*this, "TestRootElements");
766 const CollationData *root = CollationRoot::getData(errorCode);
767 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
768 return;
769 }
770 CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
771 RootElementsIterator iter(*root);
772
773 // We check each root CE for validity,
774 // and we also verify that there is a tailoring gap between each two CEs.
775 CollationWeights cw1c; // compressible primary weights
776 CollationWeights cw1u; // uncompressible primary weights
777 CollationWeights cw2;
778 CollationWeights cw3;
779
780 cw1c.initForPrimary(TRUE);
781 cw1u.initForPrimary(FALSE);
782 cw2.initForSecondary();
783 cw3.initForTertiary();
784
785 // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
786 // nor the special merge-separator CE for U+FFFE.
787 uint32_t prevPri = 0;
788 uint32_t prevSec = 0;
789 uint32_t prevTer = 0;
790 while(iter.next()) {
791 uint32_t pri = iter.getPrimary();
792 uint32_t secTer = iter.getSecTer();
793 // CollationRootElements CEs must have 0 case and quaternary bits.
794 if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) {
795 errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx",
796 (long)pri, (long)secTer);
797 }
798 uint32_t sec = secTer >> 16;
799 uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK;
800 uint32_t ctq = ter;
801 if(pri == 0 && sec == 0 && ter != 0) {
802 // Tertiary CEs must have uppercase bits,
803 // but they are not stored in the CollationRootElements.
804 ctq |= 0x8000;
805 }
806 if(!isValidCE(rootElements, *root, pri, sec, ctq)) {
807 errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer);
808 } else {
809 if(pri != prevPri) {
810 uint32_t newWeight = 0;
811 if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) {
812 // There is currently no tailoring gap after primary ignorables,
813 // and we forbid tailoring after U+FFFD and U+FFFF.
814 } else if(root->isCompressiblePrimary(prevPri)) {
815 if(!cw1c.allocWeights(prevPri, pri, 1)) {
816 errln("no primary/compressible tailoring gap between %08lx and %08lx",
817 (long)prevPri, (long)pri);
818 } else {
819 newWeight = cw1c.nextWeight();
820 }
821 } else {
822 if(!cw1u.allocWeights(prevPri, pri, 1)) {
823 errln("no primary/uncompressible tailoring gap between %08lx and %08lx",
824 (long)prevPri, (long)pri);
825 } else {
826 newWeight = cw1u.nextWeight();
827 }
828 }
829 if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
830 errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx",
831 (long)prevPri, (long)newWeight, (long)pri);
832 }
833 } else if(sec != prevSec) {
834 uint32_t lowerLimit =
835 prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec;
836 if(!cw2.allocWeights(lowerLimit, sec, 1)) {
837 errln("no secondary tailoring gap between %04x and %04x", lowerLimit, sec);
838 } else {
839 uint32_t newWeight = cw2.nextWeight();
840 if(!(prevSec < newWeight && newWeight < sec)) {
841 errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
842 (long)lowerLimit, (long)newWeight, (long)sec);
843 }
844 }
845 } else if(ter != prevTer) {
846 uint32_t lowerLimit =
847 prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer;
848 if(!cw3.allocWeights(lowerLimit, ter, 1)) {
849 errln("no teriary tailoring gap between %04x and %04x", lowerLimit, ter);
850 } else {
851 uint32_t newWeight = cw3.nextWeight();
852 if(!(prevTer < newWeight && newWeight < ter)) {
853 errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
854 (long)lowerLimit, (long)newWeight, (long)ter);
855 }
856 }
857 } else {
858 errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer);
859 }
860 }
861 prevPri = pri;
862 prevSec = sec;
863 prevTer = ter;
864 }
865 }
866
867 void CollationTest::TestTailoredElements() {
868 IcuTestErrorCode errorCode(*this, "TestTailoredElements");
869 const CollationData *root = CollationRoot::getData(errorCode);
870 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
871 return;
872 }
873 CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
874
875 UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NULL, errorCode);
876 if(errorCode.logIfFailureAndReset("failed to create a hash table")) {
877 return;
878 }
879 uhash_setKeyDeleter(prevLocales, uprv_free);
880 // TestRootElements() tests the root collator which does not have tailorings.
881 uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode);
882 uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode);
883 uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode);
884
885 UVector64 ces(errorCode);
886 LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales());
887 U_ASSERT(locales.isValid());
888 const char *localeID = "root";
889 do {
890 Locale locale(localeID);
891 LocalPointer<StringEnumeration> types(
892 Collator::getKeywordValuesForLocale("collation", locale, FALSE, errorCode));
893 errorCode.assertSuccess();
894 const char *type; // first: default type
895 while((type = types->next(NULL, errorCode)) != NULL) {
896 if(strncmp(type, "private-", 8) == 0) {
897 errln("Collator::getKeywordValuesForLocale(%s) returns private collation keyword: %s",
898 localeID, type);
899 }
900 Locale localeWithType(locale);
901 localeWithType.setKeywordValue("collation", type, errorCode);
902 errorCode.assertSuccess();
903 LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode));
904 if(errorCode.logIfFailureAndReset("Collator::createInstance(%s)",
905 localeWithType.getName())) {
906 continue;
907 }
908 Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode);
909 if(uhash_geti(prevLocales, actual.getName()) != 0) {
910 continue;
911 }
912 uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode);
913 errorCode.assertSuccess();
914 logln("TestTailoredElements(): requested %s -> actual %s",
915 localeWithType.getName(), actual.getName());
916 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getAlias());
917 if(rbc == NULL) {
918 continue;
919 }
920 // Note: It would be better to get tailored strings such that we can
921 // identify the prefix, and only get the CEs for the prefix+string,
922 // not also for the prefix.
923 // There is currently no API for that.
924 // It would help in an unusual case where a contraction starting in the prefix
925 // extends past its end, and we do not see the intended mapping.
926 // For example, for a mapping p|st, if there is also a contraction ps,
927 // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
928 LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode));
929 errorCode.assertSuccess();
930 UnicodeSetIterator iter(*tailored);
931 while(iter.next()) {
932 const UnicodeString &s = iter.getString();
933 ces.removeAllElements();
934 rbc->internalGetCEs(s, ces, errorCode);
935 errorCode.assertSuccess();
936 for(int32_t i = 0; i < ces.size(); ++i) {
937 int64_t ce = ces.elementAti(i);
938 if(!isValidCE(rootElements, *root, ce)) {
939 errln("invalid tailored CE %016llx at CE index %d from string:",
940 (long long)ce, (int)i);
941 infoln(prettify(s));
942 }
943 }
944 }
945 }
946 } while((localeID = locales->next(NULL, errorCode)) != NULL);
947 uhash_close(prevLocales);
948 }
949
950 UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) {
951 UnicodeString s;
952 for(int32_t i = 0; i < length; ++i) {
953 if(i > 0) { s.append((UChar)0x20); }
954 uint8_t b = p[i];
955 if(b == 0) {
956 s.append((UChar)0x2e); // period
957 } else if(b == 1) {
958 s.append((UChar)0x7c); // vertical bar
959 } else {
960 appendHex(b, 2, s);
961 }
962 }
963 return s;
964 }
965
966 UnicodeString CollationTest::printCollationKey(const CollationKey &key) {
967 int32_t length;
968 const uint8_t *p = key.getByteArray(length);
969 return printSortKey(p, length);
970 }
971
972 UBool CollationTest::readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode) {
973 for(;;) {
974 int32_t lineLength;
975 const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
976 if(line == NULL || errorCode.isFailure()) {
977 fileLine.remove();
978 return FALSE;
979 }
980 ++fileLineNumber;
981 // Strip trailing CR/LF, comments, and spaces.
982 const UChar *comment = u_memchr(line, 0x23, lineLength); // '#'
983 if(comment != NULL) {
984 lineLength = (int32_t)(comment - line);
985 } else {
986 while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; }
987 }
988 while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; }
989 if(lineLength != 0) {
990 fileLine.setTo(FALSE, line, lineLength);
991 return TRUE;
992 }
993 // Empty line, continue.
994 }
995 }
996
997 void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s,
998 UErrorCode &errorCode) {
999 int32_t length = fileLine.length();
1000 int32_t i;
1001 for(i = start; i < length && !isSpace(fileLine[i]); ++i) {}
1002 int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start); // '|'
1003 if(pipeIndex >= 0) {
1004 prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape();
1005 if(prefix.isEmpty()) {
1006 errln("empty prefix on line %d", (int)fileLineNumber);
1007 infoln(fileLine);
1008 errorCode = U_PARSE_ERROR;
1009 return;
1010 }
1011 start = pipeIndex + 1;
1012 } else {
1013 prefix.remove();
1014 }
1015 s = fileLine.tempSubStringBetween(start, i).unescape();
1016 if(s.isEmpty()) {
1017 errln("empty string on line %d", (int)fileLineNumber);
1018 infoln(fileLine);
1019 errorCode = U_PARSE_ERROR;
1020 return;
1021 }
1022 start = i;
1023 }
1024
1025 Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode) {
1026 Collation::Level relation;
1027 int32_t start;
1028 if(fileLine[0] == 0x3c) { // <
1029 UChar second = fileLine[1];
1030 start = 2;
1031 switch(second) {
1032 case 0x31: // <1
1033 relation = Collation::PRIMARY_LEVEL;
1034 break;
1035 case 0x32: // <2
1036 relation = Collation::SECONDARY_LEVEL;
1037 break;
1038 case 0x33: // <3
1039 relation = Collation::TERTIARY_LEVEL;
1040 break;
1041 case 0x34: // <4
1042 relation = Collation::QUATERNARY_LEVEL;
1043 break;
1044 case 0x63: // <c
1045 relation = Collation::CASE_LEVEL;
1046 break;
1047 case 0x69: // <i
1048 relation = Collation::IDENTICAL_LEVEL;
1049 break;
1050 default: // just <
1051 relation = Collation::NO_LEVEL;
1052 start = 1;
1053 break;
1054 }
1055 } else if(fileLine[0] == 0x3d) { // =
1056 relation = Collation::ZERO_LEVEL;
1057 start = 1;
1058 } else {
1059 start = 0;
1060 }
1061 if(start == 0 || !isSpace(fileLine[start])) {
1062 errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber);
1063 infoln(fileLine);
1064 errorCode.set(U_PARSE_ERROR);
1065 return Collation::NO_LEVEL;
1066 }
1067 start = skipSpaces(start);
1068 UnicodeString prefix;
1069 parseString(start, prefix, s, errorCode);
1070 if(errorCode.isSuccess() && !prefix.isEmpty()) {
1071 errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber);
1072 infoln(fileLine);
1073 errorCode.set(U_PARSE_ERROR);
1074 return Collation::NO_LEVEL;
1075 }
1076 if(start < fileLine.length()) {
1077 errln("unexpected line contents after test string on line %d", (int)fileLineNumber);
1078 infoln(fileLine);
1079 errorCode.set(U_PARSE_ERROR);
1080 return Collation::NO_LEVEL;
1081 }
1082 return relation;
1083 }
1084
1085 static const struct {
1086 const char *name;
1087 UColAttribute attr;
1088 } attributes[] = {
1089 { "backwards", UCOL_FRENCH_COLLATION },
1090 { "alternate", UCOL_ALTERNATE_HANDLING },
1091 { "caseFirst", UCOL_CASE_FIRST },
1092 { "caseLevel", UCOL_CASE_LEVEL },
1093 // UCOL_NORMALIZATION_MODE is turned on and off automatically.
1094 { "strength", UCOL_STRENGTH },
1095 // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated.
1096 { "numeric", UCOL_NUMERIC_COLLATION }
1097 };
1098
1099 static const struct {
1100 const char *name;
1101 UColAttributeValue value;
1102 } attributeValues[] = {
1103 { "default", UCOL_DEFAULT },
1104 { "primary", UCOL_PRIMARY },
1105 { "secondary", UCOL_SECONDARY },
1106 { "tertiary", UCOL_TERTIARY },
1107 { "quaternary", UCOL_QUATERNARY },
1108 { "identical", UCOL_IDENTICAL },
1109 { "off", UCOL_OFF },
1110 { "on", UCOL_ON },
1111 { "shifted", UCOL_SHIFTED },
1112 { "non-ignorable", UCOL_NON_IGNORABLE },
1113 { "lower", UCOL_LOWER_FIRST },
1114 { "upper", UCOL_UPPER_FIRST }
1115 };
1116
1117 void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) {
1118 // Parse attributes even if the Collator could not be created,
1119 // in order to report syntax errors.
1120 int32_t start = skipSpaces(1);
1121 int32_t equalPos = fileLine.indexOf(0x3d);
1122 if(equalPos < 0) {
1123 if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) {
1124 parseAndSetReorderCodes(start + 7, errorCode);
1125 return;
1126 }
1127 errln("missing '=' on line %d", (int)fileLineNumber);
1128 infoln(fileLine);
1129 errorCode.set(U_PARSE_ERROR);
1130 return;
1131 }
1132
1133 UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos);
1134 UnicodeString valueString = fileLine.tempSubString(equalPos+1);
1135 if(attrString == UNICODE_STRING("maxVariable", 11)) {
1136 UColReorderCode max;
1137 if(valueString == UNICODE_STRING("space", 5)) {
1138 max = UCOL_REORDER_CODE_SPACE;
1139 } else if(valueString == UNICODE_STRING("punct", 5)) {
1140 max = UCOL_REORDER_CODE_PUNCTUATION;
1141 } else if(valueString == UNICODE_STRING("symbol", 6)) {
1142 max = UCOL_REORDER_CODE_SYMBOL;
1143 } else if(valueString == UNICODE_STRING("currency", 8)) {
1144 max = UCOL_REORDER_CODE_CURRENCY;
1145 } else {
1146 errln("invalid attribute value name on line %d", (int)fileLineNumber);
1147 infoln(fileLine);
1148 errorCode.set(U_PARSE_ERROR);
1149 return;
1150 }
1151 if(coll != NULL) {
1152 coll->setMaxVariable(max, errorCode);
1153 if(errorCode.isFailure()) {
1154 errln("setMaxVariable() failed on line %d: %s",
1155 (int)fileLineNumber, errorCode.errorName());
1156 infoln(fileLine);
1157 return;
1158 }
1159 }
1160 fileLine.remove();
1161 return;
1162 }
1163
1164 UColAttribute attr;
1165 for(int32_t i = 0;; ++i) {
1166 if(i == UPRV_LENGTHOF(attributes)) {
1167 errln("invalid attribute name on line %d", (int)fileLineNumber);
1168 infoln(fileLine);
1169 errorCode.set(U_PARSE_ERROR);
1170 return;
1171 }
1172 if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) {
1173 attr = attributes[i].attr;
1174 break;
1175 }
1176 }
1177
1178 UColAttributeValue value;
1179 for(int32_t i = 0;; ++i) {
1180 if(i == UPRV_LENGTHOF(attributeValues)) {
1181 errln("invalid attribute value name on line %d", (int)fileLineNumber);
1182 infoln(fileLine);
1183 errorCode.set(U_PARSE_ERROR);
1184 return;
1185 }
1186 if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) {
1187 value = attributeValues[i].value;
1188 break;
1189 }
1190 }
1191
1192 if(coll != NULL) {
1193 coll->setAttribute(attr, value, errorCode);
1194 if(errorCode.isFailure()) {
1195 errln("illegal attribute=value combination on line %d: %s",
1196 (int)fileLineNumber, errorCode.errorName());
1197 infoln(fileLine);
1198 return;
1199 }
1200 }
1201 fileLine.remove();
1202 }
1203
1204 void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode) {
1205 UVector32 reorderCodes(errorCode);
1206 while(start < fileLine.length()) {
1207 start = skipSpaces(start);
1208 int32_t limit = start;
1209 while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; }
1210 CharString name;
1211 name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), errorCode);
1212 int32_t code = CollationRuleParser::getReorderCode(name.data());
1213 if(code < 0) {
1214 if(uprv_stricmp(name.data(), "default") == 0) {
1215 code = UCOL_REORDER_CODE_DEFAULT; // -1
1216 } else {
1217 errln("invalid reorder code '%s' on line %d", name.data(), (int)fileLineNumber);
1218 infoln(fileLine);
1219 errorCode.set(U_PARSE_ERROR);
1220 return;
1221 }
1222 }
1223 reorderCodes.addElement(code, errorCode);
1224 start = limit;
1225 }
1226 if(coll != NULL) {
1227 coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
1228 if(errorCode.isFailure()) {
1229 errln("setReorderCodes() failed on line %d: %s",
1230 (int)fileLineNumber, errorCode.errorName());
1231 infoln(fileLine);
1232 return;
1233 }
1234 }
1235 fileLine.remove();
1236 }
1237
1238 void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1239 UnicodeString rules;
1240 while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1241 rules.append(fileLine.unescape());
1242 }
1243 if(errorCode.isFailure()) { return; }
1244 logln(rules);
1245
1246 UParseError parseError;
1247 UnicodeString reason;
1248 delete coll;
1249 coll = new RuleBasedCollator(rules, parseError, reason, errorCode);
1250 if(coll == NULL) {
1251 errln("unable to allocate a new collator");
1252 errorCode.set(U_MEMORY_ALLOCATION_ERROR);
1253 return;
1254 }
1255 if(errorCode.isFailure()) {
1256 dataerrln("RuleBasedCollator(rules) failed - %s", errorCode.errorName());
1257 infoln(UnicodeString(" reason: ") + reason);
1258 if(parseError.offset >= 0) { infoln(" rules offset: %d", (int)parseError.offset); }
1259 if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1260 infoln(UnicodeString(" snippet: ...") +
1261 parseError.preContext + "(!)" + parseError.postContext + "...");
1262 }
1263 delete coll;
1264 coll = NULL;
1265 errorCode.reset();
1266 } else {
1267 assertEquals("no error reason when RuleBasedCollator(rules) succeeds",
1268 UnicodeString(), reason);
1269 }
1270 }
1271
1272 void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) {
1273 if(errorCode.isFailure()) { return; }
1274 delete coll;
1275 coll = Collator::createInstance(Locale::getRoot(), errorCode);
1276 if(errorCode.isFailure()) {
1277 dataerrln("unable to create a root collator");
1278 return;
1279 }
1280 }
1281
1282 void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) {
1283 if(errorCode.isFailure()) { return; }
1284 delete coll;
1285 coll = NULL;
1286 int32_t at = fileLine.indexOf((UChar)0x40, 9); // @ is not invariant
1287 if(at >= 0) {
1288 fileLine.setCharAt(at, (UChar)0x2a); // *
1289 }
1290 CharString localeID;
1291 localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode);
1292 if(at >= 0) {
1293 localeID.data()[at - 9] = '@';
1294 }
1295 Locale locale(localeID.data());
1296 if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) {
1297 errln("invalid language tag on line %d", (int)fileLineNumber);
1298 infoln(fileLine);
1299 if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); }
1300 return;
1301 }
1302
1303 logln("creating a collator for locale ID %s", locale.getName());
1304 coll = Collator::createInstance(locale, errorCode);
1305 if(errorCode.isFailure()) {
1306 dataerrln("unable to create a collator for locale %s on line %d",
1307 locale.getName(), (int)fileLineNumber);
1308 infoln(fileLine);
1309 delete coll;
1310 coll = NULL;
1311 errorCode.reset();
1312 }
1313 }
1314
1315 UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const {
1316 if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return TRUE; }
1317 // In some sequences with Tibetan composite vowel signs,
1318 // even if the string passes the FCD check,
1319 // those composites must be decomposed.
1320 // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1321 int32_t index = 0;
1322 while((index = s.indexOf((UChar)0xf71, index)) >= 0) {
1323 if(++index < s.length()) {
1324 UChar c = s[index];
1325 if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return TRUE; }
1326 }
1327 }
1328 return FALSE;
1329 }
1330
1331 UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length,
1332 CharString &dest, int32_t partSize,
1333 IcuTestErrorCode &errorCode) {
1334 if(errorCode.isFailure()) { return FALSE; }
1335 uint8_t part[32];
1336 U_ASSERT(partSize <= UPRV_LENGTHOF(part));
1337 UCharIterator iter;
1338 uiter_setString(&iter, s, length);
1339 uint32_t state[2] = { 0, 0 };
1340 for(;;) {
1341 int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, partSize, errorCode);
1342 UBool done = partLength < partSize;
1343 if(done) {
1344 // At the end, append the next byte as well which should be 00.
1345 ++partLength;
1346 }
1347 dest.append(reinterpret_cast<char *>(part), partLength, errorCode);
1348 if(done) {
1349 return errorCode.isSuccess();
1350 }
1351 }
1352 }
1353
1354 UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line,
1355 const UChar *s, int32_t length,
1356 CollationKey &key, IcuTestErrorCode &errorCode) {
1357 if(errorCode.isFailure()) { return FALSE; }
1358 coll->getCollationKey(s, length, key, errorCode);
1359 if(errorCode.isFailure()) {
1360 infoln(fileTestName);
1361 errln("Collator(%s).getCollationKey() failed: %s",
1362 norm, errorCode.errorName());
1363 infoln(line);
1364 return FALSE;
1365 }
1366 int32_t keyLength;
1367 const uint8_t *keyBytes = key.getByteArray(keyLength);
1368 if(keyLength == 0 || keyBytes[keyLength - 1] != 0) {
1369 infoln(fileTestName);
1370 errln("Collator(%s).getCollationKey() wrote an empty or unterminated key",
1371 norm);
1372 infoln(line);
1373 infoln(printCollationKey(key));
1374 return FALSE;
1375 }
1376
1377 int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode);
1378 if(numLevels < UCOL_IDENTICAL) {
1379 ++numLevels;
1380 } else {
1381 numLevels = 5;
1382 }
1383 if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) {
1384 ++numLevels;
1385 }
1386 errorCode.assertSuccess();
1387 int32_t numLevelSeparators = 0;
1388 for(int32_t i = 0; i < (keyLength - 1); ++i) {
1389 uint8_t b = keyBytes[i];
1390 if(b == 0) {
1391 infoln(fileTestName);
1392 errln("Collator(%s).getCollationKey() contains a 00 byte", norm);
1393 infoln(line);
1394 infoln(printCollationKey(key));
1395 return FALSE;
1396 }
1397 if(b == 1) { ++numLevelSeparators; }
1398 }
1399 if(numLevelSeparators != (numLevels - 1)) {
1400 infoln(fileTestName);
1401 errln("Collator(%s).getCollationKey() has %d level separators for %d levels",
1402 norm, (int)numLevelSeparators, (int)numLevels);
1403 infoln(line);
1404 infoln(printCollationKey(key));
1405 return FALSE;
1406 }
1407
1408 // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
1409 static const int32_t partSizes[] = { 32, 3, 1 };
1410 for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) {
1411 int32_t partSize = partSizes[psi];
1412 CharString parts;
1413 if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
1414 infoln(fileTestName);
1415 errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
1416 norm, (int)partSize, errorCode.errorName());
1417 infoln(line);
1418 return FALSE;
1419 }
1420 if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
1421 infoln(fileTestName);
1422 errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
1423 norm, (int)partSize);
1424 infoln(line);
1425 infoln(printCollationKey(key));
1426 infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
1427 return FALSE;
1428 }
1429 }
1430 return TRUE;
1431 }
1432
1433 /**
1434 * Changes the key to the merged segments of the U+FFFE-separated substrings of s.
1435 * Leaves key unchanged if s does not contain U+FFFE.
1436 * @return TRUE if the key was successfully changed
1437 */
1438 UBool CollationTest::getMergedCollationKey(const UChar *s, int32_t length,
1439 CollationKey &key, IcuTestErrorCode &errorCode) {
1440 if(errorCode.isFailure()) { return FALSE; }
1441 LocalMemory<uint8_t> mergedKey;
1442 int32_t mergedKeyLength = 0;
1443 int32_t mergedKeyCapacity = 0;
1444 int32_t sLength = (length >= 0) ? length : u_strlen(s);
1445 int32_t segmentStart = 0;
1446 for(int32_t i = 0;;) {
1447 if(i == sLength) {
1448 if(segmentStart == 0) {
1449 // s does not contain any U+FFFE.
1450 return FALSE;
1451 }
1452 } else if(s[i] != 0xfffe) {
1453 ++i;
1454 continue;
1455 }
1456 // Get the sort key for another segment and merge it into mergedKey.
1457 CollationKey key1(mergedKey.getAlias(), mergedKeyLength); // copies the bytes
1458 CollationKey key2;
1459 coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCode);
1460 int32_t key1Length, key2Length;
1461 const uint8_t *key1Bytes = key1.getByteArray(key1Length);
1462 const uint8_t *key2Bytes = key2.getByteArray(key2Length);
1463 uint8_t *dest;
1464 int32_t minCapacity = key1Length + key2Length;
1465 if(key1Length > 0) { --minCapacity; }
1466 if(minCapacity <= mergedKeyCapacity) {
1467 dest = mergedKey.getAlias();
1468 } else {
1469 if(minCapacity <= 200) {
1470 mergedKeyCapacity = 200;
1471 } else if(minCapacity <= 2 * mergedKeyCapacity) {
1472 mergedKeyCapacity *= 2;
1473 } else {
1474 mergedKeyCapacity = minCapacity;
1475 }
1476 dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity);
1477 }
1478 U_ASSERT(dest != NULL || mergedKeyCapacity == 0);
1479 if(key1Length == 0) {
1480 // key2 is the sort key for the first segment.
1481 uprv_memcpy(dest, key2Bytes, key2Length);
1482 mergedKeyLength = key2Length;
1483 } else {
1484 mergedKeyLength =
1485 ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length,
1486 dest, mergedKeyCapacity);
1487 }
1488 if(i == sLength) { break; }
1489 segmentStart = ++i;
1490 }
1491 key = CollationKey(mergedKey.getAlias(), mergedKeyLength);
1492 return TRUE;
1493 }
1494
1495 namespace {
1496
1497 /**
1498 * Replaces unpaired surrogates with U+FFFD.
1499 * Returns s if no replacement was made, otherwise buffer.
1500 */
1501 const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buffer) {
1502 int32_t i = 0;
1503 while(i < s.length()) {
1504 UChar32 c = s.char32At(i);
1505 if(U_IS_SURROGATE(c)) {
1506 if(buffer.length() < i) {
1507 buffer.append(s, buffer.length(), i - buffer.length());
1508 }
1509 buffer.append((UChar)0xfffd);
1510 }
1511 i += U16_LENGTH(c);
1512 }
1513 if(buffer.isEmpty()) {
1514 return s;
1515 }
1516 if(buffer.length() < i) {
1517 buffer.append(s, buffer.length(), i - buffer.length());
1518 }
1519 return buffer;
1520 }
1521
1522 int32_t getDifferenceLevel(const CollationKey &prevKey, const CollationKey &key,
1523 UCollationResult order, UBool collHasCaseLevel) {
1524 if(order == UCOL_EQUAL) {
1525 return Collation::NO_LEVEL;
1526 }
1527 int32_t prevKeyLength;
1528 const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
1529 int32_t keyLength;
1530 const uint8_t *bytes = key.getByteArray(keyLength);
1531 int32_t level = Collation::PRIMARY_LEVEL;
1532 for(int32_t i = 0;; ++i) {
1533 uint8_t b = prevBytes[i];
1534 if(b != bytes[i]) { break; }
1535 if(b == Collation::LEVEL_SEPARATOR_BYTE) {
1536 ++level;
1537 if(level == Collation::CASE_LEVEL && !collHasCaseLevel) {
1538 ++level;
1539 }
1540 }
1541 }
1542 return level;
1543 }
1544
1545 }
1546
1547 UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
1548 const UnicodeString &prevString, const UnicodeString &s,
1549 UCollationResult expectedOrder, Collation::Level expectedLevel,
1550 IcuTestErrorCode &errorCode) {
1551 if(errorCode.isFailure()) { return FALSE; }
1552
1553 // Get the sort keys first, for error debug output.
1554 CollationKey prevKey;
1555 if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.length(),
1556 prevKey, errorCode)) {
1557 return FALSE;
1558 }
1559 CollationKey key;
1560 if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCode)) { return FALSE; }
1561
1562 UCollationResult order = coll->compare(prevString, s, errorCode);
1563 if(order != expectedOrder || errorCode.isFailure()) {
1564 infoln(fileTestName);
1565 errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)",
1566 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1567 infoln(prevFileLine);
1568 infoln(fileLine);
1569 infoln(printCollationKey(prevKey));
1570 infoln(printCollationKey(key));
1571 return FALSE;
1572 }
1573 order = coll->compare(s, prevString, errorCode);
1574 if(order != -expectedOrder || errorCode.isFailure()) {
1575 infoln(fileTestName);
1576 errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)",
1577 (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1578 infoln(prevFileLine);
1579 infoln(fileLine);
1580 infoln(printCollationKey(prevKey));
1581 infoln(printCollationKey(key));
1582 return FALSE;
1583 }
1584 // Test NUL-termination if the strings do not contain NUL characters.
1585 UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0) >= 0;
1586 if(!containNUL) {
1587 order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, errorCode);
1588 if(order != expectedOrder || errorCode.isFailure()) {
1589 infoln(fileTestName);
1590 errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1591 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1592 infoln(prevFileLine);
1593 infoln(fileLine);
1594 infoln(printCollationKey(prevKey));
1595 infoln(printCollationKey(key));
1596 return FALSE;
1597 }
1598 order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, errorCode);
1599 if(order != -expectedOrder || errorCode.isFailure()) {
1600 infoln(fileTestName);
1601 errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1602 (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1603 infoln(prevFileLine);
1604 infoln(fileLine);
1605 infoln(printCollationKey(prevKey));
1606 infoln(printCollationKey(key));
1607 return FALSE;
1608 }
1609 }
1610
1611 #if U_HAVE_STD_STRING
1612 // compare(UTF-16) treats unpaired surrogates like unassigned code points.
1613 // Unpaired surrogates cannot be converted to UTF-8.
1614 // Create valid UTF-16 strings if necessary, and use those for
1615 // both the expected compare() result and for the input to compare(UTF-8).
1616 UnicodeString prevBuffer, sBuffer;
1617 const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer);
1618 const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer);
1619 std::string prevUTF8, sUTF8;
1620 UnicodeString(prevValid).toUTF8String(prevUTF8);
1621 UnicodeString(sValid).toUTF8String(sUTF8);
1622 UCollationResult expectedUTF8Order;
1623 if(&prevValid == &prevString && &sValid == &s) {
1624 expectedUTF8Order = expectedOrder;
1625 } else {
1626 expectedUTF8Order = coll->compare(prevValid, sValid, errorCode);
1627 }
1628
1629 order = coll->compareUTF8(prevUTF8, sUTF8, errorCode);
1630 if(order != expectedUTF8Order || errorCode.isFailure()) {
1631 infoln(fileTestName);
1632 errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)",
1633 (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1634 infoln(prevFileLine);
1635 infoln(fileLine);
1636 infoln(printCollationKey(prevKey));
1637 infoln(printCollationKey(key));
1638 return FALSE;
1639 }
1640 order = coll->compareUTF8(sUTF8, prevUTF8, errorCode);
1641 if(order != -expectedUTF8Order || errorCode.isFailure()) {
1642 infoln(fileTestName);
1643 errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)",
1644 (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1645 infoln(prevFileLine);
1646 infoln(fileLine);
1647 infoln(printCollationKey(prevKey));
1648 infoln(printCollationKey(key));
1649 return FALSE;
1650 }
1651 // Test NUL-termination if the strings do not contain NUL characters.
1652 if(!containNUL) {
1653 order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -1, errorCode);
1654 if(order != expectedUTF8Order || errorCode.isFailure()) {
1655 infoln(fileTestName);
1656 errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1657 (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1658 infoln(prevFileLine);
1659 infoln(fileLine);
1660 infoln(printCollationKey(prevKey));
1661 infoln(printCollationKey(key));
1662 return FALSE;
1663 }
1664 order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -1, errorCode);
1665 if(order != -expectedUTF8Order || errorCode.isFailure()) {
1666 infoln(fileTestName);
1667 errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1668 (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1669 infoln(prevFileLine);
1670 infoln(fileLine);
1671 infoln(printCollationKey(prevKey));
1672 infoln(printCollationKey(key));
1673 return FALSE;
1674 }
1675 }
1676 #endif
1677
1678 UCharIterator leftIter;
1679 UCharIterator rightIter;
1680 uiter_setString(&leftIter, prevString.getBuffer(), prevString.length());
1681 uiter_setString(&rightIter, s.getBuffer(), s.length());
1682 order = coll->compare(leftIter, rightIter, errorCode);
1683 if(order != expectedOrder || errorCode.isFailure()) {
1684 infoln(fileTestName);
1685 errln("line %d Collator(%s).compare(UCharIterator: previous, current) "
1686 "wrong order: %d != %d (%s)",
1687 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1688 infoln(prevFileLine);
1689 infoln(fileLine);
1690 infoln(printCollationKey(prevKey));
1691 infoln(printCollationKey(key));
1692 return FALSE;
1693 }
1694
1695 order = prevKey.compareTo(key, errorCode);
1696 if(order != expectedOrder || errorCode.isFailure()) {
1697 infoln(fileTestName);
1698 errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)",
1699 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1700 infoln(prevFileLine);
1701 infoln(fileLine);
1702 infoln(printCollationKey(prevKey));
1703 infoln(printCollationKey(key));
1704 return FALSE;
1705 }
1706 UBool collHasCaseLevel = coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON;
1707 int32_t level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1708 if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1709 if(level != expectedLevel) {
1710 infoln(fileTestName);
1711 errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
1712 (int)fileLineNumber, norm, order, level, expectedLevel);
1713 infoln(prevFileLine);
1714 infoln(fileLine);
1715 infoln(printCollationKey(prevKey));
1716 infoln(printCollationKey(key));
1717 return FALSE;
1718 }
1719 }
1720
1721 // If either string contains U+FFFE, then their sort keys must compare the same as
1722 // the merged sort keys of each string's between-FFFE segments.
1723 //
1724 // It is not required that
1725 // sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2))
1726 // only that those two methods yield the same order.
1727 //
1728 // Use bit-wise OR so that getMergedCollationKey() is always called for both strings.
1729 if((getMergedCollationKey(prevString.getBuffer(), prevString.length(), prevKey, errorCode) |
1730 getMergedCollationKey(s.getBuffer(), s.length(), key, errorCode)) ||
1731 errorCode.isFailure()) {
1732 order = prevKey.compareTo(key, errorCode);
1733 if(order != expectedOrder || errorCode.isFailure()) {
1734 infoln(fileTestName);
1735 errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1736 "(previous, current segments between U+FFFE)).compareTo() wrong order: %d != %d (%s)",
1737 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1738 infoln(prevFileLine);
1739 infoln(fileLine);
1740 infoln(printCollationKey(prevKey));
1741 infoln(printCollationKey(key));
1742 return FALSE;
1743 }
1744 int32_t mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1745 if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1746 if(mergedLevel != level) {
1747 infoln(fileTestName);
1748 errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1749 "(previous, current segments between U+FFFE)).compareTo()=%d wrong level: %d != %d",
1750 (int)fileLineNumber, norm, order, mergedLevel, level);
1751 infoln(prevFileLine);
1752 infoln(fileLine);
1753 infoln(printCollationKey(prevKey));
1754 infoln(printCollationKey(key));
1755 return FALSE;
1756 }
1757 }
1758 }
1759 return TRUE;
1760 }
1761
1762 void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1763 if(errorCode.isFailure()) { return; }
1764 UnicodeString prevFileLine = UNICODE_STRING("(none)", 6);
1765 UnicodeString prevString, s;
1766 prevString.getTerminatedBuffer(); // Ensure NUL-termination.
1767 while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1768 // Parse the line even if it will be ignored (when we do not have a Collator)
1769 // in order to report syntax issues.
1770 Collation::Level relation = parseRelationAndString(s, errorCode);
1771 if(errorCode.isFailure()) {
1772 errorCode.reset();
1773 break;
1774 }
1775 if(coll == NULL) {
1776 // We were unable to create the Collator but continue with tests.
1777 // Ignore test data for this Collator.
1778 // The next Collator creation might work.
1779 continue;
1780 }
1781 UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? UCOL_EQUAL : UCOL_LESS;
1782 Collation::Level expectedLevel = relation;
1783 s.getTerminatedBuffer(); // Ensure NUL-termination.
1784 UBool isOk = TRUE;
1785 if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) {
1786 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode);
1787 isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
1788 expectedOrder, expectedLevel, errorCode);
1789 }
1790 if(isOk) {
1791 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode);
1792 isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
1793 expectedOrder, expectedLevel, errorCode);
1794 }
1795 if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormalized(s, errorCode))) {
1796 UnicodeString pn = nfd->normalize(prevString, errorCode);
1797 UnicodeString n = nfd->normalize(s, errorCode);
1798 pn.getTerminatedBuffer();
1799 n.getTerminatedBuffer();
1800 errorCode.assertSuccess();
1801 isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
1802 expectedOrder, expectedLevel, errorCode);
1803 }
1804 if(!isOk) {
1805 errorCode.reset(); // already reported
1806 }
1807 prevFileLine = fileLine;
1808 prevString = s;
1809 prevString.getTerminatedBuffer(); // Ensure NUL-termination.
1810 }
1811 }
1812
1813 void CollationTest::TestDataDriven() {
1814 IcuTestErrorCode errorCode(*this, "TestDataDriven");
1815
1816 fcd = Normalizer2Factory::getFCDInstance(errorCode);
1817 nfd = Normalizer2::getNFDInstance(errorCode);
1818 if(errorCode.logDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) {
1819 return;
1820 }
1821
1822 CharString path(getSourceTestData(errorCode), errorCode);
1823 path.appendPathPart("collationtest.txt", errorCode);
1824 const char *codePage = "UTF-8";
1825 LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, errorCode));
1826 if(errorCode.logIfFailureAndReset("ucbuf_open(collationtest.txt)")) {
1827 return;
1828 }
1829 // Read a new line if necessary.
1830 // Sub-parsers leave the first line set that they do not handle.
1831 while(errorCode.isSuccess() && (!fileLine.isEmpty() || readNonEmptyLine(f.getAlias(), errorCode))) {
1832 if(!isSectionStarter(fileLine[0])) {
1833 errln("syntax error on line %d", (int)fileLineNumber);
1834 infoln(fileLine);
1835 return;
1836 }
1837 if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) {
1838 fileTestName = fileLine;
1839 logln(fileLine);
1840 fileLine.remove();
1841 } else if(fileLine == UNICODE_STRING("@ root", 6)) {
1842 setRootCollator(errorCode);
1843 fileLine.remove();
1844 } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) {
1845 setLocaleCollator(errorCode);
1846 fileLine.remove();
1847 } else if(fileLine == UNICODE_STRING("@ rules", 7)) {
1848 buildTailoring(f.getAlias(), errorCode);
1849 } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) { // %
1850 parseAndSetAttribute(errorCode);
1851 } else if(fileLine == UNICODE_STRING("* compare", 9)) {
1852 checkCompareStrings(f.getAlias(), errorCode);
1853 } else {
1854 errln("syntax error on line %d", (int)fileLineNumber);
1855 infoln(fileLine);
1856 return;
1857 }
1858 }
1859 }
1860
1861 #endif // !UCONFIG_NO_COLLATION