]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/intltest/collationtest.cpp
ICU-531.30.tar.gz
[apple/icu.git] / icuSources / test / intltest / collationtest.cpp
CommitLineData
57a6839d
A
1/*
2*******************************************************************************
3* Copyright (C) 2012-2014, International Business Machines
4* Corporation and others. All Rights Reserved.
5*******************************************************************************
6* collationtest.cpp
7*
8* created on: 2012apr27
9* created by: Markus W. Scherer
10*/
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_COLLATION
15
16#include "unicode/coll.h"
17#include "unicode/errorcode.h"
18#include "unicode/localpointer.h"
19#include "unicode/normalizer2.h"
20#include "unicode/sortkey.h"
21#include "unicode/std_string.h"
22#include "unicode/strenum.h"
23#include "unicode/tblcoll.h"
24#include "unicode/uiter.h"
25#include "unicode/uniset.h"
26#include "unicode/unistr.h"
27#include "unicode/usetiter.h"
28#include "unicode/ustring.h"
29#include "charstr.h"
30#include "cmemory.h"
31#include "collation.h"
32#include "collationdata.h"
33#include "collationfcd.h"
34#include "collationiterator.h"
35#include "collationroot.h"
36#include "collationrootelements.h"
37#include "collationruleparser.h"
38#include "collationweights.h"
39#include "cstring.h"
40#include "intltest.h"
41#include "normalizer2impl.h"
42#include "ucbuf.h"
43#include "uhash.h"
44#include "uitercollationiterator.h"
45#include "utf16collationiterator.h"
46#include "utf8collationiterator.h"
47#include "uvectr32.h"
48#include "uvectr64.h"
49#include "writesrc.h"
50
51#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
52
53// TODO: Move to ucbuf.h
54U_DEFINE_LOCAL_OPEN_POINTER(LocalUCHARBUFPointer, UCHARBUF, ucbuf_close);
55
56class CodePointIterator;
57
58// TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey)
59
60class CollationTest : public IntlTest {
61public:
62 CollationTest()
63 : fcd(NULL), nfd(NULL),
64 fileLineNumber(0),
65 coll(NULL) {}
66
67 ~CollationTest() {
68 delete coll;
69 }
70
71 void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
72
73 void TestMinMax();
74 void TestImplicits();
75 void TestNulTerminated();
76 void TestIllegalUTF8();
77 void TestShortFCDData();
78 void TestFCD();
79 void TestCollationWeights();
80 void TestRootElements();
81 void TestTailoredElements();
82 void TestDataDriven();
83
84private:
85 void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
86 void checkAllocWeights(CollationWeights &cw,
87 uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
88 int32_t someLength, int32_t minCount);
89
90 static UnicodeString printSortKey(const uint8_t *p, int32_t length);
91 static UnicodeString printCollationKey(const CollationKey &key);
92
93 // Helpers & fields for data-driven test.
94 static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; }
95 static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; }
96 static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c == 0x40; } // %*@
97 int32_t skipSpaces(int32_t i) {
98 while(isSpace(fileLine[i])) { ++i; }
99 return i;
100 }
101
102 UBool readLine(UCHARBUF *f, IcuTestErrorCode &errorCode);
103 void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode);
104 Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode);
105 void parseAndSetAttribute(IcuTestErrorCode &errorCode);
106 void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode);
107 void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode);
108 void setRootCollator(IcuTestErrorCode &errorCode);
109 void setLocaleCollator(IcuTestErrorCode &errorCode);
110
111 UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const;
112
113 UBool getSortKeyParts(const UChar *s, int32_t length,
114 CharString &dest, int32_t partSize,
115 IcuTestErrorCode &errorCode);
116 UBool getCollationKey(const char *norm, const UnicodeString &line,
117 const UChar *s, int32_t length,
118 CollationKey &key, IcuTestErrorCode &errorCode);
119 UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
120 const UnicodeString &prevString, const UnicodeString &s,
121 UCollationResult expectedOrder, Collation::Level expectedLevel,
122 IcuTestErrorCode &errorCode);
123 void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode);
124
125 const Normalizer2 *fcd, *nfd;
126 UnicodeString fileLine;
127 int32_t fileLineNumber;
128 UnicodeString fileTestName;
129 Collator *coll;
130};
131
132extern IntlTest *createCollationTest() {
133 return new CollationTest();
134}
135
136void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
137 if(exec) {
138 logln("TestSuite CollationTest: ");
139 }
140 TESTCASE_AUTO_BEGIN;
141 TESTCASE_AUTO(TestMinMax);
142 TESTCASE_AUTO(TestImplicits);
143 TESTCASE_AUTO(TestNulTerminated);
144 TESTCASE_AUTO(TestIllegalUTF8);
145 TESTCASE_AUTO(TestShortFCDData);
146 TESTCASE_AUTO(TestFCD);
147 TESTCASE_AUTO(TestCollationWeights);
148 TESTCASE_AUTO(TestRootElements);
149 TESTCASE_AUTO(TestTailoredElements);
150 TESTCASE_AUTO(TestDataDriven);
151 TESTCASE_AUTO_END;
152}
153
154void CollationTest::TestMinMax() {
155 IcuTestErrorCode errorCode(*this, "TestMinMax");
156
157 setRootCollator(errorCode);
158 if(errorCode.isFailure()) {
159 errorCode.reset();
160 return;
161 }
162 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll);
163 if(rbc == NULL) {
164 errln("the root collator is not a RuleBasedCollator");
165 return;
166 }
167
168 static const UChar s[2] = { 0xfffe, 0xffff };
169 UVector64 ces(errorCode);
170 rbc->internalGetCEs(UnicodeString(FALSE, s, 2), ces, errorCode);
171 errorCode.assertSuccess();
172 if(ces.size() != 2) {
173 errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size());
174 return;
175 }
176 int64_t ce = ces.elementAti(0);
177 int64_t expected =
178 ((int64_t)Collation::MERGE_SEPARATOR_PRIMARY << 32) |
179 Collation::MERGE_SEPARATOR_LOWER32;
180 if(ce != expected) {
181 errln("CE(U+fffe)=%04lx != 02.02.02", (long)ce);
182 }
183
184 ce = ces.elementAti(1);
185 expected = Collation::makeCE(Collation::MAX_PRIMARY);
186 if(ce != expected) {
187 errln("CE(U+ffff)=%04lx != max..", (long)ce);
188 }
189}
190
191void CollationTest::TestImplicits() {
192 IcuTestErrorCode errorCode(*this, "TestImplicits");
193
194 const CollationData *cd = CollationRoot::getData(errorCode);
195 if(errorCode.logDataIfFailureAndReset("CollationRoot::getBaseData()")) {
196 return;
197 }
198
199 // Implicit primary weights should be assigned for the following sets,
200 // and sort in ascending order by set and then code point.
201 // See http://www.unicode.org/reports/tr10/#Implicit_Weights
202 // core Han Unified Ideographs
203 UnicodeSet coreHan("[\\p{unified_ideograph}&"
204 "[\\p{Block=CJK_Unified_Ideographs}"
205 "\\p{Block=CJK_Compatibility_Ideographs}]]",
206 errorCode);
207 // all other Unified Han ideographs
208 UnicodeSet otherHan("[\\p{unified ideograph}-"
209 "[\\p{Block=CJK_Unified_Ideographs}"
210 "\\p{Block=CJK_Compatibility_Ideographs}]]",
211 errorCode);
212 UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode);
213 unassigned.remove(0xfffe, 0xffff); // These have special CLDR root mappings.
214 if(errorCode.logIfFailureAndReset("UnicodeSet")) {
215 return;
216 }
217 const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned };
218 UChar32 prev = 0;
219 uint32_t prevPrimary = 0;
220 UTF16CollationIterator ci(cd, FALSE, NULL, NULL, NULL);
221 for(int32_t i = 0; i < LENGTHOF(sets); ++i) {
222 LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i]));
223 while(iter->next()) {
224 UChar32 c = iter->getCodepoint();
225 UnicodeString s(c);
226 ci.setText(s.getBuffer(), s.getBuffer() + s.length());
227 int64_t ce = ci.nextCE(errorCode);
228 int64_t ce2 = ci.nextCE(errorCode);
229 if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
230 return;
231 }
232 if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) {
233 errln("CollationIterator.nextCE(U+%04lx) did not yield exactly one CE", (long)c);
234 continue;
235 }
236 if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) {
237 errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx",
238 (long)c, (long)(ce & 0xffffffff));
239 continue;
240 }
241 uint32_t primary = (uint32_t)(ce >> 32);
242 if(!(primary > prevPrimary)) {
243 errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..",
244 (long)c, (long)primary, (long)prev, (long)prevPrimary);
245 }
246 prev = c;
247 prevPrimary = primary;
248 }
249 }
250}
251
252void CollationTest::TestNulTerminated() {
253 IcuTestErrorCode errorCode(*this, "TestNulTerminated");
254 const CollationData *data = CollationRoot::getData(errorCode);
255 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
256 return;
257 }
258
259 static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 };
260
261 UTF16CollationIterator ci1(data, FALSE, s, s, s + 2);
262 UTF16CollationIterator ci2(data, FALSE, s + 2, s + 2, NULL);
263 for(int32_t i = 0;; ++i) {
264 int64_t ce1 = ci1.nextCE(errorCode);
265 int64_t ce2 = ci2.nextCE(errorCode);
266 if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
267 return;
268 }
269 if(ce1 != ce2) {
270 errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i);
271 break;
272 }
273 if(ce1 == Collation::NO_CE) { break; }
274 }
275}
276
277void CollationTest::TestIllegalUTF8() {
278 IcuTestErrorCode errorCode(*this, "TestIllegalUTF8");
279
280 setRootCollator(errorCode);
281 if(errorCode.isFailure()) {
282 errorCode.reset();
283 return;
284 }
285 coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
286
287 static const char *strings[] = {
288 // U+FFFD
289 "a\xef\xbf\xbdz",
290 // illegal byte sequences
291 "a\x80z", // trail byte
292 "a\xc1\x81z", // non-shortest form
293 "a\xe0\x82\x83z", // non-shortest form
294 "a\xed\xa0\x80z", // lead surrogate: would be U+D800
295 "a\xed\xbf\xbfz", // trail surrogate: would be U+DFFF
296 "a\xf0\x8f\xbf\xbfz", // non-shortest form
297 "a\xf4\x90\x80\x80z" // out of range: would be U+110000
298 };
299
300 StringPiece fffd(strings[0]);
301 for(int32_t i = 1; i < LENGTHOF(strings); ++i) {
302 StringPiece illegal(strings[i]);
303 UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
304 if(order != UCOL_EQUAL) {
305 errln("compareUTF8(U+FFFD, string %d with illegal UTF-8)=%d != UCOL_EQUAL",
306 (int)i, order);
307 }
308 }
309}
310
311namespace {
312
313void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) {
314 for(UChar32 c = 0x10000; c < 0x110000;) {
315 UChar32 next = c + 0x400;
316 if(src.containsSome(c, next - 1)) {
317 dest.add(U16_LEAD(c));
318 }
319 c = next;
320 }
321}
322
323} // namespace
324
325void CollationTest::TestShortFCDData() {
326 // See CollationFCD class comments.
327 IcuTestErrorCode errorCode(*this, "TestShortFCDData");
328 UnicodeSet expectedLccc("[:^lccc=0:]", errorCode);
329 errorCode.assertSuccess();
330 expectedLccc.add(0xdc00, 0xdfff); // add all trail surrogates
331 addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
332 UnicodeSet lccc; // actual
333 for(UChar32 c = 0; c <= 0xffff; ++c) {
334 if(CollationFCD::hasLccc(c)) { lccc.add(c); }
335 }
336 UnicodeSet diff(expectedLccc);
337 diff.removeAll(lccc);
338 diff.remove(0x10000, 0x10ffff); // hasLccc() only works for the BMP
339 UnicodeString empty("[]");
340 UnicodeString diffString;
341 diff.toPattern(diffString, TRUE);
342 assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
343 diff = lccc;
344 diff.removeAll(expectedLccc);
345 diff.toPattern(diffString, TRUE);
346 assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, TRUE);
347
348 UnicodeSet expectedTccc("[:^tccc=0:]", errorCode);
349 if (errorCode.isSuccess()) {
350 addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
351 addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
352 UnicodeSet tccc; // actual
353 for(UChar32 c = 0; c <= 0xffff; ++c) {
354 if(CollationFCD::hasTccc(c)) { tccc.add(c); }
355 }
356 diff = expectedTccc;
357 diff.removeAll(tccc);
358 diff.remove(0x10000, 0x10ffff); // hasTccc() only works for the BMP
359 assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
360 diff = tccc;
361 diff.removeAll(expectedTccc);
362 diff.toPattern(diffString, TRUE);
363 assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
364 }
365}
366
367class CodePointIterator {
368public:
369 CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length), pos(0) {}
370 void resetToStart() { pos = 0; }
371 UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; }
372 UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; }
373 int32_t getLength() const { return length; }
374 int getIndex() const { return (int)pos; }
375private:
376 const UChar32 *cp;
377 int32_t length;
378 int32_t pos;
379};
380
381void CollationTest::checkFCD(const char *name,
382 CollationIterator &ci, CodePointIterator &cpi) {
383 IcuTestErrorCode errorCode(*this, "checkFCD");
384
385 // Iterate forward to the limit.
386 for(;;) {
387 UChar32 c1 = ci.nextCodePoint(errorCode);
388 UChar32 c2 = cpi.next();
389 if(c1 != c2) {
390 errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d",
391 name, (long)c1, (long)c2, cpi.getIndex());
392 return;
393 }
394 if(c1 < 0) { break; }
395 }
396
397 // Iterate backward most of the way.
398 for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) {
399 UChar32 c1 = ci.previousCodePoint(errorCode);
400 UChar32 c2 = cpi.previous();
401 if(c1 != c2) {
402 errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d",
403 name, (long)c1, (long)c2, cpi.getIndex());
404 return;
405 }
406 }
407
408 // Forward again.
409 for(;;) {
410 UChar32 c1 = ci.nextCodePoint(errorCode);
411 UChar32 c2 = cpi.next();
412 if(c1 != c2) {
413 errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d",
414 name, (long)c1, (long)c2, cpi.getIndex());
415 return;
416 }
417 if(c1 < 0) { break; }
418 }
419
420 // Iterate backward to the start.
421 for(;;) {
422 UChar32 c1 = ci.previousCodePoint(errorCode);
423 UChar32 c2 = cpi.previous();
424 if(c1 != c2) {
425 errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d",
426 name, (long)c1, (long)c2, cpi.getIndex());
427 return;
428 }
429 if(c1 < 0) { break; }
430 }
431}
432
433void CollationTest::TestFCD() {
434 IcuTestErrorCode errorCode(*this, "TestFCD");
435 const CollationData *data = CollationRoot::getData(errorCode);
436 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
437 return;
438 }
439
440 // Input string, not FCD, NUL-terminated.
441 static const UChar s[] = {
442 0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62,
443 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F), // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
444 0x327, 0x308, // ccc=202, 230
445 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
446 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),
447 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),
448 0xac01,
449 0xe7, // Character with tccc!=0 decomposed together with mis-ordered sequence.
450 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165),
451 0xe1, // Character with tccc!=0 decomposed together with decomposed sequence.
452 0xf73, 0xf75, // Tibetan composite vowels must be decomposed.
453 0x4e00, 0xf81,
454 0
455 };
456 // Expected code points.
457 static const UChar32 cp[] = {
458 0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
459 0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
460 0x1D15F, 0x1D16D,
461 0xac01,
462 0x63, 0x327, 0x1D165, 0x1D16D,
463 0x61,
464 0xf71, 0xf71, 0xf72, 0xf74, 0x301,
465 0x4e00, 0xf71, 0xf80
466 };
467
468 FCDUTF16CollationIterator u16ci(data, FALSE, s, s, NULL);
469 if(errorCode.logIfFailureAndReset("FCDUTF16CollationIterator constructor")) {
470 return;
471 }
472 CodePointIterator cpi(cp, LENGTHOF(cp));
473 checkFCD("FCDUTF16CollationIterator", u16ci, cpi);
474
475#if U_HAVE_STD_STRING
476 cpi.resetToStart();
477 std::string utf8;
478 UnicodeString(s).toUTF8String(utf8);
479 FCDUTF8CollationIterator u8ci(data, FALSE,
480 reinterpret_cast<const uint8_t *>(utf8.c_str()), 0, -1);
481 if(errorCode.logIfFailureAndReset("FCDUTF8CollationIterator constructor")) {
482 return;
483 }
484 checkFCD("FCDUTF8CollationIterator", u8ci, cpi);
485#endif
486
487 cpi.resetToStart();
488 UCharIterator iter;
489 uiter_setString(&iter, s, LENGTHOF(s) - 1); // -1: without the terminating NUL
490 FCDUIterCollationIterator uici(data, FALSE, iter, 0);
491 if(errorCode.logIfFailureAndReset("FCDUIterCollationIterator constructor")) {
492 return;
493 }
494 checkFCD("FCDUIterCollationIterator", uici, cpi);
495}
496
497void CollationTest::checkAllocWeights(CollationWeights &cw,
498 uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
499 int32_t someLength, int32_t minCount) {
500 if(!cw.allocWeights(lowerLimit, upperLimit, n)) {
501 errln("CollationWeights::allocWeights(%lx, %lx, %ld) = FALSE",
502 (long)lowerLimit, (long)upperLimit, (long)n);
503 return;
504 }
505 uint32_t previous = lowerLimit;
506 int32_t count = 0; // number of weights that have someLength
507 for(int32_t i = 0; i < n; ++i) {
508 uint32_t w = cw.nextWeight();
509 if(w == 0xffffffff) {
510 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
511 "returns only %ld weights",
512 (long)lowerLimit, (long)upperLimit, (long)n, (long)i);
513 return;
514 }
515 if(!(previous < w && w < upperLimit)) {
516 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
517 "number %ld -> %lx not between %lx and %lx",
518 (long)lowerLimit, (long)upperLimit, (long)n,
519 (long)(i + 1), (long)w, (long)previous, (long)upperLimit);
520 return;
521 }
522 if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; }
523 }
524 if(count < minCount) {
525 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
526 "returns only %ld < %ld weights of length %d",
527 (long)lowerLimit, (long)upperLimit, (long)n,
528 (long)count, (long)minCount, (int)someLength);
529 }
530}
531
532void CollationTest::TestCollationWeights() {
533 CollationWeights cw;
534
535 // Non-compressible primaries use 254 second bytes 02..FF.
536 logln("CollationWeights.initForPrimary(non-compressible)");
537 cw.initForPrimary(FALSE);
538 // Expect 1 weight 11 and 254 weights 12xx.
539 checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1);
540 checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254);
541 // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
542 checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255);
543 // Expect 254 two-byte weights from the ranges 10ff and 11xx.
544 checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254);
545 // Expect 254^2=64516 three-byte weights.
546 // During computation, there should be 3 three-byte ranges
547 // 10ffff, 11xxxx, 120202.
548 // The middle one should be split 64515:1,
549 // and the newly-split-off range and the last ranged lengthened.
550 checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516);
551 // Expect weights 1102 & 1103.
552 checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2);
553 // Expect weights 102102 & 102103.
554 checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
555
556 // Compressible primaries use 251 second bytes 04..FE.
557 logln("CollationWeights.initForPrimary(compressible)");
558 cw.initForPrimary(TRUE);
559 // Expect 1 weight 11 and 251 weights 12xx.
560 checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1);
561 checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251);
562 // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
563 checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252);
564 // Expect weights 1104 & 1105.
565 checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2);
566 // Expect weights 102102 & 102103.
567 checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
568
569 // Secondary and tertiary weights use only bytes 3 & 4.
570 logln("CollationWeights.initForSecondary()");
571 cw.initForSecondary();
572 // Expect weights fbxx and all four fc..ff.
573 checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4);
574
575 logln("CollationWeights.initForTertiary()");
576 cw.initForTertiary();
577 // Expect weights 3dxx and both 3e & 3f.
578 checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2);
579}
580
581namespace {
582
583UBool isValidCE(const CollationRootElements &re, const CollationData &data,
584 uint32_t p, uint32_t s, uint32_t ctq) {
585 uint32_t p1 = p >> 24;
586 uint32_t p2 = (p >> 16) & 0xff;
587 uint32_t p3 = (p >> 8) & 0xff;
588 uint32_t p4 = p & 0xff;
589 uint32_t s1 = s >> 8;
590 uint32_t s2 = s & 0xff;
591 // ctq = Case, Tertiary, Quaternary
592 uint32_t c = (ctq & Collation::CASE_MASK) >> 14;
593 uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK;
594 uint32_t t1 = t >> 8;
595 uint32_t t2 = t & 0xff;
596 uint32_t q = ctq & Collation::QUATERNARY_MASK;
597 // No leading zero bytes.
598 if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
599 return FALSE;
600 }
601 // No intermediate zero bytes.
602 if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
603 return FALSE;
604 }
605 if(p2 != 0 && p3 == 0 && p4 != 0) {
606 return FALSE;
607 }
608 // Minimum & maximum lead bytes.
609 if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
610 (s1 != 0 && s1 <= Collation::MERGE_SEPARATOR_BYTE) ||
611 (t1 != 0 && t1 <= Collation::MERGE_SEPARATOR_BYTE)) {
612 return FALSE;
613 }
614 if(t1 != 0 && t1 > 0x3f) {
615 return FALSE;
616 }
617 if(c > 2) {
618 return FALSE;
619 }
620 // The valid byte range for the second primary byte depends on compressibility.
621 if(p2 != 0) {
622 if(data.isCompressibleLeadByte(p1)) {
623 if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE ||
624 Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
625 return FALSE;
626 }
627 } else {
628 if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) {
629 return FALSE;
630 }
631 }
632 }
633 // Other bytes just need to avoid the level separator.
634 // Trailing zeros are ok.
635 U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1);
636 if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR_BYTE ||
637 s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPARATOR_BYTE) {
638 return FALSE;
639 }
640 // Well-formed CEs.
641 if(p == 0) {
642 if(s == 0) {
643 if(t == 0) {
644 // Completely ignorable CE.
645 // Quaternary CEs are not supported.
646 if(c != 0 || q != 0) {
647 return FALSE;
648 }
649 } else {
650 // Tertiary CE.
651 if(t < re.getTertiaryBoundary() || c != 2) {
652 return FALSE;
653 }
654 }
655 } else {
656 // Secondary CE.
657 if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
658 return FALSE;
659 }
660 }
661 } else {
662 // Primary CE.
663 if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) ||
664 s >= re.getSecondaryBoundary()) {
665 return FALSE;
666 }
667 if(t == 0 || t >= re.getTertiaryBoundary()) {
668 return FALSE;
669 }
670 }
671 return TRUE;
672}
673
674UBool isValidCE(const CollationRootElements &re, const CollationData &data, int64_t ce) {
675 uint32_t p = (uint32_t)(ce >> 32);
676 uint32_t secTer = (uint32_t)ce;
677 return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff);
678}
679
680class RootElementsIterator {
681public:
682 RootElementsIterator(const CollationData &root)
683 : data(root),
684 elements(root.rootElements), length(root.rootElementsLength),
685 pri(0), secTer(0),
686 index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX]) {}
687
688 UBool next() {
689 if(index >= length) { return FALSE; }
690 uint32_t p = elements[index];
691 if(p == CollationRootElements::PRIMARY_SENTINEL) { return FALSE; }
692 if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
693 ++index;
694 secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG;
695 return TRUE;
696 }
697 if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) {
698 // End of a range, enumerate the primaries in the range.
699 int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK;
700 p &= 0xffffff00;
701 if(pri == p) {
702 // Finished the range, return the next CE after it.
703 ++index;
704 return next();
705 }
706 U_ASSERT(pri < p);
707 // Return the next primary in this range.
708 UBool isCompressible = data.isCompressiblePrimary(pri);
709 if((pri & 0xffff) == 0) {
710 pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step);
711 } else {
712 pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible, step);
713 }
714 return TRUE;
715 }
716 // Simple primary CE.
717 ++index;
718 pri = p;
719 secTer = Collation::COMMON_SEC_AND_TER_CE;
720 return TRUE;
721 }
722
723 uint32_t getPrimary() const { return pri; }
724 uint32_t getSecTer() const { return secTer; }
725
726private:
727 const CollationData &data;
728 const uint32_t *elements;
729 int32_t length;
730
731 uint32_t pri;
732 uint32_t secTer;
733 int32_t index;
734};
735
736} // namespace
737
738void CollationTest::TestRootElements() {
739 IcuTestErrorCode errorCode(*this, "TestRootElements");
740 const CollationData *root = CollationRoot::getData(errorCode);
741 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
742 return;
743 }
744 CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
745 RootElementsIterator iter(*root);
746
747 // We check each root CE for validity,
748 // and we also verify that there is a tailoring gap between each two CEs.
749 CollationWeights cw1c; // compressible primary weights
750 CollationWeights cw1u; // uncompressible primary weights
751 CollationWeights cw2;
752 CollationWeights cw3;
753
754 cw1c.initForPrimary(TRUE);
755 cw1u.initForPrimary(FALSE);
756 cw2.initForSecondary();
757 cw3.initForTertiary();
758
759 // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
760 // nor the special merge-separator CE for U+FFFE.
761 uint32_t prevPri = 0;
762 uint32_t prevSec = 0;
763 uint32_t prevTer = 0;
764 while(iter.next()) {
765 uint32_t pri = iter.getPrimary();
766 uint32_t secTer = iter.getSecTer();
767 // CollationRootElements CEs must have 0 case and quaternary bits.
768 if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) {
769 errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx",
770 (long)pri, (long)secTer);
771 }
772 uint32_t sec = secTer >> 16;
773 uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK;
774 uint32_t ctq = ter;
775 if(pri == 0 && sec == 0 && ter != 0) {
776 // Tertiary CEs must have uppercase bits,
777 // but they are not stored in the CollationRootElements.
778 ctq |= 0x8000;
779 }
780 if(!isValidCE(rootElements, *root, pri, sec, ctq)) {
781 errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer);
782 } else {
783 if(pri != prevPri) {
784 uint32_t newWeight = 0;
785 if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) {
786 // There is currently no tailoring gap after primary ignorables,
787 // and we forbid tailoring after U+FFFD and U+FFFF.
788 } else if(root->isCompressiblePrimary(prevPri)) {
789 if(!cw1c.allocWeights(prevPri, pri, 1)) {
790 errln("no primary/compressible tailoring gap between %08lx and %08lx",
791 (long)prevPri, (long)pri);
792 } else {
793 newWeight = cw1c.nextWeight();
794 }
795 } else {
796 if(!cw1u.allocWeights(prevPri, pri, 1)) {
797 errln("no primary/uncompressible tailoring gap between %08lx and %08lx",
798 (long)prevPri, (long)pri);
799 } else {
800 newWeight = cw1u.nextWeight();
801 }
802 }
803 if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
804 errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx",
805 (long)prevPri, (long)newWeight, (long)pri);
806 }
807 } else if(sec != prevSec) {
808 uint32_t lowerLimit =
809 prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec;
810 if(!cw2.allocWeights(lowerLimit, sec, 1)) {
811 errln("no secondary tailoring gap between %04x and %04x", lowerLimit, sec);
812 } else {
813 uint32_t newWeight = cw2.nextWeight();
814 if(!(prevSec < newWeight && newWeight < sec)) {
815 errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
816 (long)lowerLimit, (long)newWeight, (long)sec);
817 }
818 }
819 } else if(ter != prevTer) {
820 uint32_t lowerLimit =
821 prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer;
822 if(!cw3.allocWeights(lowerLimit, ter, 1)) {
823 errln("no teriary tailoring gap between %04x and %04x", lowerLimit, ter);
824 } else {
825 uint32_t newWeight = cw3.nextWeight();
826 if(!(prevTer < newWeight && newWeight < ter)) {
827 errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
828 (long)lowerLimit, (long)newWeight, (long)ter);
829 }
830 }
831 } else {
832 errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer);
833 }
834 }
835 prevPri = pri;
836 prevSec = sec;
837 prevTer = ter;
838 }
839}
840
841void CollationTest::TestTailoredElements() {
842 IcuTestErrorCode errorCode(*this, "TestTailoredElements");
843 const CollationData *root = CollationRoot::getData(errorCode);
844 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
845 return;
846 }
847 CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
848
849 UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NULL, errorCode);
850 if(errorCode.logIfFailureAndReset("failed to create a hash table")) {
851 return;
852 }
853 uhash_setKeyDeleter(prevLocales, uprv_free);
854 // TestRootElements() tests the root collator which does not have tailorings.
855 uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode);
856 uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode);
857 uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode);
858
859 UVector64 ces(errorCode);
860 LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales());
861 U_ASSERT(locales.isValid());
862 const char *localeID = "root";
863 do {
864 Locale locale(localeID);
865 LocalPointer<StringEnumeration> types(
866 Collator::getKeywordValuesForLocale("collation", locale, FALSE, errorCode));
867 errorCode.assertSuccess();
868 const char *type = NULL; // default type
869 do {
870 Locale localeWithType(locale);
871 if(type != NULL) {
872 localeWithType.setKeywordValue("collation", type, errorCode);
873 }
874 errorCode.assertSuccess();
875 LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode));
876 if(errorCode.logIfFailureAndReset("Collator::createInstance(%s)",
877 localeWithType.getName())) {
878 continue;
879 }
880 Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode);
881 if(uhash_geti(prevLocales, actual.getName()) != 0) {
882 continue;
883 }
884 uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode);
885 errorCode.assertSuccess();
886 logln("TestTailoredElements(): requested %s -> actual %s",
887 localeWithType.getName(), actual.getName());
888 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getAlias());
889 if(rbc == NULL) {
890 continue;
891 }
892 // Note: It would be better to get tailored strings such that we can
893 // identify the prefix, and only get the CEs for the prefix+string,
894 // not also for the prefix.
895 // There is currently no API for that.
896 // It would help in an unusual case where a contraction starting in the prefix
897 // extends past its end, and we do not see the intended mapping.
898 // For example, for a mapping p|st, if there is also a contraction ps,
899 // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
900 LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode));
901 errorCode.assertSuccess();
902 UnicodeSetIterator iter(*tailored);
903 while(iter.next()) {
904 const UnicodeString &s = iter.getString();
905 ces.removeAllElements();
906 rbc->internalGetCEs(s, ces, errorCode);
907 errorCode.assertSuccess();
908 for(int32_t i = 0; i < ces.size(); ++i) {
909 int64_t ce = ces.elementAti(i);
910 if(!isValidCE(rootElements, *root, ce)) {
911 errln("invalid tailored CE %016llx at CE index %d from string:",
912 (long long)ce, (int)i);
913 infoln(prettify(s));
914 }
915 }
916 }
917 } while((type = types->next(NULL, errorCode)) != NULL);
918 } while((localeID = locales->next(NULL, errorCode)) != NULL);
919 uhash_close(prevLocales);
920}
921
922UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) {
923 UnicodeString s;
924 for(int32_t i = 0; i < length; ++i) {
925 if(i > 0) { s.append((UChar)0x20); }
926 uint8_t b = p[i];
927 if(b == 0) {
928 s.append((UChar)0x2e); // period
929 } else if(b == 1) {
930 s.append((UChar)0x7c); // vertical bar
931 } else {
932 appendHex(b, 2, s);
933 }
934 }
935 return s;
936}
937
938UnicodeString CollationTest::printCollationKey(const CollationKey &key) {
939 int32_t length;
940 const uint8_t *p = key.getByteArray(length);
941 return printSortKey(p, length);
942}
943
944UBool CollationTest::readLine(UCHARBUF *f, IcuTestErrorCode &errorCode) {
945 int32_t lineLength;
946 const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
947 if(line == NULL || errorCode.isFailure()) {
948 fileLine.remove();
949 return FALSE;
950 }
951 ++fileLineNumber;
952 // Strip trailing CR/LF, comments, and spaces.
953 const UChar *comment = u_memchr(line, 0x23, lineLength); // '#'
954 if(comment != NULL) {
955 lineLength = (int32_t)(comment - line);
956 } else {
957 while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; }
958 }
959 while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; }
960 fileLine.setTo(FALSE, line, lineLength);
961 return TRUE;
962}
963
964void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s,
965 UErrorCode &errorCode) {
966 int32_t length = fileLine.length();
967 int32_t i;
968 for(i = start; i < length && !isSpace(fileLine[i]); ++i) {}
969 int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start); // '|'
970 if(pipeIndex >= 0) {
971 prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape();
972 if(prefix.isEmpty()) {
973 errln("empty prefix on line %d", (int)fileLineNumber);
974 infoln(fileLine);
975 errorCode = U_PARSE_ERROR;
976 return;
977 }
978 start = pipeIndex + 1;
979 } else {
980 prefix.remove();
981 }
982 s = fileLine.tempSubStringBetween(start, i).unescape();
983 if(s.isEmpty()) {
984 errln("empty string on line %d", (int)fileLineNumber);
985 infoln(fileLine);
986 errorCode = U_PARSE_ERROR;
987 return;
988 }
989 start = i;
990}
991
992Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode) {
993 Collation::Level relation;
994 int32_t start;
995 if(fileLine[0] == 0x3c) { // <
996 UChar second = fileLine[1];
997 start = 2;
998 switch(second) {
999 case 0x31: // <1
1000 relation = Collation::PRIMARY_LEVEL;
1001 break;
1002 case 0x32: // <2
1003 relation = Collation::SECONDARY_LEVEL;
1004 break;
1005 case 0x33: // <3
1006 relation = Collation::TERTIARY_LEVEL;
1007 break;
1008 case 0x34: // <4
1009 relation = Collation::QUATERNARY_LEVEL;
1010 break;
1011 case 0x63: // <c
1012 relation = Collation::CASE_LEVEL;
1013 break;
1014 case 0x69: // <i
1015 relation = Collation::IDENTICAL_LEVEL;
1016 break;
1017 default: // just <
1018 relation = Collation::NO_LEVEL;
1019 start = 1;
1020 break;
1021 }
1022 } else if(fileLine[0] == 0x3d) { // =
1023 relation = Collation::ZERO_LEVEL;
1024 start = 1;
1025 } else {
1026 start = 0;
1027 }
1028 if(start == 0 || !isSpace(fileLine[start])) {
1029 errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber);
1030 infoln(fileLine);
1031 errorCode.set(U_PARSE_ERROR);
1032 return Collation::NO_LEVEL;
1033 }
1034 start = skipSpaces(start);
1035 UnicodeString prefix;
1036 parseString(start, prefix, s, errorCode);
1037 if(errorCode.isSuccess() && !prefix.isEmpty()) {
1038 errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber);
1039 infoln(fileLine);
1040 errorCode.set(U_PARSE_ERROR);
1041 return Collation::NO_LEVEL;
1042 }
1043 if(start < fileLine.length()) {
1044 errln("unexpected line contents after test string on line %d", (int)fileLineNumber);
1045 infoln(fileLine);
1046 errorCode.set(U_PARSE_ERROR);
1047 return Collation::NO_LEVEL;
1048 }
1049 return relation;
1050}
1051
1052static const struct {
1053 const char *name;
1054 UColAttribute attr;
1055} attributes[] = {
1056 { "backwards", UCOL_FRENCH_COLLATION },
1057 { "alternate", UCOL_ALTERNATE_HANDLING },
1058 { "caseFirst", UCOL_CASE_FIRST },
1059 { "caseLevel", UCOL_CASE_LEVEL },
1060 // UCOL_NORMALIZATION_MODE is turned on and off automatically.
1061 { "strength", UCOL_STRENGTH },
1062 // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated.
1063 { "numeric", UCOL_NUMERIC_COLLATION }
1064};
1065
1066static const struct {
1067 const char *name;
1068 UColAttributeValue value;
1069} attributeValues[] = {
1070 { "default", UCOL_DEFAULT },
1071 { "primary", UCOL_PRIMARY },
1072 { "secondary", UCOL_SECONDARY },
1073 { "tertiary", UCOL_TERTIARY },
1074 { "quaternary", UCOL_QUATERNARY },
1075 { "identical", UCOL_IDENTICAL },
1076 { "off", UCOL_OFF },
1077 { "on", UCOL_ON },
1078 { "shifted", UCOL_SHIFTED },
1079 { "non-ignorable", UCOL_NON_IGNORABLE },
1080 { "lower", UCOL_LOWER_FIRST },
1081 { "upper", UCOL_UPPER_FIRST }
1082};
1083
1084void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) {
1085 int32_t start = skipSpaces(1);
1086 int32_t equalPos = fileLine.indexOf(0x3d);
1087 if(equalPos < 0) {
1088 if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) {
1089 parseAndSetReorderCodes(start + 7, errorCode);
1090 return;
1091 }
1092 errln("missing '=' on line %d", (int)fileLineNumber);
1093 infoln(fileLine);
1094 errorCode.set(U_PARSE_ERROR);
1095 return;
1096 }
1097
1098 UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos);
1099 UnicodeString valueString = fileLine.tempSubString(equalPos+1);
1100 if(attrString == UNICODE_STRING("maxVariable", 11)) {
1101 UColReorderCode max;
1102 if(valueString == UNICODE_STRING("space", 5)) {
1103 max = UCOL_REORDER_CODE_SPACE;
1104 } else if(valueString == UNICODE_STRING("punct", 5)) {
1105 max = UCOL_REORDER_CODE_PUNCTUATION;
1106 } else if(valueString == UNICODE_STRING("symbol", 6)) {
1107 max = UCOL_REORDER_CODE_SYMBOL;
1108 } else if(valueString == UNICODE_STRING("currency", 8)) {
1109 max = UCOL_REORDER_CODE_CURRENCY;
1110 } else {
1111 errln("invalid attribute value name on line %d", (int)fileLineNumber);
1112 infoln(fileLine);
1113 errorCode.set(U_PARSE_ERROR);
1114 return;
1115 }
1116 coll->setMaxVariable(max, errorCode);
1117 if(errorCode.isFailure()) {
1118 errln("setMaxVariable() failed on line %d: %s",
1119 (int)fileLineNumber, errorCode.errorName());
1120 infoln(fileLine);
1121 return;
1122 }
1123 fileLine.remove();
1124 return;
1125 }
1126
1127 UColAttribute attr;
1128 for(int32_t i = 0;; ++i) {
1129 if(i == LENGTHOF(attributes)) {
1130 errln("invalid attribute name on line %d", (int)fileLineNumber);
1131 infoln(fileLine);
1132 errorCode.set(U_PARSE_ERROR);
1133 return;
1134 }
1135 if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) {
1136 attr = attributes[i].attr;
1137 break;
1138 }
1139 }
1140
1141 UColAttributeValue value;
1142 for(int32_t i = 0;; ++i) {
1143 if(i == LENGTHOF(attributeValues)) {
1144 errln("invalid attribute value name on line %d", (int)fileLineNumber);
1145 infoln(fileLine);
1146 errorCode.set(U_PARSE_ERROR);
1147 return;
1148 }
1149 if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) {
1150 value = attributeValues[i].value;
1151 break;
1152 }
1153 }
1154
1155 coll->setAttribute(attr, value, errorCode);
1156 if(errorCode.isFailure()) {
1157 errln("illegal attribute=value combination on line %d: %s",
1158 (int)fileLineNumber, errorCode.errorName());
1159 infoln(fileLine);
1160 return;
1161 }
1162 fileLine.remove();
1163}
1164
1165void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode) {
1166 UVector32 reorderCodes(errorCode);
1167 while(start < fileLine.length()) {
1168 start = skipSpaces(start);
1169 int32_t limit = start;
1170 while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; }
1171 CharString name;
1172 name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), errorCode);
1173 int32_t code = CollationRuleParser::getReorderCode(name.data());
1174 if(code < -1) {
1175 errln("invalid reorder code '%s' on line %d", name.data(), (int)fileLineNumber);
1176 infoln(fileLine);
1177 errorCode.set(U_PARSE_ERROR);
1178 return;
1179 }
1180 reorderCodes.addElement(code, errorCode);
1181 start = limit;
1182 }
1183 coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
1184 if(errorCode.isFailure()) {
1185 errln("setReorderCodes() failed on line %d: %s", (int)fileLineNumber, errorCode.errorName());
1186 infoln(fileLine);
1187 return;
1188 }
1189 fileLine.remove();
1190}
1191
1192void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1193 UnicodeString rules;
1194 while(readLine(f, errorCode)) {
1195 if(fileLine.isEmpty()) { continue; }
1196 if(isSectionStarter(fileLine[0])) { break; }
1197 rules.append(fileLine.unescape());
1198 }
1199 if(errorCode.isFailure()) { return; }
1200 logln(rules);
1201
1202 UParseError parseError;
1203 UnicodeString reason;
1204 delete coll;
1205 coll = new RuleBasedCollator(rules, parseError, reason, errorCode);
1206 if(coll == NULL) {
1207 errln("unable to allocate a new collator");
1208 errorCode.set(U_MEMORY_ALLOCATION_ERROR);
1209 return;
1210 }
1211 if(errorCode.isFailure()) {
1212 errln("RuleBasedCollator(rules) failed - %s", errorCode.errorName());
1213 infoln(UnicodeString(" reason: ") + reason);
1214 if(parseError.offset >= 0) { infoln(" rules offset: %d", (int)parseError.offset); }
1215 if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1216 infoln(UnicodeString(" snippet: ...") +
1217 parseError.preContext + "(!)" + parseError.postContext + "...");
1218 }
1219 } else {
1220 assertEquals("no error reason when RuleBasedCollator(rules) succeeds",
1221 UnicodeString(), reason);
1222 }
1223}
1224
1225void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) {
1226 if(errorCode.isFailure()) { return; }
1227 delete coll;
1228 coll = Collator::createInstance(Locale::getRoot(), errorCode);
1229 if(errorCode.isFailure()) {
1230 dataerrln("unable to create a root collator");
1231 return;
1232 }
1233}
1234
1235void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) {
1236 if(errorCode.isFailure()) { return; }
1237 CharString langTag;
1238 langTag.appendInvariantChars(fileLine.tempSubString(9), errorCode);
1239 char localeID[ULOC_FULLNAME_CAPACITY];
1240 int32_t parsedLength;
1241 (void)uloc_forLanguageTag(
1242 langTag.data(), localeID, LENGTHOF(localeID), &parsedLength, errorCode);
1243 Locale locale(localeID);
1244 if(fileLine.length() == 9 ||
1245 errorCode.isFailure() || errorCode.get() == U_STRING_NOT_TERMINATED_WARNING ||
1246 parsedLength != langTag.length() || locale.isBogus()) {
1247 errln("invalid language tag on line %d", (int)fileLineNumber);
1248 infoln(fileLine);
1249 if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); }
1250 return;
1251 }
1252
1253 logln("creating a collator for locale ID %s", locale.getName());
1254 Collator *newColl = Collator::createInstance(locale, errorCode);
1255 if(errorCode.isFailure()) {
1256 dataerrln("unable to create a collator for locale %s on line %d",
1257 locale.getName(), (int)fileLineNumber);
1258 infoln(fileLine);
1259 return;
1260 }
1261 delete coll;
1262 coll = newColl;
1263}
1264
1265UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const {
1266 if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return TRUE; }
1267 // In some sequences with Tibetan composite vowel signs,
1268 // even if the string passes the FCD check,
1269 // those composites must be decomposed.
1270 // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1271 int32_t index = 0;
1272 while((index = s.indexOf((UChar)0xf71, index)) >= 0) {
1273 if(++index < s.length()) {
1274 UChar c = s[index];
1275 if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return TRUE; }
1276 }
1277 }
1278 return FALSE;
1279}
1280
1281UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length,
1282 CharString &dest, int32_t partSize,
1283 IcuTestErrorCode &errorCode) {
1284 if(errorCode.isFailure()) { return FALSE; }
1285 uint8_t part[32];
1286 U_ASSERT(partSize <= LENGTHOF(part));
1287 UCharIterator iter;
1288 uiter_setString(&iter, s, length);
1289 uint32_t state[2] = { 0, 0 };
1290 for(;;) {
1291 int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, partSize, errorCode);
1292 UBool done = partLength < partSize;
1293 if(done) {
1294 // At the end, append the next byte as well which should be 00.
1295 ++partLength;
1296 }
1297 dest.append(reinterpret_cast<char *>(part), partLength, errorCode);
1298 if(done) {
1299 return errorCode.isSuccess();
1300 }
1301 }
1302}
1303
1304UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line,
1305 const UChar *s, int32_t length,
1306 CollationKey &key, IcuTestErrorCode &errorCode) {
1307 if(errorCode.isFailure()) { return FALSE; }
1308 coll->getCollationKey(s, length, key, errorCode);
1309 if(errorCode.isFailure()) {
1310 infoln(fileTestName);
1311 errln("Collator(%s).getCollationKey() failed: %s",
1312 norm, errorCode.errorName());
1313 infoln(line);
1314 return FALSE;
1315 }
1316 int32_t keyLength;
1317 const uint8_t *keyBytes = key.getByteArray(keyLength);
1318 if(keyLength == 0 || keyBytes[keyLength - 1] != 0) {
1319 infoln(fileTestName);
1320 errln("Collator(%s).getCollationKey() wrote an empty or unterminated key",
1321 norm);
1322 infoln(line);
1323 infoln(printCollationKey(key));
1324 return FALSE;
1325 }
1326
1327 int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode);
1328 if(numLevels < UCOL_IDENTICAL) {
1329 ++numLevels;
1330 } else {
1331 numLevels = 5;
1332 }
1333 if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) {
1334 ++numLevels;
1335 }
1336 errorCode.assertSuccess();
1337 int32_t numLevelSeparators = 0;
1338 for(int32_t i = 0; i < (keyLength - 1); ++i) {
1339 uint8_t b = keyBytes[i];
1340 if(b == 0) {
1341 infoln(fileTestName);
1342 errln("Collator(%s).getCollationKey() contains a 00 byte", norm);
1343 infoln(line);
1344 infoln(printCollationKey(key));
1345 return FALSE;
1346 }
1347 if(b == 1) { ++numLevelSeparators; }
1348 }
1349 if(numLevelSeparators != (numLevels - 1)) {
1350 infoln(fileTestName);
1351 errln("Collator(%s).getCollationKey() has %d level separators for %d levels",
1352 norm, (int)numLevelSeparators, (int)numLevels);
1353 infoln(line);
1354 infoln(printCollationKey(key));
1355 return FALSE;
1356 }
1357
1358 // If s contains U+FFFE, check that merged segments make the same key.
1359 LocalMemory<uint8_t> mergedKey;
1360 int32_t mergedKeyLength = 0;
1361 int32_t mergedKeyCapacity = 0;
1362 int32_t sLength = (length >= 0) ? length : u_strlen(s);
1363 int32_t segmentStart = 0;
1364 for(int32_t i = 0;;) {
1365 if(i == sLength) {
1366 if(segmentStart == 0) {
1367 // s does not contain any U+FFFE.
1368 break;
1369 }
1370 } else if(s[i] != 0xfffe) {
1371 ++i;
1372 continue;
1373 }
1374 // Get the sort key for another segment and merge it into mergedKey.
1375 CollationKey key1(mergedKey.getAlias(), mergedKeyLength); // copies the bytes
1376 CollationKey key2;
1377 coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCode);
1378 int32_t key1Length, key2Length;
1379 const uint8_t *key1Bytes = key1.getByteArray(key1Length);
1380 const uint8_t *key2Bytes = key2.getByteArray(key2Length);
1381 uint8_t *dest;
1382 int32_t minCapacity = key1Length + key2Length;
1383 if(key1Length > 0) { --minCapacity; }
1384 if(minCapacity <= mergedKeyCapacity) {
1385 dest = mergedKey.getAlias();
1386 } else {
1387 if(minCapacity <= 200) {
1388 mergedKeyCapacity = 200;
1389 } else if(minCapacity <= 2 * mergedKeyCapacity) {
1390 mergedKeyCapacity *= 2;
1391 } else {
1392 mergedKeyCapacity = minCapacity;
1393 }
1394 dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity);
1395 }
1396 U_ASSERT(dest != NULL || mergedKeyCapacity == 0);
1397 if(key1Length == 0) {
1398 // key2 is the sort key for the first segment.
1399 uprv_memcpy(dest, key2Bytes, key2Length);
1400 mergedKeyLength = key2Length;
1401 } else {
1402 mergedKeyLength =
1403 ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length,
1404 dest, mergedKeyCapacity);
1405 }
1406 if(i == sLength) { break; }
1407 segmentStart = ++i;
1408 }
1409 if(segmentStart != 0 &&
1410 (mergedKeyLength != keyLength ||
1411 uprv_memcmp(mergedKey.getAlias(), keyBytes, keyLength) != 0)) {
1412 infoln(fileTestName);
1413 errln("Collator(%s).getCollationKey(with U+FFFE) != "
1414 "ucol_mergeSortkeys(segments)",
1415 norm);
1416 infoln(line);
1417 infoln(printCollationKey(key));
1418 infoln(printSortKey(mergedKey.getAlias(), mergedKeyLength));
1419 return FALSE;
1420 }
1421
1422 // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
1423 static const int32_t partSizes[] = { 32, 3, 1 };
1424 for(int32_t psi = 0; psi < LENGTHOF(partSizes); ++psi) {
1425 int32_t partSize = partSizes[psi];
1426 CharString parts;
1427 if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
1428 infoln(fileTestName);
1429 errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
1430 norm, (int)partSize, errorCode.errorName());
1431 infoln(line);
1432 return FALSE;
1433 }
1434 if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
1435 infoln(fileTestName);
1436 errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
1437 norm, (int)partSize);
1438 infoln(line);
1439 infoln(printCollationKey(key));
1440 infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
1441 return FALSE;
1442 }
1443 }
1444 return TRUE;
1445}
1446
1447namespace {
1448
1449/**
1450 * Replaces unpaired surrogates with U+FFFD.
1451 * Returns s if no replacement was made, otherwise buffer.
1452 */
1453const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buffer) {
1454 int32_t i = 0;
1455 while(i < s.length()) {
1456 UChar32 c = s.char32At(i);
1457 if(U_IS_SURROGATE(c)) {
1458 if(buffer.length() < i) {
1459 buffer.append(s, buffer.length(), i - buffer.length());
1460 }
1461 buffer.append((UChar)0xfffd);
1462 }
1463 i += U16_LENGTH(c);
1464 }
1465 if(buffer.isEmpty()) {
1466 return s;
1467 }
1468 if(buffer.length() < i) {
1469 buffer.append(s, buffer.length(), i - buffer.length());
1470 }
1471 return buffer;
1472}
1473
1474}
1475
1476UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
1477 const UnicodeString &prevString, const UnicodeString &s,
1478 UCollationResult expectedOrder, Collation::Level expectedLevel,
1479 IcuTestErrorCode &errorCode) {
1480 if(errorCode.isFailure()) { return FALSE; }
1481
1482 // Get the sort keys first, for error debug output.
1483 CollationKey prevKey;
1484 if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.length(),
1485 prevKey, errorCode)) {
1486 return FALSE;
1487 }
1488 CollationKey key;
1489 if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCode)) { return FALSE; }
1490
1491 UCollationResult order = coll->compare(prevString, s, errorCode);
1492 if(order != expectedOrder || errorCode.isFailure()) {
1493 infoln(fileTestName);
1494 errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)",
1495 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1496 infoln(prevFileLine);
1497 infoln(fileLine);
1498 infoln(printCollationKey(prevKey));
1499 infoln(printCollationKey(key));
1500 return FALSE;
1501 }
1502 order = coll->compare(s, prevString, errorCode);
1503 if(order != -expectedOrder || errorCode.isFailure()) {
1504 infoln(fileTestName);
1505 errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)",
1506 (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1507 infoln(prevFileLine);
1508 infoln(fileLine);
1509 infoln(printCollationKey(prevKey));
1510 infoln(printCollationKey(key));
1511 return FALSE;
1512 }
1513 // Test NUL-termination if the strings do not contain NUL characters.
1514 UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0) >= 0;
1515 if(!containNUL) {
1516 order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, errorCode);
1517 if(order != expectedOrder || errorCode.isFailure()) {
1518 infoln(fileTestName);
1519 errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1520 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1521 infoln(prevFileLine);
1522 infoln(fileLine);
1523 infoln(printCollationKey(prevKey));
1524 infoln(printCollationKey(key));
1525 return FALSE;
1526 }
1527 order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, errorCode);
1528 if(order != -expectedOrder || errorCode.isFailure()) {
1529 infoln(fileTestName);
1530 errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1531 (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1532 infoln(prevFileLine);
1533 infoln(fileLine);
1534 infoln(printCollationKey(prevKey));
1535 infoln(printCollationKey(key));
1536 return FALSE;
1537 }
1538 }
1539
1540#if U_HAVE_STD_STRING
1541 // compare(UTF-16) treats unpaired surrogates like unassigned code points.
1542 // Unpaired surrogates cannot be converted to UTF-8.
1543 // Create valid UTF-16 strings if necessary, and use those for
1544 // both the expected compare() result and for the input to compare(UTF-8).
1545 UnicodeString prevBuffer, sBuffer;
1546 const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer);
1547 const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer);
1548 std::string prevUTF8, sUTF8;
1549 UnicodeString(prevValid).toUTF8String(prevUTF8);
1550 UnicodeString(sValid).toUTF8String(sUTF8);
1551 UCollationResult expectedUTF8Order;
1552 if(&prevValid == &prevString && &sValid == &s) {
1553 expectedUTF8Order = expectedOrder;
1554 } else {
1555 expectedUTF8Order = coll->compare(prevValid, sValid, errorCode);
1556 }
1557
1558 order = coll->compareUTF8(prevUTF8, sUTF8, errorCode);
1559 if(order != expectedUTF8Order || errorCode.isFailure()) {
1560 infoln(fileTestName);
1561 errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)",
1562 (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1563 infoln(prevFileLine);
1564 infoln(fileLine);
1565 infoln(printCollationKey(prevKey));
1566 infoln(printCollationKey(key));
1567 return FALSE;
1568 }
1569 order = coll->compareUTF8(sUTF8, prevUTF8, errorCode);
1570 if(order != -expectedUTF8Order || errorCode.isFailure()) {
1571 infoln(fileTestName);
1572 errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)",
1573 (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1574 infoln(prevFileLine);
1575 infoln(fileLine);
1576 infoln(printCollationKey(prevKey));
1577 infoln(printCollationKey(key));
1578 return FALSE;
1579 }
1580 // Test NUL-termination if the strings do not contain NUL characters.
1581 if(!containNUL) {
1582 order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -1, errorCode);
1583 if(order != expectedUTF8Order || errorCode.isFailure()) {
1584 infoln(fileTestName);
1585 errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1586 (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1587 infoln(prevFileLine);
1588 infoln(fileLine);
1589 infoln(printCollationKey(prevKey));
1590 infoln(printCollationKey(key));
1591 return FALSE;
1592 }
1593 order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -1, errorCode);
1594 if(order != -expectedUTF8Order || errorCode.isFailure()) {
1595 infoln(fileTestName);
1596 errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1597 (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1598 infoln(prevFileLine);
1599 infoln(fileLine);
1600 infoln(printCollationKey(prevKey));
1601 infoln(printCollationKey(key));
1602 return FALSE;
1603 }
1604 }
1605#endif
1606
1607 UCharIterator leftIter;
1608 UCharIterator rightIter;
1609 uiter_setString(&leftIter, prevString.getBuffer(), prevString.length());
1610 uiter_setString(&rightIter, s.getBuffer(), s.length());
1611 order = coll->compare(leftIter, rightIter, errorCode);
1612 if(order != expectedOrder || errorCode.isFailure()) {
1613 infoln(fileTestName);
1614 errln("line %d Collator(%s).compare(UCharIterator: previous, current) "
1615 "wrong order: %d != %d (%s)",
1616 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1617 infoln(prevFileLine);
1618 infoln(fileLine);
1619 infoln(printCollationKey(prevKey));
1620 infoln(printCollationKey(key));
1621 return FALSE;
1622 }
1623
1624 order = prevKey.compareTo(key, errorCode);
1625 if(order != expectedOrder || errorCode.isFailure()) {
1626 infoln(fileTestName);
1627 errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)",
1628 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1629 infoln(prevFileLine);
1630 infoln(fileLine);
1631 infoln(printCollationKey(prevKey));
1632 infoln(printCollationKey(key));
1633 return FALSE;
1634 }
1635 if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1636 int32_t prevKeyLength;
1637 const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
1638 int32_t keyLength;
1639 const uint8_t *bytes = key.getByteArray(keyLength);
1640 int32_t level = Collation::PRIMARY_LEVEL;
1641 for(int32_t i = 0;; ++i) {
1642 uint8_t b = prevBytes[i];
1643 if(b != bytes[i]) { break; }
1644 if(b == Collation::LEVEL_SEPARATOR_BYTE) {
1645 ++level;
1646 if(level == Collation::CASE_LEVEL &&
1647 coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_OFF) {
1648 ++level;
1649 }
1650 }
1651 }
1652 if(level != expectedLevel) {
1653 infoln(fileTestName);
1654 errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
1655 (int)fileLineNumber, norm, order, level, expectedLevel);
1656 infoln(prevFileLine);
1657 infoln(fileLine);
1658 infoln(printCollationKey(prevKey));
1659 infoln(printCollationKey(key));
1660 return FALSE;
1661 }
1662 }
1663 return TRUE;
1664}
1665
1666void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1667 if(errorCode.isFailure()) { return; }
1668 UnicodeString prevFileLine = UNICODE_STRING("(none)", 6);
1669 UnicodeString prevString, s;
1670 prevString.getTerminatedBuffer(); // Ensure NUL-termination.
1671 while(readLine(f, errorCode)) {
1672 if(fileLine.isEmpty()) { continue; }
1673 if(isSectionStarter(fileLine[0])) { break; }
1674 Collation::Level relation = parseRelationAndString(s, errorCode);
1675 if(errorCode.isFailure()) {
1676 errorCode.reset();
1677 break;
1678 }
1679 UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? UCOL_EQUAL : UCOL_LESS;
1680 Collation::Level expectedLevel = relation;
1681 s.getTerminatedBuffer(); // Ensure NUL-termination.
1682 UBool isOk = TRUE;
1683 if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) {
1684 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode);
1685 isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
1686 expectedOrder, expectedLevel, errorCode);
1687 }
1688 if(isOk) {
1689 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode);
1690 isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
1691 expectedOrder, expectedLevel, errorCode);
1692 }
1693 if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormalized(s, errorCode))) {
1694 UnicodeString pn = nfd->normalize(prevString, errorCode);
1695 UnicodeString n = nfd->normalize(s, errorCode);
1696 pn.getTerminatedBuffer();
1697 n.getTerminatedBuffer();
1698 errorCode.assertSuccess();
1699 isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
1700 expectedOrder, expectedLevel, errorCode);
1701 }
1702 if(!isOk) {
1703 errorCode.reset(); // already reported
1704 }
1705 prevFileLine = fileLine;
1706 prevString = s;
1707 prevString.getTerminatedBuffer(); // Ensure NUL-termination.
1708 }
1709}
1710
1711void CollationTest::TestDataDriven() {
1712 IcuTestErrorCode errorCode(*this, "TestDataDriven");
1713
1714 fcd = Normalizer2Factory::getFCDInstance(errorCode);
1715 nfd = Normalizer2Factory::getNFDInstance(errorCode);
1716 if(errorCode.logDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) {
1717 return;
1718 }
1719
1720 CharString path(getSourceTestData(errorCode), errorCode);
1721 path.appendPathPart("collationtest.txt", errorCode);
1722 const char *codePage = "UTF-8";
1723 LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, errorCode));
1724 if(errorCode.logIfFailureAndReset("ucbuf_open(collationtest.txt)")) {
1725 return;
1726 }
1727 while(errorCode.isSuccess()) {
1728 // Read a new line if necessary.
1729 // Sub-parsers leave the first line set that they do not handle.
1730 if(fileLine.isEmpty()) {
1731 if(!readLine(f.getAlias(), errorCode)) { break; }
1732 continue;
1733 }
1734 if(!isSectionStarter(fileLine[0])) {
1735 errln("syntax error on line %d", (int)fileLineNumber);
1736 infoln(fileLine);
1737 return;
1738 }
1739 if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) {
1740 fileTestName = fileLine;
1741 logln(fileLine);
1742 fileLine.remove();
1743 } else if(fileLine == UNICODE_STRING("@ root", 6)) {
1744 setRootCollator(errorCode);
1745 fileLine.remove();
1746 } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) {
1747 setLocaleCollator(errorCode);
1748 fileLine.remove();
1749 } else if(fileLine == UNICODE_STRING("@ rules", 7)) {
1750 buildTailoring(f.getAlias(), errorCode);
1751 } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) { // %
1752 parseAndSetAttribute(errorCode);
1753 } else if(fileLine == UNICODE_STRING("* compare", 9)) {
1754 checkCompareStrings(f.getAlias(), errorCode);
1755 } else {
1756 errln("syntax error on line %d", (int)fileLineNumber);
1757 infoln(fileLine);
1758 return;
1759 }
1760 }
1761}
1762
1763#endif // !UCONFIG_NO_COLLATION