2 *******************************************************************************
3 * Copyright (C) 2012-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
8 * created on: 2012apr27
9 * created by: Markus W. Scherer
12 #include "unicode/utypes.h"
14 #if !UCONFIG_NO_COLLATION
16 #include "unicode/coll.h"
17 #include "unicode/errorcode.h"
18 #include "unicode/localpointer.h"
19 #include "unicode/normalizer2.h"
20 #include "unicode/sortkey.h"
21 #include "unicode/std_string.h"
22 #include "unicode/strenum.h"
23 #include "unicode/tblcoll.h"
24 #include "unicode/uiter.h"
25 #include "unicode/uniset.h"
26 #include "unicode/unistr.h"
27 #include "unicode/usetiter.h"
28 #include "unicode/ustring.h"
31 #include "collation.h"
32 #include "collationdata.h"
33 #include "collationfcd.h"
34 #include "collationiterator.h"
35 #include "collationroot.h"
36 #include "collationrootelements.h"
37 #include "collationruleparser.h"
38 #include "collationweights.h"
41 #include "normalizer2impl.h"
44 #include "uitercollationiterator.h"
45 #include "utf16collationiterator.h"
46 #include "utf8collationiterator.h"
// Element count of a fixed-size C array, converted to int32_t.
// Only meaningful for real arrays; a pointer argument gives a nonsense result.
#define LENGTHOF(arr) (int32_t)(sizeof(arr) / sizeof((arr)[0]))
53 // TODO: Move to ucbuf.h
54 U_DEFINE_LOCAL_OPEN_POINTER(LocalUCHARBUFPointer
, UCHARBUF
, ucbuf_close
);
56 class CodePointIterator
;
58 // TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey)
60 class CollationTest
: public IntlTest
{
63 : fcd(NULL
), nfd(NULL
),
71 void runIndexedTest(int32_t index
, UBool exec
, const char *&name
, char *par
=NULL
);
75 void TestNulTerminated();
76 void TestIllegalUTF8();
77 void TestShortFCDData();
79 void TestCollationWeights();
80 void TestRootElements();
81 void TestTailoredElements();
82 void TestDataDriven();
85 void checkFCD(const char *name
, CollationIterator
&ci
, CodePointIterator
&cpi
);
86 void checkAllocWeights(CollationWeights
&cw
,
87 uint32_t lowerLimit
, uint32_t upperLimit
, int32_t n
,
88 int32_t someLength
, int32_t minCount
);
90 static UnicodeString
printSortKey(const uint8_t *p
, int32_t length
);
91 static UnicodeString
printCollationKey(const CollationKey
&key
);
93 // Helpers & fields for data-driven test.
94 static UBool
isCROrLF(UChar c
) { return c
== 0xa || c
== 0xd; }
95 static UBool
isSpace(UChar c
) { return c
== 9 || c
== 0x20 || c
== 0x3000; }
96 static UBool
isSectionStarter(UChar c
) { return c
== 0x25 || c
== 0x2a || c
== 0x40; } // %*@
// Advances index i while fileLine[i] satisfies isSpace().
// NOTE(review): the end of this body is not visible in this view;
// presumably it returns the advanced index - confirm against the full file.
97 int32_t skipSpaces(int32_t i
) {
98 while(isSpace(fileLine
[i
])) { ++i
; }
102 UBool
readLine(UCHARBUF
*f
, IcuTestErrorCode
&errorCode
);
103 void parseString(int32_t &start
, UnicodeString
&prefix
, UnicodeString
&s
, UErrorCode
&errorCode
);
104 Collation::Level
parseRelationAndString(UnicodeString
&s
, IcuTestErrorCode
&errorCode
);
105 void parseAndSetAttribute(IcuTestErrorCode
&errorCode
);
106 void parseAndSetReorderCodes(int32_t start
, IcuTestErrorCode
&errorCode
);
107 void buildTailoring(UCHARBUF
*f
, IcuTestErrorCode
&errorCode
);
108 void setRootCollator(IcuTestErrorCode
&errorCode
);
109 void setLocaleCollator(IcuTestErrorCode
&errorCode
);
111 UBool
needsNormalization(const UnicodeString
&s
, UErrorCode
&errorCode
) const;
113 UBool
getSortKeyParts(const UChar
*s
, int32_t length
,
114 CharString
&dest
, int32_t partSize
,
115 IcuTestErrorCode
&errorCode
);
116 UBool
getCollationKey(const char *norm
, const UnicodeString
&line
,
117 const UChar
*s
, int32_t length
,
118 CollationKey
&key
, IcuTestErrorCode
&errorCode
);
119 UBool
checkCompareTwo(const char *norm
, const UnicodeString
&prevFileLine
,
120 const UnicodeString
&prevString
, const UnicodeString
&s
,
121 UCollationResult expectedOrder
, Collation::Level expectedLevel
,
122 IcuTestErrorCode
&errorCode
);
123 void checkCompareStrings(UCHARBUF
*f
, IcuTestErrorCode
&errorCode
);
125 const Normalizer2
*fcd
, *nfd
;
126 UnicodeString fileLine
;
127 int32_t fileLineNumber
;
128 UnicodeString fileTestName
;
// Factory function: heap-allocates a CollationTest and returns it as an IntlTest pointer.
// The caller receives a raw pointer from new; who deletes it is not visible here.
132 extern IntlTest
*createCollationTest() {
133 return new CollationTest();
// Test dispatcher: logs the suite name and registers each test method
// through TESTCASE_AUTO. The par argument is unused (its name is commented out).
// NOTE(review): any TESTCASE_AUTO_BEGIN/END bracketing is not visible in this view.
136 void CollationTest::runIndexedTest(int32_t index
, UBool exec
, const char *&name
, char * /*par*/) {
138 logln("TestSuite CollationTest: ");
141 TESTCASE_AUTO(TestMinMax
);
142 TESTCASE_AUTO(TestImplicits
);
143 TESTCASE_AUTO(TestNulTerminated
);
144 TESTCASE_AUTO(TestIllegalUTF8
);
145 TESTCASE_AUTO(TestShortFCDData
);
146 TESTCASE_AUTO(TestFCD
);
147 TESTCASE_AUTO(TestCollationWeights
);
148 TESTCASE_AUTO(TestRootElements
);
149 TESTCASE_AUTO(TestTailoredElements
);
150 TESTCASE_AUTO(TestDataDriven
);
// Checks the collation elements (CEs) that the root collator produces for the
// two-code-unit string <U+FFFE, U+FFFF>: exactly two CEs are expected. The
// first is checked against the merge-separator CE and the second against
// Collation::makeCE(Collation::MAX_PRIMARY), judging by the error messages
// (the comparisons themselves are on lines not visible in this view).
154 void CollationTest::TestMinMax() {
155 IcuTestErrorCode
errorCode(*this, "TestMinMax");
157 setRootCollator(errorCode
);
158 if(errorCode
.isFailure()) {
// internalGetCEs() is called on RuleBasedCollator, hence the downcast.
162 RuleBasedCollator
*rbc
= dynamic_cast<RuleBasedCollator
*>(coll
);
164 errln("the root collator is not a RuleBasedCollator");
168 static const UChar s
[2] = { 0xfffe, 0xffff };
169 UVector64
ces(errorCode
);
// UnicodeString(FALSE, s, 2): not NUL-terminated, length 2.
170 rbc
->internalGetCEs(UnicodeString(FALSE
, s
, 2), ces
, errorCode
);
171 errorCode
.assertSuccess();
172 if(ces
.size() != 2) {
173 errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces
.size());
176 int64_t ce
= ces
.elementAti(0);
// Expected CE for U+FFFE: merge-separator primary in the upper 32 bits,
// MERGE_SEPARATOR_LOWER32 in the lower 32 bits.
178 ((int64_t)Collation::MERGE_SEPARATOR_PRIMARY
<< 32) |
179 Collation::MERGE_SEPARATOR_LOWER32
;
// NOTE(review): ce is a 64-bit value but is printed through a (long) cast;
// where long is 32 bits the upper half of the CE would be dropped from the message.
181 errln("CE(U+fffe)=%04lx != 02.02.02", (long)ce
);
184 ce
= ces
.elementAti(1);
185 expected
= Collation::makeCE(Collation::MAX_PRIMARY
);
187 errln("CE(U+ffff)=%04lx != max..", (long)ce
);
// Verifies implicit primary weights in the root collation data. For three
// sets in order (core Han, other Han, unassigned/surrogate/private-use code
// points), each code point must map to exactly one CE with common
// secondary/tertiary weights, and the primaries must be strictly ascending
// across the whole iteration.
191 void CollationTest::TestImplicits() {
192 IcuTestErrorCode
errorCode(*this, "TestImplicits");
194 const CollationData
*cd
= CollationRoot::getData(errorCode
);
// NOTE(review): the log label says getBaseData() but the call above is getData().
195 if(errorCode
.logDataIfFailureAndReset("CollationRoot::getBaseData()")) {
199 // Implicit primary weights should be assigned for the following sets,
200 // and sort in ascending order by set and then code point.
201 // See http://www.unicode.org/reports/tr10/#Implicit_Weights
202 // core Han Unified Ideographs
203 UnicodeSet
coreHan("[\\p{unified_ideograph}&"
204 "[\\p{Block=CJK_Unified_Ideographs}"
205 "\\p{Block=CJK_Compatibility_Ideographs}]]",
207 // all other Unified Han ideographs
208 UnicodeSet
otherHan("[\\p{unified ideograph}-"
209 "[\\p{Block=CJK_Unified_Ideographs}"
210 "\\p{Block=CJK_Compatibility_Ideographs}]]",
212 UnicodeSet
unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode
);
213 unassigned
.remove(0xfffe, 0xffff); // These have special CLDR root mappings.
214 if(errorCode
.logIfFailureAndReset("UnicodeSet")) {
// The order of this array is the expected ascending order of the primaries.
217 const UnicodeSet
*sets
[] = { &coreHan
, &otherHan
, &unassigned
};
219 uint32_t prevPrimary
= 0;
// One iterator is reused for every code point; its text is reset per iteration.
220 UTF16CollationIterator
ci(cd
, FALSE
, NULL
, NULL
, NULL
);
221 for(int32_t i
= 0; i
< LENGTHOF(sets
); ++i
) {
222 LocalPointer
<UnicodeSetIterator
> iter(new UnicodeSetIterator(*sets
[i
]));
223 while(iter
->next()) {
224 UChar32 c
= iter
->getCodepoint();
// NOTE(review): the declaration of s (presumably a UnicodeString built from c)
// is on a line not visible in this view.
226 ci
.setText(s
.getBuffer(), s
.getBuffer() + s
.length());
// Fetch two CEs: the first must be real, the second must be NO_CE.
227 int64_t ce
= ci
.nextCE(errorCode
);
228 int64_t ce2
= ci
.nextCE(errorCode
);
229 if(errorCode
.logIfFailureAndReset("CollationIterator.nextCE()")) {
232 if(ce
== Collation::NO_CE
|| ce2
!= Collation::NO_CE
) {
233 errln("CollationIterator.nextCE(U+%04lx) did not yield exactly one CE", (long)c
);
// Lower 32 bits hold the secondary/tertiary part of the CE.
236 if((ce
& 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE
) {
237 errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx",
238 (long)c
, (long)(ce
& 0xffffffff));
// Upper 32 bits hold the primary weight.
241 uint32_t primary
= (uint32_t)(ce
>> 32);
242 if(!(primary
> prevPrimary
)) {
// NOTE(review): prev is declared/updated on lines not visible in this view.
243 errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..",
244 (long)c
, (long)primary
, (long)prev
, (long)prevPrimary
);
247 prevPrimary
= primary
;
// Compares CE iteration over the same two code units ("ab") supplied two ways:
// ci1 gets an explicit limit (s + 2), ci2 gets a NULL limit and starts at s + 2,
// where the text is followed by the terminating 0 in s. The two iterators are
// advanced in lockstep until ci1 yields NO_CE.
252 void CollationTest::TestNulTerminated() {
253 IcuTestErrorCode
errorCode(*this, "TestNulTerminated");
254 const CollationData
*data
= CollationRoot::getData(errorCode
);
255 if(errorCode
.logDataIfFailureAndReset("CollationRoot::getData()")) {
// "abab" followed by NUL.
259 static const UChar s
[] = { 0x61, 0x62, 0x61, 0x62, 0 };
261 UTF16CollationIterator
ci1(data
, FALSE
, s
, s
, s
+ 2);
262 UTF16CollationIterator
ci2(data
, FALSE
, s
+ 2, s
+ 2, NULL
);
263 for(int32_t i
= 0;; ++i
) {
264 int64_t ce1
= ci1
.nextCE(errorCode
);
265 int64_t ce2
= ci2
.nextCE(errorCode
);
266 if(errorCode
.logIfFailureAndReset("CollationIterator.nextCE()")) {
// NOTE(review): the ce1/ce2 comparison guarding this message is not visible in this view.
270 errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i
);
273 if(ce1
== Collation::NO_CE
) { break; }
// Sets the root collator to identical strength and compares strings[0]
// against each following entry, all of which contain an illegal UTF-8 byte
// sequence between 'a' and 'z'. Every comparison must return UCOL_EQUAL.
// NOTE(review): strings[0] itself is on a line not visible here; the error
// message suggests it is the same text with U+FFFD in place of the bad bytes.
277 void CollationTest::TestIllegalUTF8() {
278 IcuTestErrorCode
errorCode(*this, "TestIllegalUTF8");
280 setRootCollator(errorCode
);
281 if(errorCode
.isFailure()) {
285 coll
->setAttribute(UCOL_STRENGTH
, UCOL_IDENTICAL
, errorCode
);
287 static const char *strings
[] = {
290 // illegal byte sequences
291 "a\x80z", // trail byte
292 "a\xc1\x81z", // non-shortest form
293 "a\xe0\x82\x83z", // non-shortest form
294 "a\xed\xa0\x80z", // lead surrogate: would be U+D800
295 "a\xed\xbf\xbfz", // trail surrogate: would be U+DFFF
296 "a\xf0\x8f\xbf\xbfz", // non-shortest form
297 "a\xf4\x90\x80\x80z" // out of range: would be U+110000
300 StringPiece
fffd(strings
[0]);
// Start at 1: entry 0 is the reference string.
301 for(int32_t i
= 1; i
< LENGTHOF(strings
); ++i
) {
302 StringPiece
illegal(strings
[i
]);
303 UCollationResult order
= coll
->compareUTF8(fffd
, illegal
, errorCode
);
304 if(order
!= UCOL_EQUAL
) {
305 errln("compareUTF8(U+FFFD, string %d with illegal UTF-8)=%d != UCOL_EQUAL",
// Walks the supplementary range U+10000..U+10FFFF in blocks of 0x400 code
// points. For every block in which src contains at least one code point,
// adds U16_LEAD(c) for the block's first code point to dest.
// src and dest may be the same set (see the callers in TestShortFCDData).
// NOTE(review): the loop increment (presumably c = next) is not visible in this view.
313 void addLeadSurrogatesForSupplementary(const UnicodeSet
&src
, UnicodeSet
&dest
) {
314 for(UChar32 c
= 0x10000; c
< 0x110000;) {
315 UChar32 next
= c
+ 0x400;
316 if(src
.containsSome(c
, next
- 1)) {
317 dest
.add(U16_LEAD(c
));
// Cross-checks CollationFCD::hasLccc()/hasTccc() for all BMP code units
// against sets built from the lccc/tccc Unicode properties. The expected sets
// are extended with surrogates, and each difference set (expected-actual,
// actual-expected) must print as the empty pattern "[]".
325 void CollationTest::TestShortFCDData() {
326 // See CollationFCD class comments.
327 IcuTestErrorCode
errorCode(*this, "TestShortFCDData");
328 UnicodeSet
expectedLccc("[:^lccc=0:]", errorCode
);
329 errorCode
.assertSuccess();
330 expectedLccc
.add(0xdc00, 0xdfff); // add all trail surrogates
331 addLeadSurrogatesForSupplementary(expectedLccc
, expectedLccc
);
332 UnicodeSet lccc
; // actual
333 for(UChar32 c
= 0; c
<= 0xffff; ++c
) {
334 if(CollationFCD::hasLccc(c
)) { lccc
.add(c
); }
// expected - actual
336 UnicodeSet
diff(expectedLccc
);
337 diff
.removeAll(lccc
);
338 diff
.remove(0x10000, 0x10ffff); // hasLccc() only works for the BMP
339 UnicodeString
empty("[]");
340 UnicodeString diffString
;
341 diff
.toPattern(diffString
, TRUE
);
342 assertEquals("CollationFCD::hasLccc() expected-actual", empty
, diffString
);
// actual - expected
// NOTE(review): the reassignment of diff before this removeAll() is not visible in this view.
344 diff
.removeAll(expectedLccc
);
345 diff
.toPattern(diffString
, TRUE
);
346 assertEquals("CollationFCD::hasLccc() actual-expected", empty
, diffString
, TRUE
);
348 UnicodeSet
expectedTccc("[:^tccc=0:]", errorCode
);
349 if (errorCode
.isSuccess()) {
// Lead surrogates are added for supplementary code points from both the lccc and the tccc sets.
350 addLeadSurrogatesForSupplementary(expectedLccc
, expectedTccc
);
351 addLeadSurrogatesForSupplementary(expectedTccc
, expectedTccc
);
352 UnicodeSet tccc
; // actual
353 for(UChar32 c
= 0; c
<= 0xffff; ++c
) {
354 if(CollationFCD::hasTccc(c
)) { tccc
.add(c
); }
357 diff
.removeAll(tccc
);
358 diff
.remove(0x10000, 0x10ffff); // hasTccc() only works for the BMP
// NOTE(review): diffString is not regenerated with diff.toPattern() between
// the remove() above and this assertion, so the check appears to compare the
// stale pattern left over from the lccc actual-expected step. Confirm and add
// a toPattern() call if so.
359 assertEquals("CollationFCD::hasTccc() expected-actual", empty
, diffString
);
361 diff
.removeAll(expectedTccc
);
362 diff
.toPattern(diffString
, TRUE
);
363 assertEquals("CollationFCD::hasTccc() actual-expected", empty
, diffString
);
// Minimal bidirectional cursor over a caller-supplied array of code points.
// The array is referenced, not copied. next() and previous() return
// U_SENTINEL once the cursor reaches the end or the start, respectively.
// NOTE(review): the member declarations (cp, length, pos) are on lines not visible in this view.
367 class CodePointIterator
{
// Starts at position 0.
369 CodePointIterator(const UChar32
*cp
, int32_t length
) : cp(cp
), length(length
), pos(0) {}
370 void resetToStart() { pos
= 0; }
// Returns the code point at the cursor and advances, or U_SENTINEL at the end.
371 UChar32
next() { return (pos
< length
) ? cp
[pos
++] : U_SENTINEL
; }
// Steps back and returns that code point, or U_SENTINEL at the start.
372 UChar32
previous() { return (pos
> 0) ? cp
[--pos
] : U_SENTINEL
; }
373 int32_t getLength() const { return length
; }
374 int getIndex() const { return (int)pos
; }
// Drives a CollationIterator (ci) and a reference CodePointIterator (cpi) in
// lockstep and reports any code point mismatch. Four passes: forward to the
// limit, backward for two thirds of the length, forward to the limit again,
// then backward to the start. name labels the iterator type in messages.
// NOTE(review): the loop headers and the c1 != c2 checks around the errln()
// calls are on lines not visible in this view.
381 void CollationTest::checkFCD(const char *name
,
382 CollationIterator
&ci
, CodePointIterator
&cpi
) {
383 IcuTestErrorCode
errorCode(*this, "checkFCD");
385 // Iterate forward to the limit.
387 UChar32 c1
= ci
.nextCodePoint(errorCode
);
388 UChar32 c2
= cpi
.next();
390 errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d",
391 name
, (long)c1
, (long)c2
, cpi
.getIndex());
// A negative code point ends the pass.
394 if(c1
< 0) { break; }
397 // Iterate backward most of the way.
398 for(int32_t n
= (cpi
.getLength() * 2) / 3; n
> 0; --n
) {
399 UChar32 c1
= ci
.previousCodePoint(errorCode
);
400 UChar32 c2
= cpi
.previous();
402 errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d",
403 name
, (long)c1
, (long)c2
, cpi
.getIndex());
// Third pass: forward to the limit again.
410 UChar32 c1
= ci
.nextCodePoint(errorCode
);
411 UChar32 c2
= cpi
.next();
413 errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d",
414 name
, (long)c1
, (long)c2
, cpi
.getIndex());
417 if(c1
< 0) { break; }
420 // Iterate backward to the start.
422 UChar32 c1
= ci
.previousCodePoint(errorCode
);
423 UChar32 c2
= cpi
.previous();
425 errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d",
426 name
, (long)c1
, (long)c2
, cpi
.getIndex());
429 if(c1
< 0) { break; }
// Feeds one non-FCD input string to three FCD-checking collation iterators
// (UTF-16, UTF-8, UCharIterator) and uses checkFCD() to compare the code
// points each delivers with the expected sequence in cp[].
// NOTE(review): parts of both arrays and some declarations (utf8, iter) are
// on lines not visible in this view.
433 void CollationTest::TestFCD() {
434 IcuTestErrorCode
errorCode(*this, "TestFCD");
435 const CollationData
*data
= CollationRoot::getData(errorCode
);
436 if(errorCode
.logDataIfFailureAndReset("CollationRoot::getData()")) {
440 // Input string, not FCD, NUL-terminated.
441 static const UChar s
[] = {
442 0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62,
443 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F), // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
444 0x327, 0x308, // ccc=202, 230
445 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
446 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),
447 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),
449 0xe7, // Character with tccc!=0 decomposed together with mis-ordered sequence.
450 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165),
451 0xe1, // Character with tccc!=0 decomposed together with decomposed sequence.
452 0xf73, 0xf75, // Tibetan composite vowels must be decomposed.
456 // Expected code points.
457 static const UChar32 cp
[] = {
458 0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
459 0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
462 0x63, 0x327, 0x1D165, 0x1D16D,
464 0xf71, 0xf71, 0xf72, 0xf74, 0x301,
// UTF-16 iterator over the NUL-terminated input (limit NULL).
468 FCDUTF16CollationIterator
u16ci(data
, FALSE
, s
, s
, NULL
);
469 if(errorCode
.logIfFailureAndReset("FCDUTF16CollationIterator constructor")) {
// The same cpi is passed to every checkFCD() call.
472 CodePointIterator
cpi(cp
, LENGTHOF(cp
));
473 checkFCD("FCDUTF16CollationIterator", u16ci
, cpi
);
// The UTF-8 variant needs std::string for the conversion.
475 #if U_HAVE_STD_STRING
478 UnicodeString(s
).toUTF8String(utf8
);
479 FCDUTF8CollationIterator
u8ci(data
, FALSE
,
480 reinterpret_cast<const uint8_t *>(utf8
.c_str()), 0, -1);
481 if(errorCode
.logIfFailureAndReset("FCDUTF8CollationIterator constructor")) {
484 checkFCD("FCDUTF8CollationIterator", u8ci
, cpi
);
489 uiter_setString(&iter
, s
, LENGTHOF(s
) - 1); // -1: without the terminating NUL
490 FCDUIterCollationIterator
uici(data
, FALSE
, iter
, 0);
491 if(errorCode
.logIfFailureAndReset("FCDUIterCollationIterator constructor")) {
494 checkFCD("FCDUIterCollationIterator", uici
, cpi
);
// Asks cw to allocate n weights strictly between lowerLimit and upperLimit,
// then checks the sequence returned by nextWeight():
//  - all n weights must be delivered (0xffffffff means the sequence ran out),
//  - each weight must be greater than the previous one (starting from
//    lowerLimit) and less than upperLimit,
//  - at least minCount weights must have length someLength, as reported by
//    CollationWeights::lengthOfWeight().
// NOTE(review): the update of previous inside the loop and the early returns
// after errln() are on lines not visible in this view.
497 void CollationTest::checkAllocWeights(CollationWeights
&cw
,
498 uint32_t lowerLimit
, uint32_t upperLimit
, int32_t n
,
499 int32_t someLength
, int32_t minCount
) {
500 if(!cw
.allocWeights(lowerLimit
, upperLimit
, n
)) {
501 errln("CollationWeights::allocWeights(%lx, %lx, %ld) = FALSE",
502 (long)lowerLimit
, (long)upperLimit
, (long)n
);
505 uint32_t previous
= lowerLimit
;
506 int32_t count
= 0; // number of weights that have someLength
507 for(int32_t i
= 0; i
< n
; ++i
) {
508 uint32_t w
= cw
.nextWeight();
// 0xffffffff is treated as "no more weights".
509 if(w
== 0xffffffff) {
510 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
511 "returns only %ld weights",
512 (long)lowerLimit
, (long)upperLimit
, (long)n
, (long)i
);
515 if(!(previous
< w
&& w
< upperLimit
)) {
516 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
517 "number %ld -> %lx not between %lx and %lx",
518 (long)lowerLimit
, (long)upperLimit
, (long)n
,
519 (long)(i
+ 1), (long)w
, (long)previous
, (long)upperLimit
);
522 if(CollationWeights::lengthOfWeight(w
) == someLength
) { ++count
; }
524 if(count
< minCount
) {
525 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
526 "returns only %ld < %ld weights of length %d",
527 (long)lowerLimit
, (long)upperLimit
, (long)n
,
528 (long)count
, (long)minCount
, (int)someLength
);
// Exercises CollationWeights allocation through checkAllocWeights() in four
// configurations: non-compressible primaries, compressible primaries,
// secondaries and tertiaries. The arguments of each call are
// (lowerLimit, upperLimit, n, someLength, minCount).
// NOTE(review): the declaration of cw is on a line not visible in this view.
532 void CollationTest::TestCollationWeights() {
535 // Non-compressible primaries use 254 second bytes 02..FF.
536 logln("CollationWeights.initForPrimary(non-compressible)");
537 cw
.initForPrimary(FALSE
);
538 // Expect 1 weight 11 and 254 weights 12xx.
539 checkAllocWeights(cw
, 0x10000000, 0x13000000, 255, 1, 1);
540 checkAllocWeights(cw
, 0x10000000, 0x13000000, 255, 2, 254);
541 // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
542 checkAllocWeights(cw
, 0x10fefe40, 0x12030300, 260, 2, 255);
543 // Expect 254 two-byte weights from the ranges 10ff and 11xx.
544 checkAllocWeights(cw
, 0x10fefe40, 0x12030300, 600, 2, 254);
545 // Expect 254^2=64516 three-byte weights.
546 // During computation, there should be 3 three-byte ranges
547 // 10ffff, 11xxxx, 120202.
548 // The middle one should be split 64515:1,
549 // and the newly-split-off range and the last range lengthened.
550 checkAllocWeights(cw
, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516);
551 // Expect weights 1102 & 1103.
552 checkAllocWeights(cw
, 0x10ff0000, 0x11040000, 2, 2, 2);
553 // Expect weights 102102 & 102103.
554 checkAllocWeights(cw
, 0x1020ff00, 0x10210400, 2, 3, 2);
556 // Compressible primaries use 251 second bytes 04..FE.
557 logln("CollationWeights.initForPrimary(compressible)");
558 cw
.initForPrimary(TRUE
);
559 // Expect 1 weight 11 and 251 weights 12xx.
560 checkAllocWeights(cw
, 0x10000000, 0x13000000, 252, 1, 1);
561 checkAllocWeights(cw
, 0x10000000, 0x13000000, 252, 2, 251);
562 // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
563 checkAllocWeights(cw
, 0x10fdfe40, 0x12050300, 260, 2, 252);
564 // Expect weights 1104 & 1105.
565 checkAllocWeights(cw
, 0x10fe0000, 0x11060000, 2, 2, 2);
566 // Expect weights 102102 & 102103.
567 checkAllocWeights(cw
, 0x1020ff00, 0x10210400, 2, 3, 2);
569 // Secondary and tertiary weights use only bytes 3 & 4.
570 logln("CollationWeights.initForSecondary()");
571 cw
.initForSecondary();
572 // Expect weights fbxx and all four fc..ff.
573 checkAllocWeights(cw
, 0xfb20, 0x10000, 20, 3, 4);
575 logln("CollationWeights.initForTertiary()");
576 cw
.initForTertiary();
577 // Expect weights 3dxx and both 3e & 3f.
578 checkAllocWeights(cw
, 0x3d02, 0x4000, 10, 3, 2);
// Well-formedness check for one collation element given as primary weight p
// (32 bits), secondary weight s (16 bits) and ctq (case, tertiary and
// quaternary bits, 16 bits). re supplies the secondary/tertiary boundaries,
// data the compressibility of primary lead bytes.
// NOTE(review): every return statement and several branch headers in the last
// part of the function are on lines not visible in this view, so the result
// of each check is inferred from the existing comments only.
583 UBool
isValidCE(const CollationRootElements
&re
, const CollationData
&data
,
584 uint32_t p
, uint32_t s
, uint32_t ctq
) {
// Split the weights into their bytes, most significant first.
585 uint32_t p1
= p
>> 24;
586 uint32_t p2
= (p
>> 16) & 0xff;
587 uint32_t p3
= (p
>> 8) & 0xff;
588 uint32_t p4
= p
& 0xff;
589 uint32_t s1
= s
>> 8;
590 uint32_t s2
= s
& 0xff;
591 // ctq = Case, Tertiary, Quaternary
592 uint32_t c
= (ctq
& Collation::CASE_MASK
) >> 14;
593 uint32_t t
= ctq
& Collation::ONLY_TERTIARY_MASK
;
594 uint32_t t1
= t
>> 8;
595 uint32_t t2
= t
& 0xff;
596 uint32_t q
= ctq
& Collation::QUATERNARY_MASK
;
597 // No leading zero bytes.
598 if((p
!= 0 && p1
== 0) || (s
!= 0 && s1
== 0) || (t
!= 0 && t1
== 0)) {
601 // No intermediate zero bytes.
602 if(p1
!= 0 && p2
== 0 && (p
& 0xffff) != 0) {
605 if(p2
!= 0 && p3
== 0 && p4
!= 0) {
608 // Minimum & maximum lead bytes.
609 if((p1
!= 0 && p1
<= Collation::MERGE_SEPARATOR_BYTE
) ||
610 (s1
!= 0 && s1
<= Collation::MERGE_SEPARATOR_BYTE
) ||
611 (t1
!= 0 && t1
<= Collation::MERGE_SEPARATOR_BYTE
)) {
// Tertiary lead byte above 0x3f.
614 if(t1
!= 0 && t1
> 0x3f) {
620 // The valid byte range for the second primary byte depends on compressibility.
622 if(data
.isCompressibleLeadByte(p1
)) {
623 if(p2
<= Collation::PRIMARY_COMPRESSION_LOW_BYTE
||
624 Collation::PRIMARY_COMPRESSION_HIGH_BYTE
<= p2
) {
628 if(p2
<= Collation::LEVEL_SEPARATOR_BYTE
) {
633 // Other bytes just need to avoid the level separator.
634 // Trailing zeros are ok.
635 U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE
== 1);
636 if(p3
== Collation::LEVEL_SEPARATOR_BYTE
|| p4
== Collation::LEVEL_SEPARATOR_BYTE
||
637 s2
== Collation::LEVEL_SEPARATOR_BYTE
|| t2
== Collation::LEVEL_SEPARATOR_BYTE
) {
// The remaining checks depend on which levels of the CE are non-zero;
// the branch headers selecting each case are not visible in this view.
644 // Completely ignorable CE.
645 // Quaternary CEs are not supported.
646 if(c
!= 0 || q
!= 0) {
651 if(t
< re
.getTertiaryBoundary() || c
!= 2) {
657 if(s
< re
.getSecondaryBoundary() || t
== 0 || t
>= re
.getTertiaryBoundary()) {
663 if(s
== 0 || (Collation::COMMON_WEIGHT16
< s
&& s
<= re
.getLastCommonSecondary()) ||
664 s
>= re
.getSecondaryBoundary()) {
667 if(t
== 0 || t
>= re
.getTertiaryBoundary()) {
// Convenience overload: splits a 64-bit CE into its primary weight (upper 32
// bits), secondary weight (bits 31..16) and case/tertiary/quaternary part
// (bits 15..0), then forwards to the five-argument isValidCE().
674 UBool
isValidCE(const CollationRootElements
&re
, const CollationData
&data
, int64_t ce
) {
675 uint32_t p
= (uint32_t)(ce
>> 32);
676 uint32_t secTer
= (uint32_t)ce
;
677 return isValidCE(re
, data
, p
, secTer
>> 16, secTer
& 0xffff);
// Iterator over the root elements table of a CollationData object. It yields
// one (primary, secondary+tertiary) pair per step, available through
// getPrimary() and getSecTer(). Table entries carrying SEC_TER_DELTA_FLAG
// supply a sec/ter value; entries with PRIMARY_STEP_MASK bits mark the end of
// a primary range that is enumerated one primary at a time.
// NOTE(review): the signature of the stepping method, several branches and
// some member declarations are on lines not visible in this view.
680 class RootElementsIterator
{
682 RootElementsIterator(const CollationData
&root
)
684 elements(root
.rootElements
), length(root
.rootElementsLength
),
// Iteration starts at the index stored in the IX_FIRST_TERTIARY_INDEX slot.
686 index((int32_t)elements
[CollationRootElements::IX_FIRST_TERTIARY_INDEX
]) {}
// Stop at the end of the table or at the primary sentinel.
689 if(index
>= length
) { return FALSE
; }
690 uint32_t p
= elements
[index
];
691 if(p
== CollationRootElements::PRIMARY_SENTINEL
) { return FALSE
; }
692 if((p
& CollationRootElements::SEC_TER_DELTA_FLAG
) != 0) {
// Strip the flag bit to obtain the sec/ter value.
694 secTer
= p
& ~CollationRootElements::SEC_TER_DELTA_FLAG
;
697 if((p
& CollationRootElements::PRIMARY_STEP_MASK
) != 0) {
698 // End of a range, enumerate the primaries in the range.
699 int32_t step
= (int32_t)p
& CollationRootElements::PRIMARY_STEP_MASK
;
702 // Finished the range, return the next CE after it.
707 // Return the next primary in this range.
708 UBool isCompressible
= data
.isCompressiblePrimary(pri
);
// Lower 16 bits zero selects the two-byte increment; otherwise the three-byte one.
709 if((pri
& 0xffff) == 0) {
710 pri
= Collation::incTwoBytePrimaryByOffset(pri
, isCompressible
, step
);
712 pri
= Collation::incThreeBytePrimaryByOffset(pri
, isCompressible
, step
);
716 // Simple primary CE.
719 secTer
= Collation::COMMON_SEC_AND_TER_CE
;
723 uint32_t getPrimary() const { return pri
; }
724 uint32_t getSecTer() const { return secTer
; }
727 const CollationData
&data
;
728 const uint32_t *elements
;
// Walks every CE in the root elements table and checks that
//  - its case and quaternary bits are zero,
//  - it passes isValidCE(),
//  - a new weight can be allocated between it and the previous CE on the
//    level where the two differ (primary, secondary or tertiary),
//  - no CE is repeated.
// NOTE(review): the while loop header, the pri != prevPri branch header and
// the updates of prevPri/prevSec/prevTer are on lines not visible in this view.
738 void CollationTest::TestRootElements() {
739 IcuTestErrorCode
errorCode(*this, "TestRootElements");
740 const CollationData
*root
= CollationRoot::getData(errorCode
);
741 if(errorCode
.logDataIfFailureAndReset("CollationRoot::getData()")) {
744 CollationRootElements
rootElements(root
->rootElements
, root
->rootElementsLength
);
745 RootElementsIterator
iter(*root
);
747 // We check each root CE for validity,
748 // and we also verify that there is a tailoring gap between each two CEs.
749 CollationWeights cw1c
; // compressible primary weights
750 CollationWeights cw1u
; // uncompressible primary weights
751 CollationWeights cw2
;
752 CollationWeights cw3
;
754 cw1c
.initForPrimary(TRUE
);
755 cw1u
.initForPrimary(FALSE
);
756 cw2
.initForSecondary();
757 cw3
.initForTertiary();
759 // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
760 // nor the special merge-separator CE for U+FFFE.
761 uint32_t prevPri
= 0;
762 uint32_t prevSec
= 0;
763 uint32_t prevTer
= 0;
765 uint32_t pri
= iter
.getPrimary();
766 uint32_t secTer
= iter
.getSecTer();
767 // CollationRootElements CEs must have 0 case and quaternary bits.
768 if((secTer
& Collation::CASE_AND_QUATERNARY_MASK
) != 0) {
769 errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx",
770 (long)pri
, (long)secTer
);
772 uint32_t sec
= secTer
>> 16;
773 uint32_t ter
= secTer
& Collation::ONLY_TERTIARY_MASK
;
775 if(pri
== 0 && sec
== 0 && ter
!= 0) {
776 // Tertiary CEs must have uppercase bits,
777 // but they are not stored in the CollationRootElements.
// NOTE(review): ctq is declared and adjusted on lines not visible in this view.
780 if(!isValidCE(rootElements
, *root
, pri
, sec
, ctq
)) {
781 errln("invalid root CE %08lx %08lx", (long)pri
, (long)secTer
);
// Primary level: newWeight stays 0 when no gap is required.
784 uint32_t newWeight
= 0;
785 if(prevPri
== 0 || prevPri
>= Collation::FFFD_PRIMARY
) {
786 // There is currently no tailoring gap after primary ignorables,
787 // and we forbid tailoring after U+FFFD and U+FFFF.
788 } else if(root
->isCompressiblePrimary(prevPri
)) {
789 if(!cw1c
.allocWeights(prevPri
, pri
, 1)) {
790 errln("no primary/compressible tailoring gap between %08lx and %08lx",
791 (long)prevPri
, (long)pri
);
793 newWeight
= cw1c
.nextWeight();
796 if(!cw1u
.allocWeights(prevPri
, pri
, 1)) {
797 errln("no primary/uncompressible tailoring gap between %08lx and %08lx",
798 (long)prevPri
, (long)pri
);
800 newWeight
= cw1u
.nextWeight();
803 if(newWeight
!= 0 && !(prevPri
< newWeight
&& newWeight
< pri
)) {
804 errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx",
805 (long)prevPri
, (long)newWeight
, (long)pri
);
// Secondary level.
807 } else if(sec
!= prevSec
) {
// After a zero secondary, the lower limit is 0x100 below the secondary boundary.
808 uint32_t lowerLimit
=
809 prevSec
== 0 ? rootElements
.getSecondaryBoundary() - 0x100 : prevSec
;
810 if(!cw2
.allocWeights(lowerLimit
, sec
, 1)) {
811 errln("no secondary tailoring gap between %04x and %04x", lowerLimit
, sec
);
813 uint32_t newWeight
= cw2
.nextWeight();
// NOTE(review): the check compares against prevSec while the message prints
// lowerLimit; the two differ when prevSec == 0. Also, the format uses %04x but
// the arguments are cast to long, so the specifier and the argument type do
// not match (compare the %08lx messages above).
814 if(!(prevSec
< newWeight
&& newWeight
< sec
)) {
815 errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
816 (long)lowerLimit
, (long)newWeight
, (long)sec
);
// Tertiary level.
819 } else if(ter
!= prevTer
) {
820 uint32_t lowerLimit
=
821 prevTer
== 0 ? rootElements
.getTertiaryBoundary() - 0x100 : prevTer
;
822 if(!cw3
.allocWeights(lowerLimit
, ter
, 1)) {
// NOTE(review): "teriary" in the message below is a typo for "tertiary".
823 errln("no teriary tailoring gap between %04x and %04x", lowerLimit
, ter
);
825 uint32_t newWeight
= cw3
.nextWeight();
// NOTE(review): this branch checks tertiary weights but the message says
// "secondary"; it looks copied from the branch above. The same %04x/(long)
// mismatch and prevTer-vs-lowerLimit difference apply here.
826 if(!(prevTer
< newWeight
&& newWeight
< ter
)) {
827 errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
828 (long)lowerLimit
, (long)newWeight
, (long)ter
);
// Reached when neither the secondary nor the tertiary weight changed.
832 errln("duplicate root CE %08lx %08lx", (long)pri
, (long)secTer
);
// For each available collator locale and each of its collation types, creates
// the collator, takes its tailored set and checks every CE of every tailored
// string with isValidCE(). Actual locale names already processed are recorded
// in a hash table (prevLocales).
// NOTE(review): the do-loop headers, the continue/return statements and the
// inner iteration over the tailored set are on lines not visible in this view.
841 void CollationTest::TestTailoredElements() {
842 IcuTestErrorCode
errorCode(*this, "TestTailoredElements");
843 const CollationData
*root
= CollationRoot::getData(errorCode
);
844 if(errorCode
.logDataIfFailureAndReset("CollationRoot::getData()")) {
847 CollationRootElements
rootElements(root
->rootElements
, root
->rootElementsLength
);
// Hash table keyed by C strings; values are set with uhash_puti().
849 UHashtable
*prevLocales
= uhash_open(uhash_hashChars
, uhash_compareChars
, NULL
, errorCode
);
850 if(errorCode
.logIfFailureAndReset("failed to create a hash table")) {
// Keys are uprv_strdup() copies, released through uprv_free as the key deleter.
853 uhash_setKeyDeleter(prevLocales
, uprv_free
);
854 // TestRootElements() tests the root collator which does not have tailorings.
855 uhash_puti(prevLocales
, uprv_strdup(""), 1, errorCode
);
856 uhash_puti(prevLocales
, uprv_strdup("root"), 1, errorCode
);
857 uhash_puti(prevLocales
, uprv_strdup("root@collation=standard"), 1, errorCode
);
859 UVector64
ces(errorCode
);
860 LocalPointer
<StringEnumeration
> locales(Collator::getAvailableLocales());
861 U_ASSERT(locales
.isValid());
// The first pass uses "root"; later passes take IDs from the enumeration.
862 const char *localeID
= "root";
864 Locale
locale(localeID
);
865 LocalPointer
<StringEnumeration
> types(
866 Collator::getKeywordValuesForLocale("collation", locale
, FALSE
, errorCode
));
867 errorCode
.assertSuccess();
868 const char *type
= NULL
; // default type
870 Locale
localeWithType(locale
);
872 localeWithType
.setKeywordValue("collation", type
, errorCode
);
874 errorCode
.assertSuccess();
875 LocalPointer
<Collator
> coll(Collator::createInstance(localeWithType
, errorCode
));
876 if(errorCode
.logIfFailureAndReset("Collator::createInstance(%s)",
877 localeWithType
.getName())) {
880 Locale actual
= coll
->getLocale(ULOC_ACTUAL_LOCALE
, errorCode
);
// A non-zero value means this actual locale is already in the table.
881 if(uhash_geti(prevLocales
, actual
.getName()) != 0) {
884 uhash_puti(prevLocales
, uprv_strdup(actual
.getName()), 1, errorCode
);
885 errorCode
.assertSuccess();
886 logln("TestTailoredElements(): requested %s -> actual %s",
887 localeWithType
.getName(), actual
.getName());
// internalGetCEs() is called on RuleBasedCollator, hence the downcast.
888 RuleBasedCollator
*rbc
= dynamic_cast<RuleBasedCollator
*>(coll
.getAlias());
892 // Note: It would be better to get tailored strings such that we can
893 // identify the prefix, and only get the CEs for the prefix+string,
894 // not also for the prefix.
895 // There is currently no API for that.
896 // It would help in an unusual case where a contraction starting in the prefix
897 // extends past its end, and we do not see the intended mapping.
898 // For example, for a mapping p|st, if there is also a contraction ps,
899 // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
900 LocalPointer
<UnicodeSet
> tailored(coll
->getTailoredSet(errorCode
));
901 errorCode
.assertSuccess();
902 UnicodeSetIterator
iter(*tailored
);
904 const UnicodeString
&s
= iter
.getString();
// ces is reused across strings; clear it before each lookup.
905 ces
.removeAllElements();
906 rbc
->internalGetCEs(s
, ces
, errorCode
);
907 errorCode
.assertSuccess();
908 for(int32_t i
= 0; i
< ces
.size(); ++i
) {
909 int64_t ce
= ces
.elementAti(i
);
910 if(!isValidCE(rootElements
, *root
, ce
)) {
911 errln("invalid tailored CE %016llx at CE index %d from string:",
912 (long long)ce
, (int)i
);
// Advance to the next collation type, then to the next locale.
917 } while((type
= types
->next(NULL
, errorCode
)) != NULL
);
918 } while((localeID
= locales
->next(NULL
, errorCode
)) != NULL
);
919 uhash_close(prevLocales
);
// Builds a readable UnicodeString from the length sort-key bytes at p.
// Bytes are separated by spaces; a period or a vertical bar is appended in
// place of some bytes.
// NOTE(review): the declaration of s, the conditions selecting '.' or '|',
// the normal per-byte output and the return are on lines not visible here.
922 UnicodeString
CollationTest::printSortKey(const uint8_t *p
, int32_t length
) {
924 for(int32_t i
= 0; i
< length
; ++i
) {
// Space separator before every byte except the first.
925 if(i
> 0) { s
.append((UChar
)0x20); }
928 s
.append((UChar
)0x2e); // period
930 s
.append((UChar
)0x7c); // vertical bar
// Formats a CollationKey by fetching its byte array and length via
// getByteArray() and passing both to printSortKey().
// NOTE(review): the declaration of length is on a line not visible in this view.
938 UnicodeString
CollationTest::printCollationKey(const CollationKey
&key
) {
940 const uint8_t *p
= key
.getByteArray(length
);
941 return printSortKey(p
, length
);
// Reads the next line from f with ucbuf_readline() and stores it in fileLine
// after trimming: everything from the first '#' onward is dropped, then
// trailing CR/LF, then trailing spaces.
// NOTE(review): the declaration of lineLength, the handling of the
// NULL/failure case, any line-number bookkeeping and the return value are on
// lines not visible in this view.
944 UBool
CollationTest::readLine(UCHARBUF
*f
, IcuTestErrorCode
&errorCode
) {
946 const UChar
*line
= ucbuf_readline(f
, &lineLength
, errorCode
);
947 if(line
== NULL
|| errorCode
.isFailure()) {
952 // Strip trailing CR/LF, comments, and spaces.
953 const UChar
*comment
= u_memchr(line
, 0x23, lineLength
); // '#'
954 if(comment
!= NULL
) {
// Cut the line at the first '#'.
955 lineLength
= (int32_t)(comment
- line
);
957 while(lineLength
> 0 && isCROrLF(line
[lineLength
- 1])) { --lineLength
; }
959 while(lineLength
> 0 && isSpace(line
[lineLength
- 1])) { --lineLength
; }
// setTo(FALSE, ...): the buffer is not NUL-terminated at lineLength.
// NOTE(review): whether fileLine aliases or copies the ucbuf buffer depends on
// this setTo() overload - confirm before relying on fileLine across reads.
960 fileLine
.setTo(FALSE
, line
, lineLength
);
// Parses one space-delimited token of fileLine beginning at start. If the
// token contains '|', the text before it becomes prefix and the rest becomes
// s; both are passed through unescape(). An empty prefix or an empty string
// is reported with errln() and sets errorCode to U_PARSE_ERROR.
// NOTE(review): the declaration of i, the pipeIndex test, the no-prefix branch
// and the final update of start are on lines not visible in this view.
964 void CollationTest::parseString(int32_t &start
, UnicodeString
&prefix
, UnicodeString
&s
,
965 UErrorCode
&errorCode
) {
966 int32_t length
= fileLine
.length();
// Find the end of the token: the first space or the end of the line.
968 for(i
= start
; i
< length
&& !isSpace(fileLine
[i
]); ++i
) {}
969 int32_t pipeIndex
= fileLine
.indexOf((UChar
)0x7c, start
, i
- start
); // '|'
971 prefix
= fileLine
.tempSubStringBetween(start
, pipeIndex
).unescape();
972 if(prefix
.isEmpty()) {
973 errln("empty prefix on line %d", (int)fileLineNumber
);
975 errorCode
= U_PARSE_ERROR
;
// Continue parsing after the '|'.
978 start
= pipeIndex
+ 1;
982 s
= fileLine
.tempSubStringBetween(start
, i
).unescape();
984 errln("empty string on line %d", (int)fileLineNumber
);
986 errorCode
= U_PARSE_ERROR
;
// Parses a test line of the form "<relation> <string>". The relation at the
// start of fileLine ('<' optionally followed by a second character, or '=')
// is mapped to a Collation::Level, and the test string is parsed into s.
// On a parse error this reports with errln(), sets U_PARSE_ERROR on errorCode
// and returns Collation::NO_LEVEL.
// NOTE(review): the switch on the second character, the declaration and
// updates of start, and the final return are on lines not visible here, so
// which character selects which level cannot be confirmed from this view.
992 Collation::Level
CollationTest::parseRelationAndString(UnicodeString
&s
, IcuTestErrorCode
&errorCode
) {
993 Collation::Level relation
;
995 if(fileLine
[0] == 0x3c) { // <
996 UChar second
= fileLine
[1];
1000 relation
= Collation::PRIMARY_LEVEL
;
1003 relation
= Collation::SECONDARY_LEVEL
;
1006 relation
= Collation::TERTIARY_LEVEL
;
1009 relation
= Collation::QUATERNARY_LEVEL
;
1012 relation
= Collation::CASE_LEVEL
;
1015 relation
= Collation::IDENTICAL_LEVEL
;
1018 relation
= Collation::NO_LEVEL
;
1022 } else if(fileLine
[0] == 0x3d) { // =
1023 relation
= Collation::ZERO_LEVEL
;
// The relation must be followed by a space.
1028 if(start
== 0 || !isSpace(fileLine
[start
])) {
1029 errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber
);
1031 errorCode
.set(U_PARSE_ERROR
);
1032 return Collation::NO_LEVEL
;
1034 start
= skipSpaces(start
);
1035 UnicodeString prefix
;
1036 parseString(start
, prefix
, s
, errorCode
);
// Test strings must not carry a prefix.
1037 if(errorCode
.isSuccess() && !prefix
.isEmpty()) {
1038 errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber
);
1040 errorCode
.set(U_PARSE_ERROR
);
1041 return Collation::NO_LEVEL
;
// Nothing may follow the test string.
1043 if(start
< fileLine
.length()) {
1044 errln("unexpected line contents after test string on line %d", (int)fileLineNumber
);
1046 errorCode
.set(U_PARSE_ERROR
);
1047 return Collation::NO_LEVEL
;
1052 static const struct {
1056 { "backwards", UCOL_FRENCH_COLLATION
},
1057 { "alternate", UCOL_ALTERNATE_HANDLING
},
1058 { "caseFirst", UCOL_CASE_FIRST
},
1059 { "caseLevel", UCOL_CASE_LEVEL
},
1060 // UCOL_NORMALIZATION_MODE is turned on and off automatically.
1061 { "strength", UCOL_STRENGTH
},
1062 // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated.
1063 { "numeric", UCOL_NUMERIC_COLLATION
}
1066 static const struct {
1068 UColAttributeValue value
;
1069 } attributeValues
[] = {
1070 { "default", UCOL_DEFAULT
},
1071 { "primary", UCOL_PRIMARY
},
1072 { "secondary", UCOL_SECONDARY
},
1073 { "tertiary", UCOL_TERTIARY
},
1074 { "quaternary", UCOL_QUATERNARY
},
1075 { "identical", UCOL_IDENTICAL
},
1076 { "off", UCOL_OFF
},
1078 { "shifted", UCOL_SHIFTED
},
1079 { "non-ignorable", UCOL_NON_IGNORABLE
},
1080 { "lower", UCOL_LOWER_FIRST
},
1081 { "upper", UCOL_UPPER_FIRST
}
1084 void CollationTest::parseAndSetAttribute(IcuTestErrorCode
&errorCode
) {
1085 int32_t start
= skipSpaces(1);
1086 int32_t equalPos
= fileLine
.indexOf(0x3d);
1088 if(fileLine
.compare(start
, 7, UNICODE_STRING("reorder", 7)) == 0) {
1089 parseAndSetReorderCodes(start
+ 7, errorCode
);
1092 errln("missing '=' on line %d", (int)fileLineNumber
);
1094 errorCode
.set(U_PARSE_ERROR
);
1098 UnicodeString attrString
= fileLine
.tempSubStringBetween(start
, equalPos
);
1099 UnicodeString valueString
= fileLine
.tempSubString(equalPos
+1);
1100 if(attrString
== UNICODE_STRING("maxVariable", 11)) {
1101 UColReorderCode max
;
1102 if(valueString
== UNICODE_STRING("space", 5)) {
1103 max
= UCOL_REORDER_CODE_SPACE
;
1104 } else if(valueString
== UNICODE_STRING("punct", 5)) {
1105 max
= UCOL_REORDER_CODE_PUNCTUATION
;
1106 } else if(valueString
== UNICODE_STRING("symbol", 6)) {
1107 max
= UCOL_REORDER_CODE_SYMBOL
;
1108 } else if(valueString
== UNICODE_STRING("currency", 8)) {
1109 max
= UCOL_REORDER_CODE_CURRENCY
;
1111 errln("invalid attribute value name on line %d", (int)fileLineNumber
);
1113 errorCode
.set(U_PARSE_ERROR
);
1116 coll
->setMaxVariable(max
, errorCode
);
1117 if(errorCode
.isFailure()) {
1118 errln("setMaxVariable() failed on line %d: %s",
1119 (int)fileLineNumber
, errorCode
.errorName());
1128 for(int32_t i
= 0;; ++i
) {
1129 if(i
== LENGTHOF(attributes
)) {
1130 errln("invalid attribute name on line %d", (int)fileLineNumber
);
1132 errorCode
.set(U_PARSE_ERROR
);
1135 if(attrString
== UnicodeString(attributes
[i
].name
, -1, US_INV
)) {
1136 attr
= attributes
[i
].attr
;
1141 UColAttributeValue value
;
1142 for(int32_t i
= 0;; ++i
) {
1143 if(i
== LENGTHOF(attributeValues
)) {
1144 errln("invalid attribute value name on line %d", (int)fileLineNumber
);
1146 errorCode
.set(U_PARSE_ERROR
);
1149 if(valueString
== UnicodeString(attributeValues
[i
].name
, -1, US_INV
)) {
1150 value
= attributeValues
[i
].value
;
1155 coll
->setAttribute(attr
, value
, errorCode
);
1156 if(errorCode
.isFailure()) {
1157 errln("illegal attribute=value combination on line %d: %s",
1158 (int)fileLineNumber
, errorCode
.errorName());
// Parses the space-separated reorder code names that follow on fileLine, starting
// at index start, and applies them to the current collator.
// Each name is converted to invariant chars, mapped through
// CollationRuleParser::getReorderCode(), and collected in a UVector32;
// the collected codes are then passed to coll->setReorderCodes().
// Problems are reported via errln(); errorCode carries the failure status.
1165 void CollationTest::parseAndSetReorderCodes(int32_t start
, IcuTestErrorCode
&errorCode
) {
1166 UVector32
reorderCodes(errorCode
);
// One iteration per name: skip leading spaces, then scan to the next space or end of line.
1167 while(start
< fileLine
.length()) {
1168 start
= skipSpaces(start
);
1169 int32_t limit
= start
;
1170 while(limit
< fileLine
.length() && !isSpace(fileLine
[limit
])) { ++limit
; }
// [start, limit) now delimits one name.
1172 name
.appendInvariantChars(fileLine
.tempSubStringBetween(start
, limit
), errorCode
);
1173 int32_t code
= CollationRuleParser::getReorderCode(name
.data());
// NOTE(review): the guard that decides when a code counts as invalid is not visible
// in this listing -- confirm which getReorderCode() results it rejects.
1175 errln("invalid reorder code '%s' on line %d", name
.data(), (int)fileLineNumber
);
1177 errorCode
.set(U_PARSE_ERROR
);
1180 reorderCodes
.addElement(code
, errorCode
);
// Hand all collected codes to the collator in one call.
1183 coll
->setReorderCodes(reorderCodes
.getBuffer(), reorderCodes
.size(), errorCode
);
1184 if(errorCode
.isFailure()) {
1185 errln("setReorderCodes() failed on line %d: %s", (int)fileLineNumber
, errorCode
.errorName());
1192 void CollationTest::buildTailoring(UCHARBUF
*f
, IcuTestErrorCode
&errorCode
) {
1193 UnicodeString rules
;
1194 while(readLine(f
, errorCode
)) {
1195 if(fileLine
.isEmpty()) { continue; }
1196 if(isSectionStarter(fileLine
[0])) { break; }
1197 rules
.append(fileLine
.unescape());
1199 if(errorCode
.isFailure()) { return; }
1202 UParseError parseError
;
1203 UnicodeString reason
;
1205 coll
= new RuleBasedCollator(rules
, parseError
, reason
, errorCode
);
1207 errln("unable to allocate a new collator");
1208 errorCode
.set(U_MEMORY_ALLOCATION_ERROR
);
1211 if(errorCode
.isFailure()) {
1212 errln("RuleBasedCollator(rules) failed - %s", errorCode
.errorName());
1213 infoln(UnicodeString(" reason: ") + reason
);
1214 if(parseError
.offset
>= 0) { infoln(" rules offset: %d", (int)parseError
.offset
); }
1215 if(parseError
.preContext
[0] != 0 || parseError
.postContext
[0] != 0) {
1216 infoln(UnicodeString(" snippet: ...") +
1217 parseError
.preContext
+ "(!)" + parseError
.postContext
+ "...");
1220 assertEquals("no error reason when RuleBasedCollator(rules) succeeds",
1221 UnicodeString(), reason
);
1225 void CollationTest::setRootCollator(IcuTestErrorCode
&errorCode
) {
1226 if(errorCode
.isFailure()) { return; }
1228 coll
= Collator::createInstance(Locale::getRoot(), errorCode
);
1229 if(errorCode
.isFailure()) {
1230 dataerrln("unable to create a root collator");
1235 void CollationTest::setLocaleCollator(IcuTestErrorCode
&errorCode
) {
1236 if(errorCode
.isFailure()) { return; }
1238 langTag
.appendInvariantChars(fileLine
.tempSubString(9), errorCode
);
1239 char localeID
[ULOC_FULLNAME_CAPACITY
];
1240 int32_t parsedLength
;
1241 (void)uloc_forLanguageTag(
1242 langTag
.data(), localeID
, LENGTHOF(localeID
), &parsedLength
, errorCode
);
1243 Locale
locale(localeID
);
1244 if(fileLine
.length() == 9 ||
1245 errorCode
.isFailure() || errorCode
.get() == U_STRING_NOT_TERMINATED_WARNING
||
1246 parsedLength
!= langTag
.length() || locale
.isBogus()) {
1247 errln("invalid language tag on line %d", (int)fileLineNumber
);
1249 if(errorCode
.isSuccess()) { errorCode
.set(U_PARSE_ERROR
); }
1253 logln("creating a collator for locale ID %s", locale
.getName());
1254 Collator
*newColl
= Collator::createInstance(locale
, errorCode
);
1255 if(errorCode
.isFailure()) {
1256 dataerrln("unable to create a collator for locale %s on line %d",
1257 locale
.getName(), (int)fileLineNumber
);
1265 UBool
CollationTest::needsNormalization(const UnicodeString
&s
, UErrorCode
&errorCode
) const {
1266 if(U_FAILURE(errorCode
) || !fcd
->isNormalized(s
, errorCode
)) { return TRUE
; }
1267 // In some sequences with Tibetan composite vowel signs,
1268 // even if the string passes the FCD check,
1269 // those composites must be decomposed.
1270 // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1272 while((index
= s
.indexOf((UChar
)0xf71, index
)) >= 0) {
1273 if(++index
< s
.length()) {
1275 if(c
== 0xf73 || c
== 0xf75 || c
== 0xf81) { return TRUE
; }
1281 UBool
CollationTest::getSortKeyParts(const UChar
*s
, int32_t length
,
1282 CharString
&dest
, int32_t partSize
,
1283 IcuTestErrorCode
&errorCode
) {
1284 if(errorCode
.isFailure()) { return FALSE
; }
1286 U_ASSERT(partSize
<= LENGTHOF(part
));
1288 uiter_setString(&iter
, s
, length
);
1289 uint32_t state
[2] = { 0, 0 };
1291 int32_t partLength
= coll
->internalNextSortKeyPart(&iter
, state
, part
, partSize
, errorCode
);
1292 UBool done
= partLength
< partSize
;
1294 // At the end, append the next byte as well which should be 00.
1297 dest
.append(reinterpret_cast<char *>(part
), partLength
, errorCode
);
1299 return errorCode
.isSuccess();
1304 UBool
CollationTest::getCollationKey(const char *norm
, const UnicodeString
&line
,
1305 const UChar
*s
, int32_t length
,
1306 CollationKey
&key
, IcuTestErrorCode
&errorCode
) {
1307 if(errorCode
.isFailure()) { return FALSE
; }
1308 coll
->getCollationKey(s
, length
, key
, errorCode
);
1309 if(errorCode
.isFailure()) {
1310 infoln(fileTestName
);
1311 errln("Collator(%s).getCollationKey() failed: %s",
1312 norm
, errorCode
.errorName());
1317 const uint8_t *keyBytes
= key
.getByteArray(keyLength
);
1318 if(keyLength
== 0 || keyBytes
[keyLength
- 1] != 0) {
1319 infoln(fileTestName
);
1320 errln("Collator(%s).getCollationKey() wrote an empty or unterminated key",
1323 infoln(printCollationKey(key
));
1327 int32_t numLevels
= coll
->getAttribute(UCOL_STRENGTH
, errorCode
);
1328 if(numLevels
< UCOL_IDENTICAL
) {
1333 if(coll
->getAttribute(UCOL_CASE_LEVEL
, errorCode
) == UCOL_ON
) {
1336 errorCode
.assertSuccess();
1337 int32_t numLevelSeparators
= 0;
1338 for(int32_t i
= 0; i
< (keyLength
- 1); ++i
) {
1339 uint8_t b
= keyBytes
[i
];
1341 infoln(fileTestName
);
1342 errln("Collator(%s).getCollationKey() contains a 00 byte", norm
);
1344 infoln(printCollationKey(key
));
1347 if(b
== 1) { ++numLevelSeparators
; }
1349 if(numLevelSeparators
!= (numLevels
- 1)) {
1350 infoln(fileTestName
);
1351 errln("Collator(%s).getCollationKey() has %d level separators for %d levels",
1352 norm
, (int)numLevelSeparators
, (int)numLevels
);
1354 infoln(printCollationKey(key
));
1358 // If s contains U+FFFE, check that merged segments make the same key.
1359 LocalMemory
<uint8_t> mergedKey
;
1360 int32_t mergedKeyLength
= 0;
1361 int32_t mergedKeyCapacity
= 0;
1362 int32_t sLength
= (length
>= 0) ? length
: u_strlen(s
);
1363 int32_t segmentStart
= 0;
1364 for(int32_t i
= 0;;) {
1366 if(segmentStart
== 0) {
1367 // s does not contain any U+FFFE.
1370 } else if(s
[i
] != 0xfffe) {
1374 // Get the sort key for another segment and merge it into mergedKey.
1375 CollationKey
key1(mergedKey
.getAlias(), mergedKeyLength
); // copies the bytes
1377 coll
->getCollationKey(s
+ segmentStart
, i
- segmentStart
, key2
, errorCode
);
1378 int32_t key1Length
, key2Length
;
1379 const uint8_t *key1Bytes
= key1
.getByteArray(key1Length
);
1380 const uint8_t *key2Bytes
= key2
.getByteArray(key2Length
);
1382 int32_t minCapacity
= key1Length
+ key2Length
;
1383 if(key1Length
> 0) { --minCapacity
; }
1384 if(minCapacity
<= mergedKeyCapacity
) {
1385 dest
= mergedKey
.getAlias();
1387 if(minCapacity
<= 200) {
1388 mergedKeyCapacity
= 200;
1389 } else if(minCapacity
<= 2 * mergedKeyCapacity
) {
1390 mergedKeyCapacity
*= 2;
1392 mergedKeyCapacity
= minCapacity
;
1394 dest
= mergedKey
.allocateInsteadAndReset(mergedKeyCapacity
);
1396 U_ASSERT(dest
!= NULL
|| mergedKeyCapacity
== 0);
1397 if(key1Length
== 0) {
1398 // key2 is the sort key for the first segment.
1399 uprv_memcpy(dest
, key2Bytes
, key2Length
);
1400 mergedKeyLength
= key2Length
;
1403 ucol_mergeSortkeys(key1Bytes
, key1Length
, key2Bytes
, key2Length
,
1404 dest
, mergedKeyCapacity
);
1406 if(i
== sLength
) { break; }
1409 if(segmentStart
!= 0 &&
1410 (mergedKeyLength
!= keyLength
||
1411 uprv_memcmp(mergedKey
.getAlias(), keyBytes
, keyLength
) != 0)) {
1412 infoln(fileTestName
);
1413 errln("Collator(%s).getCollationKey(with U+FFFE) != "
1414 "ucol_mergeSortkeys(segments)",
1417 infoln(printCollationKey(key
));
1418 infoln(printSortKey(mergedKey
.getAlias(), mergedKeyLength
));
1422 // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
1423 static const int32_t partSizes
[] = { 32, 3, 1 };
1424 for(int32_t psi
= 0; psi
< LENGTHOF(partSizes
); ++psi
) {
1425 int32_t partSize
= partSizes
[psi
];
1427 if(!getSortKeyParts(s
, length
, parts
, 32, errorCode
)) {
1428 infoln(fileTestName
);
1429 errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
1430 norm
, (int)partSize
, errorCode
.errorName());
1434 if(keyLength
!= parts
.length() || uprv_memcmp(keyBytes
, parts
.data(), keyLength
) != 0) {
1435 infoln(fileTestName
);
1436 errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
1437 norm
, (int)partSize
);
1439 infoln(printCollationKey(key
));
1440 infoln(printSortKey(reinterpret_cast<uint8_t *>(parts
.data()), parts
.length()));
1450 * Replaces unpaired surrogates with U+FFFD.
1451 * Returns s if no replacement was made, otherwise buffer.
1453 const UnicodeString
&surrogatesToFFFD(const UnicodeString
&s
, UnicodeString
&buffer
) {
1455 while(i
< s
.length()) {
1456 UChar32 c
= s
.char32At(i
);
1457 if(U_IS_SURROGATE(c
)) {
1458 if(buffer
.length() < i
) {
1459 buffer
.append(s
, buffer
.length(), i
- buffer
.length());
1461 buffer
.append((UChar
)0xfffd);
1465 if(buffer
.isEmpty()) {
1468 if(buffer
.length() < i
) {
1469 buffer
.append(s
, buffer
.length(), i
- buffer
.length());
1476 UBool
CollationTest::checkCompareTwo(const char *norm
, const UnicodeString
&prevFileLine
,
1477 const UnicodeString
&prevString
, const UnicodeString
&s
,
1478 UCollationResult expectedOrder
, Collation::Level expectedLevel
,
1479 IcuTestErrorCode
&errorCode
) {
1480 if(errorCode
.isFailure()) { return FALSE
; }
1482 // Get the sort keys first, for error debug output.
1483 CollationKey prevKey
;
1484 if(!getCollationKey(norm
, prevFileLine
, prevString
.getBuffer(), prevString
.length(),
1485 prevKey
, errorCode
)) {
1489 if(!getCollationKey(norm
, fileLine
, s
.getBuffer(), s
.length(), key
, errorCode
)) { return FALSE
; }
1491 UCollationResult order
= coll
->compare(prevString
, s
, errorCode
);
1492 if(order
!= expectedOrder
|| errorCode
.isFailure()) {
1493 infoln(fileTestName
);
1494 errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)",
1495 (int)fileLineNumber
, norm
, order
, expectedOrder
, errorCode
.errorName());
1496 infoln(prevFileLine
);
1498 infoln(printCollationKey(prevKey
));
1499 infoln(printCollationKey(key
));
1502 order
= coll
->compare(s
, prevString
, errorCode
);
1503 if(order
!= -expectedOrder
|| errorCode
.isFailure()) {
1504 infoln(fileTestName
);
1505 errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)",
1506 (int)fileLineNumber
, norm
, order
, -expectedOrder
, errorCode
.errorName());
1507 infoln(prevFileLine
);
1509 infoln(printCollationKey(prevKey
));
1510 infoln(printCollationKey(key
));
1513 // Test NUL-termination if the strings do not contain NUL characters.
1514 UBool containNUL
= prevString
.indexOf((UChar
)0) >= 0 || s
.indexOf((UChar
)0) >= 0;
1516 order
= coll
->compare(prevString
.getBuffer(), -1, s
.getBuffer(), -1, errorCode
);
1517 if(order
!= expectedOrder
|| errorCode
.isFailure()) {
1518 infoln(fileTestName
);
1519 errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1520 (int)fileLineNumber
, norm
, order
, expectedOrder
, errorCode
.errorName());
1521 infoln(prevFileLine
);
1523 infoln(printCollationKey(prevKey
));
1524 infoln(printCollationKey(key
));
1527 order
= coll
->compare(s
.getBuffer(), -1, prevString
.getBuffer(), -1, errorCode
);
1528 if(order
!= -expectedOrder
|| errorCode
.isFailure()) {
1529 infoln(fileTestName
);
1530 errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1531 (int)fileLineNumber
, norm
, order
, -expectedOrder
, errorCode
.errorName());
1532 infoln(prevFileLine
);
1534 infoln(printCollationKey(prevKey
));
1535 infoln(printCollationKey(key
));
1540 #if U_HAVE_STD_STRING
1541 // compare(UTF-16) treats unpaired surrogates like unassigned code points.
1542 // Unpaired surrogates cannot be converted to UTF-8.
1543 // Create valid UTF-16 strings if necessary, and use those for
1544 // both the expected compare() result and for the input to compare(UTF-8).
1545 UnicodeString prevBuffer
, sBuffer
;
1546 const UnicodeString
&prevValid
= surrogatesToFFFD(prevString
, prevBuffer
);
1547 const UnicodeString
&sValid
= surrogatesToFFFD(s
, sBuffer
);
1548 std::string prevUTF8
, sUTF8
;
1549 UnicodeString(prevValid
).toUTF8String(prevUTF8
);
1550 UnicodeString(sValid
).toUTF8String(sUTF8
);
1551 UCollationResult expectedUTF8Order
;
1552 if(&prevValid
== &prevString
&& &sValid
== &s
) {
1553 expectedUTF8Order
= expectedOrder
;
1555 expectedUTF8Order
= coll
->compare(prevValid
, sValid
, errorCode
);
1558 order
= coll
->compareUTF8(prevUTF8
, sUTF8
, errorCode
);
1559 if(order
!= expectedUTF8Order
|| errorCode
.isFailure()) {
1560 infoln(fileTestName
);
1561 errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)",
1562 (int)fileLineNumber
, norm
, order
, expectedUTF8Order
, errorCode
.errorName());
1563 infoln(prevFileLine
);
1565 infoln(printCollationKey(prevKey
));
1566 infoln(printCollationKey(key
));
1569 order
= coll
->compareUTF8(sUTF8
, prevUTF8
, errorCode
);
1570 if(order
!= -expectedUTF8Order
|| errorCode
.isFailure()) {
1571 infoln(fileTestName
);
1572 errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)",
1573 (int)fileLineNumber
, norm
, order
, -expectedUTF8Order
, errorCode
.errorName());
1574 infoln(prevFileLine
);
1576 infoln(printCollationKey(prevKey
));
1577 infoln(printCollationKey(key
));
1580 // Test NUL-termination if the strings do not contain NUL characters.
1582 order
= coll
->internalCompareUTF8(prevUTF8
.c_str(), -1, sUTF8
.c_str(), -1, errorCode
);
1583 if(order
!= expectedUTF8Order
|| errorCode
.isFailure()) {
1584 infoln(fileTestName
);
1585 errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1586 (int)fileLineNumber
, norm
, order
, expectedUTF8Order
, errorCode
.errorName());
1587 infoln(prevFileLine
);
1589 infoln(printCollationKey(prevKey
));
1590 infoln(printCollationKey(key
));
1593 order
= coll
->internalCompareUTF8(sUTF8
.c_str(), -1, prevUTF8
.c_str(), -1, errorCode
);
1594 if(order
!= -expectedUTF8Order
|| errorCode
.isFailure()) {
1595 infoln(fileTestName
);
1596 errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1597 (int)fileLineNumber
, norm
, order
, -expectedUTF8Order
, errorCode
.errorName());
1598 infoln(prevFileLine
);
1600 infoln(printCollationKey(prevKey
));
1601 infoln(printCollationKey(key
));
1607 UCharIterator leftIter
;
1608 UCharIterator rightIter
;
1609 uiter_setString(&leftIter
, prevString
.getBuffer(), prevString
.length());
1610 uiter_setString(&rightIter
, s
.getBuffer(), s
.length());
1611 order
= coll
->compare(leftIter
, rightIter
, errorCode
);
1612 if(order
!= expectedOrder
|| errorCode
.isFailure()) {
1613 infoln(fileTestName
);
1614 errln("line %d Collator(%s).compare(UCharIterator: previous, current) "
1615 "wrong order: %d != %d (%s)",
1616 (int)fileLineNumber
, norm
, order
, expectedOrder
, errorCode
.errorName());
1617 infoln(prevFileLine
);
1619 infoln(printCollationKey(prevKey
));
1620 infoln(printCollationKey(key
));
1624 order
= prevKey
.compareTo(key
, errorCode
);
1625 if(order
!= expectedOrder
|| errorCode
.isFailure()) {
1626 infoln(fileTestName
);
1627 errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)",
1628 (int)fileLineNumber
, norm
, order
, expectedOrder
, errorCode
.errorName());
1629 infoln(prevFileLine
);
1631 infoln(printCollationKey(prevKey
));
1632 infoln(printCollationKey(key
));
1635 if(order
!= UCOL_EQUAL
&& expectedLevel
!= Collation::NO_LEVEL
) {
1636 int32_t prevKeyLength
;
1637 const uint8_t *prevBytes
= prevKey
.getByteArray(prevKeyLength
);
1639 const uint8_t *bytes
= key
.getByteArray(keyLength
);
1640 int32_t level
= Collation::PRIMARY_LEVEL
;
1641 for(int32_t i
= 0;; ++i
) {
1642 uint8_t b
= prevBytes
[i
];
1643 if(b
!= bytes
[i
]) { break; }
1644 if(b
== Collation::LEVEL_SEPARATOR_BYTE
) {
1646 if(level
== Collation::CASE_LEVEL
&&
1647 coll
->getAttribute(UCOL_CASE_LEVEL
, errorCode
) == UCOL_OFF
) {
1652 if(level
!= expectedLevel
) {
1653 infoln(fileTestName
);
1654 errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
1655 (int)fileLineNumber
, norm
, order
, level
, expectedLevel
);
1656 infoln(prevFileLine
);
1658 infoln(printCollationKey(prevKey
));
1659 infoln(printCollationKey(key
));
1666 void CollationTest::checkCompareStrings(UCHARBUF
*f
, IcuTestErrorCode
&errorCode
) {
1667 if(errorCode
.isFailure()) { return; }
1668 UnicodeString prevFileLine
= UNICODE_STRING("(none)", 6);
1669 UnicodeString prevString
, s
;
1670 prevString
.getTerminatedBuffer(); // Ensure NUL-termination.
1671 while(readLine(f
, errorCode
)) {
1672 if(fileLine
.isEmpty()) { continue; }
1673 if(isSectionStarter(fileLine
[0])) { break; }
1674 Collation::Level relation
= parseRelationAndString(s
, errorCode
);
1675 if(errorCode
.isFailure()) {
1679 UCollationResult expectedOrder
= (relation
== Collation::ZERO_LEVEL
) ? UCOL_EQUAL
: UCOL_LESS
;
1680 Collation::Level expectedLevel
= relation
;
1681 s
.getTerminatedBuffer(); // Ensure NUL-termination.
1683 if(!needsNormalization(prevString
, errorCode
) && !needsNormalization(s
, errorCode
)) {
1684 coll
->setAttribute(UCOL_NORMALIZATION_MODE
, UCOL_OFF
, errorCode
);
1685 isOk
= checkCompareTwo("normalization=on", prevFileLine
, prevString
, s
,
1686 expectedOrder
, expectedLevel
, errorCode
);
1689 coll
->setAttribute(UCOL_NORMALIZATION_MODE
, UCOL_ON
, errorCode
);
1690 isOk
= checkCompareTwo("normalization=off", prevFileLine
, prevString
, s
,
1691 expectedOrder
, expectedLevel
, errorCode
);
1693 if(isOk
&& (!nfd
->isNormalized(prevString
, errorCode
) || !nfd
->isNormalized(s
, errorCode
))) {
1694 UnicodeString pn
= nfd
->normalize(prevString
, errorCode
);
1695 UnicodeString n
= nfd
->normalize(s
, errorCode
);
1696 pn
.getTerminatedBuffer();
1697 n
.getTerminatedBuffer();
1698 errorCode
.assertSuccess();
1699 isOk
= checkCompareTwo("NFD input", prevFileLine
, pn
, n
,
1700 expectedOrder
, expectedLevel
, errorCode
);
1703 errorCode
.reset(); // already reported
1705 prevFileLine
= fileLine
;
1707 prevString
.getTerminatedBuffer(); // Ensure NUL-termination.
1711 void CollationTest::TestDataDriven() {
1712 IcuTestErrorCode
errorCode(*this, "TestDataDriven");
1714 fcd
= Normalizer2Factory::getFCDInstance(errorCode
);
1715 nfd
= Normalizer2Factory::getNFDInstance(errorCode
);
1716 if(errorCode
.logDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) {
1720 CharString
path(getSourceTestData(errorCode
), errorCode
);
1721 path
.appendPathPart("collationtest.txt", errorCode
);
1722 const char *codePage
= "UTF-8";
1723 LocalUCHARBUFPointer
f(ucbuf_open(path
.data(), &codePage
, TRUE
, FALSE
, errorCode
));
1724 if(errorCode
.logIfFailureAndReset("ucbuf_open(collationtest.txt)")) {
1727 while(errorCode
.isSuccess()) {
1728 // Read a new line if necessary.
1729 // Sub-parsers leave the first line set that they do not handle.
1730 if(fileLine
.isEmpty()) {
1731 if(!readLine(f
.getAlias(), errorCode
)) { break; }
1734 if(!isSectionStarter(fileLine
[0])) {
1735 errln("syntax error on line %d", (int)fileLineNumber
);
1739 if(fileLine
.startsWith(UNICODE_STRING("** test: ", 9))) {
1740 fileTestName
= fileLine
;
1743 } else if(fileLine
== UNICODE_STRING("@ root", 6)) {
1744 setRootCollator(errorCode
);
1746 } else if(fileLine
.startsWith(UNICODE_STRING("@ locale ", 9))) {
1747 setLocaleCollator(errorCode
);
1749 } else if(fileLine
== UNICODE_STRING("@ rules", 7)) {
1750 buildTailoring(f
.getAlias(), errorCode
);
1751 } else if(fileLine
[0] == 0x25 && isSpace(fileLine
[1])) { // %
1752 parseAndSetAttribute(errorCode
);
1753 } else if(fileLine
== UNICODE_STRING("* compare", 9)) {
1754 checkCompareStrings(f
.getAlias(), errorCode
);
1756 errln("syntax error on line %d", (int)fileLineNumber
);
1763 #endif // !UCONFIG_NO_COLLATION