1 /********************************************************************
3 * Copyright (c) 1997-2011, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6 /********************************************************************************
10 * Modification History:
11 * Date Name Description
12 * Madhu Katragadda Ported for C API
13 * 02/19/01 synwee Modified test case for new collation iterator
14 *********************************************************************************/
16 * Collation Iterator tests.
17 * (Let me reiterate my position...)
20 #include "unicode/utypes.h"
22 #if !UCONFIG_NO_COLLATION
24 #include "unicode/ucol.h"
25 #include "unicode/ucoleitr.h"
26 #include "unicode/uloc.h"
27 #include "unicode/uchar.h"
28 #include "unicode/ustring.h"
29 #include "unicode/putil.h"
42 extern uint8_t ucol_uprv_getCaseBits(const UChar
*, uint32_t, UErrorCode
*);
44 void addCollIterTest(TestNode
** root
)
46 addTest(root
, &TestPrevious
, "tscoll/citertst/TestPrevious");
47 addTest(root
, &TestOffset
, "tscoll/citertst/TestOffset");
48 addTest(root
, &TestSetText
, "tscoll/citertst/TestSetText");
49 addTest(root
, &TestMaxExpansion
, "tscoll/citertst/TestMaxExpansion");
50 addTest(root
, &TestUnicodeChar
, "tscoll/citertst/TestUnicodeChar");
51 addTest(root
, &TestNormalizedUnicodeChar
,
52 "tscoll/citertst/TestNormalizedUnicodeChar");
53 addTest(root
, &TestNormalization
, "tscoll/citertst/TestNormalization");
54 addTest(root
, &TestBug672
, "tscoll/citertst/TestBug672");
55 addTest(root
, &TestBug672Normalize
, "tscoll/citertst/TestBug672Normalize");
56 addTest(root
, &TestSmallBuffer
, "tscoll/citertst/TestSmallBuffer");
57 addTest(root
, &TestCEs
, "tscoll/citertst/TestCEs");
58 addTest(root
, &TestDiscontiguos
, "tscoll/citertst/TestDiscontiguos");
59 addTest(root
, &TestCEBufferOverflow
, "tscoll/citertst/TestCEBufferOverflow");
60 addTest(root
, &TestCEValidity
, "tscoll/citertst/TestCEValidity");
61 addTest(root
, &TestSortKeyValidity
, "tscoll/citertst/TestSortKeyValidity");
62 addTest(root
, &TestSearchCollatorElements
, "tscoll/citertst/TestSearchCollatorElements");
/*
 * Locales opened by the TestBug672 and TestBug672Normalize loops. Those
 * loops index this table with i = 0..2, so it needs at least three entries.
 */
static const char *LOCALES[] = {
    "en_AU",
    "en_BE",
    "en_CA"
};
69 static void TestBug672() {
70 UErrorCode status
= U_ZERO_ERROR
;
76 u_uastrcpy(pattern
, "resume");
77 u_uastrcpy(text
, "Time to resume updating my resume.");
79 for (i
= 0; i
< 3; ++ i
) {
80 UCollator
*coll
= ucol_open(LOCALES
[i
], &status
);
81 UCollationElements
*pitr
= ucol_openElements(coll
, pattern
, -1,
83 UCollationElements
*titer
= ucol_openElements(coll
, text
, -1,
85 if (U_FAILURE(status
)) {
86 log_err_status(status
, "ERROR: in creation of either the collator or the collation iterator :%s\n",
91 log_verbose("locale tested %s\n", LOCALES
[i
]);
93 while (ucol_next(pitr
, &status
) != UCOL_NULLORDER
&&
96 if (U_FAILURE(status
)) {
97 log_err("ERROR: reversing collation iterator :%s\n",
103 ucol_setOffset(titer
, u_strlen(pattern
), &status
);
104 if (U_FAILURE(status
)) {
105 log_err("ERROR: setting offset in collator :%s\n",
106 myErrorName(status
));
109 result
[i
][0] = ucol_getOffset(titer
);
110 log_verbose("Text iterator set to offset %d\n", result
[i
][0]);
113 ucol_previous(titer
, &status
);
114 result
[i
][1] = ucol_getOffset(titer
);
115 log_verbose("Current offset %d after previous\n", result
[i
][1]);
117 /* Add one to index */
118 log_verbose("Adding one to current offset...\n");
119 ucol_setOffset(titer
, ucol_getOffset(titer
) + 1, &status
);
120 if (U_FAILURE(status
)) {
121 log_err("ERROR: setting offset in collator :%s\n",
122 myErrorName(status
));
125 result
[i
][2] = ucol_getOffset(titer
);
126 log_verbose("Current offset in text = %d\n", result
[i
][2]);
127 ucol_closeElements(pitr
);
128 ucol_closeElements(titer
);
132 if (uprv_memcmp(result
[0], result
[1], 3) != 0 ||
133 uprv_memcmp(result
[1], result
[2], 3) != 0) {
134 log_err("ERROR: Different locales have different offsets at the same character\n");
140 /* Running this test with normalization enabled showed up a bug in the incremental
141 normalization code. */
142 static void TestBug672Normalize() {
143 UErrorCode status
= U_ZERO_ERROR
;
149 u_uastrcpy(pattern
, "resume");
150 u_uastrcpy(text
, "Time to resume updating my resume.");
152 for (i
= 0; i
< 3; ++ i
) {
153 UCollator
*coll
= ucol_open(LOCALES
[i
], &status
);
154 UCollationElements
*pitr
= NULL
;
155 UCollationElements
*titer
= NULL
;
157 ucol_setAttribute(coll
, UCOL_NORMALIZATION_MODE
, UCOL_ON
, &status
);
159 pitr
= ucol_openElements(coll
, pattern
, -1, &status
);
160 titer
= ucol_openElements(coll
, text
, -1, &status
);
161 if (U_FAILURE(status
)) {
162 log_err_status(status
, "ERROR: in creation of either the collator or the collation iterator :%s\n",
163 myErrorName(status
));
167 log_verbose("locale tested %s\n", LOCALES
[i
]);
169 while (ucol_next(pitr
, &status
) != UCOL_NULLORDER
&&
172 if (U_FAILURE(status
)) {
173 log_err("ERROR: reversing collation iterator :%s\n",
174 myErrorName(status
));
179 ucol_setOffset(titer
, u_strlen(pattern
), &status
);
180 if (U_FAILURE(status
)) {
181 log_err("ERROR: setting offset in collator :%s\n",
182 myErrorName(status
));
185 result
[i
][0] = ucol_getOffset(titer
);
186 log_verbose("Text iterator set to offset %d\n", result
[i
][0]);
189 ucol_previous(titer
, &status
);
190 result
[i
][1] = ucol_getOffset(titer
);
191 log_verbose("Current offset %d after previous\n", result
[i
][1]);
193 /* Add one to index */
194 log_verbose("Adding one to current offset...\n");
195 ucol_setOffset(titer
, ucol_getOffset(titer
) + 1, &status
);
196 if (U_FAILURE(status
)) {
197 log_err("ERROR: setting offset in collator :%s\n",
198 myErrorName(status
));
201 result
[i
][2] = ucol_getOffset(titer
);
202 log_verbose("Current offset in text = %d\n", result
[i
][2]);
203 ucol_closeElements(pitr
);
204 ucol_closeElements(titer
);
208 if (uprv_memcmp(result
[0], result
[1], 3) != 0 ||
209 uprv_memcmp(result
[1], result
[2], 3) != 0) {
210 log_err("ERROR: Different locales have different offsets at the same character\n");
218 * Test for CollationElementIterator previous and next for the whole set of
219 * unicode characters.
221 static void TestUnicodeChar()
225 UCollationElements
*iter
;
226 UErrorCode status
= U_ZERO_ERROR
;
230 en_us
= ucol_open("en_US", &status
);
231 if (U_FAILURE(status
)){
232 log_err_status(status
, "ERROR: in creation of collation data using ucol_open()\n %s\n",
233 myErrorName(status
));
237 for (codepoint
= 1; codepoint
< 0xFFFE;)
241 while (codepoint
% 0xFF != 0)
243 if (u_isdefined(codepoint
))
244 *(test
++) = codepoint
;
248 if (u_isdefined(codepoint
))
249 *(test
++) = codepoint
;
251 if (codepoint
!= 0xFFFF)
255 iter
=ucol_openElements(en_us
, source
, u_strlen(source
), &status
);
256 if(U_FAILURE(status
)){
257 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
258 myErrorName(status
));
262 /* A basic test to see if it's working at all */
263 log_verbose("codepoint testing %x\n", codepoint
);
265 ucol_closeElements(iter
);
267 /* null termination test */
268 iter
=ucol_openElements(en_us
, source
, -1, &status
);
269 if(U_FAILURE(status
)){
270 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
271 myErrorName(status
));
275 /* A basic test to see if it's working at all */
277 ucol_closeElements(iter
);
284 * Test for CollationElementIterator previous and next for the whole set of
285 * unicode characters with normalization on.
287 static void TestNormalizedUnicodeChar()
291 UCollationElements
*iter
;
292 UErrorCode status
= U_ZERO_ERROR
;
296 /* thai should have normalization on */
297 th_th
= ucol_open("th_TH", &status
);
298 if (U_FAILURE(status
)){
299 log_err_status(status
, "ERROR: in creation of thai collation using ucol_open()\n %s\n",
300 myErrorName(status
));
304 for (codepoint
= 1; codepoint
< 0xFFFE;)
308 while (codepoint
% 0xFF != 0)
310 if (u_isdefined(codepoint
))
311 *(test
++) = codepoint
;
315 if (u_isdefined(codepoint
))
316 *(test
++) = codepoint
;
318 if (codepoint
!= 0xFFFF)
322 iter
=ucol_openElements(th_th
, source
, u_strlen(source
), &status
);
323 if(U_FAILURE(status
)){
324 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
325 myErrorName(status
));
331 ucol_closeElements(iter
);
333 iter
=ucol_openElements(th_th
, source
, -1, &status
);
334 if(U_FAILURE(status
)){
335 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
336 myErrorName(status
));
342 ucol_closeElements(iter
);
349 * Test the incremental normalization
351 static void TestNormalization()
353 UErrorCode status
= U_ZERO_ERROR
;
355 "&a < \\u0300\\u0315 < A\\u0300\\u0315 < \\u0316\\u0315B < \\u0316\\u0300\\u0315";
358 int rulelen
= u_unescape(str
, rule
, 50);
360 const char *testdata
[] =
361 {"\\u1ED9", "o\\u0323\\u0302",
362 "\\u0300\\u0315", "\\u0315\\u0300",
363 "A\\u0300\\u0315B", "A\\u0315\\u0300B",
364 "A\\u0316\\u0315B", "A\\u0315\\u0316B",
365 "\\u0316\\u0300\\u0315", "\\u0315\\u0300\\u0316",
366 "A\\u0316\\u0300\\u0315B", "A\\u0315\\u0300\\u0316B",
367 "\\u0316\\u0315\\u0300", "A\\u0316\\u0315\\u0300B"};
370 UCollationElements
*iter
;
372 coll
= ucol_openRules(rule
, rulelen
, UCOL_ON
, UCOL_TERTIARY
, NULL
, &status
);
373 ucol_setAttribute(coll
, UCOL_NORMALIZATION_MODE
, UCOL_ON
, &status
);
374 if (U_FAILURE(status
)){
375 log_err_status(status
, "ERROR: in creation of collator using ucol_openRules()\n %s\n",
376 myErrorName(status
));
380 srclen
= u_unescape(testdata
[0], source
, 10);
381 iter
= ucol_openElements(coll
, source
, srclen
, &status
);
383 ucol_closeElements(iter
);
385 srclen
= u_unescape(testdata
[1], source
, 10);
386 iter
= ucol_openElements(coll
, source
, srclen
, &status
);
388 ucol_closeElements(iter
);
391 srclen
= u_unescape(testdata
[count
], source
, 10);
392 iter
= ucol_openElements(coll
, source
, srclen
, &status
);
394 if (U_FAILURE(status
)){
395 log_err("ERROR: in creation of collator element iterator\n %s\n",
396 myErrorName(status
));
400 ucol_closeElements(iter
);
402 iter
= ucol_openElements(coll
, source
, -1, &status
);
404 if (U_FAILURE(status
)){
405 log_err("ERROR: in creation of collator element iterator\n %s\n",
406 myErrorName(status
));
410 ucol_closeElements(iter
);
417 * Test for CollationElementIterator.previous()
419 * @bug 4108758 - Make sure it works with contracting characters
422 static void TestPrevious()
424 UCollator
*coll
=NULL
;
427 UCollator
*c1
, *c2
, *c3
;
428 UCollationElements
*iter
;
429 UErrorCode status
= U_ZERO_ERROR
;
433 u_uastrcpy(test1
, "What subset of all possible test cases?");
434 u_uastrcpy(test2
, "has the highest probability of detecting");
435 coll
= ucol_open("en_US", &status
);
437 iter
=ucol_openElements(coll
, test1
, u_strlen(test1
), &status
);
438 log_verbose("English locale testing back and forth\n");
439 if(U_FAILURE(status
)){
440 log_err_status(status
, "ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
441 myErrorName(status
));
445 /* A basic test to see if it's working at all */
447 ucol_closeElements(iter
);
450 /* Test with a contracting character sequence */
451 u_uastrcpy(rule
, "&a,A < b,B < c,C, d,D < z,Z < ch,cH,Ch,CH");
452 c1
= ucol_openRules(rule
, u_strlen(rule
), UCOL_OFF
, UCOL_DEFAULT_STRENGTH
, NULL
, &status
);
454 log_verbose("Contraction rule testing back and forth with no normalization\n");
456 if (c1
== NULL
|| U_FAILURE(status
))
458 log_err("Couldn't create a RuleBasedCollator with a contracting sequence\n %s\n",
459 myErrorName(status
));
462 source
=(UChar
*)malloc(sizeof(UChar
) * 20);
463 u_uastrcpy(source
, "abchdcba");
464 iter
=ucol_openElements(c1
, source
, u_strlen(source
), &status
);
465 if(U_FAILURE(status
)){
466 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
467 myErrorName(status
));
471 ucol_closeElements(iter
);
474 /* Test with an expanding character sequence */
475 u_uastrcpy(rule
, "&a < b < c/abd < d");
476 c2
= ucol_openRules(rule
, u_strlen(rule
), UCOL_OFF
, UCOL_DEFAULT_STRENGTH
, NULL
, &status
);
477 log_verbose("Expansion rule testing back and forth with no normalization\n");
478 if (c2
== NULL
|| U_FAILURE(status
))
480 log_err("Couldn't create a RuleBasedCollator with a contracting sequence.\n %s\n",
481 myErrorName(status
));
484 u_uastrcpy(source
, "abcd");
485 iter
=ucol_openElements(c2
, source
, u_strlen(source
), &status
);
486 if(U_FAILURE(status
)){
487 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
488 myErrorName(status
));
492 ucol_closeElements(iter
);
495 u_uastrcpy(rule
, "&a < b < c/aba < d < z < ch");
496 c3
= ucol_openRules(rule
, u_strlen(rule
), UCOL_DEFAULT
, UCOL_DEFAULT_STRENGTH
,NULL
, &status
);
497 log_verbose("Expansion/contraction rule testing back and forth with no normalization\n");
499 if (c3
== NULL
|| U_FAILURE(status
))
501 log_err("Couldn't create a RuleBasedCollator with a contracting sequence.\n %s\n",
502 myErrorName(status
));
505 u_uastrcpy(source
, "abcdbchdc");
506 iter
=ucol_openElements(c3
, source
, u_strlen(source
), &status
);
507 if(U_FAILURE(status
)){
508 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
509 myErrorName(status
));
513 ucol_closeElements(iter
);
525 coll
= ucol_open("th_TH", &status
);
526 log_verbose("Thai locale testing back and forth with normalization\n");
527 iter
=ucol_openElements(coll
, source
, u_strlen(source
), &status
);
528 if(U_FAILURE(status
)){
529 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
530 myErrorName(status
));
534 ucol_closeElements(iter
);
544 coll
= ucol_open("ja_JP", &status
);
545 log_verbose("Japanese locale testing back and forth with normalization\n");
546 iter
=ucol_openElements(coll
, source
, u_strlen(source
), &status
);
547 if(U_FAILURE(status
)){
548 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
549 myErrorName(status
));
553 ucol_closeElements(iter
);
560 * Test for getOffset() and setOffset()
562 static void TestOffset()
564 UErrorCode status
= U_ZERO_ERROR
;
565 UCollator
*en_us
=NULL
;
566 UCollationElements
*iter
, *pristine
;
568 OrderAndOffset
*orders
;
569 int32_t orderLength
=0;
574 u_uastrcpy(test1
, "What subset of all possible test cases?");
575 u_uastrcpy(test2
, "has the highest probability of detecting");
576 en_us
= ucol_open("en_US", &status
);
577 log_verbose("Testing getOffset and setOffset for collations\n");
578 iter
= ucol_openElements(en_us
, test1
, u_strlen(test1
), &status
);
579 if(U_FAILURE(status
)){
580 log_err_status(status
, "ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
581 myErrorName(status
));
586 /* testing boundaries */
587 ucol_setOffset(iter
, 0, &status
);
588 if (U_FAILURE(status
) || ucol_previous(iter
, &status
) != UCOL_NULLORDER
) {
589 log_err("Error: After setting offset to 0, we should be at the end "
590 "of the backwards iteration");
592 ucol_setOffset(iter
, u_strlen(test1
), &status
);
593 if (U_FAILURE(status
) || ucol_next(iter
, &status
) != UCOL_NULLORDER
) {
594 log_err("Error: After setting offset to end of the string, we should "
595 "be at the end of the backwards iteration");
598 /* Run all the way through the iterator, then get the offset */
600 orders
= getOrders(iter
, &orderLength
);
602 offset
= ucol_getOffset(iter
);
604 if (offset
!= u_strlen(test1
))
606 log_err("offset at end != length %d vs %d\n", offset
,
610 /* Now set the offset back to the beginning and see if it works */
611 pristine
=ucol_openElements(en_us
, test1
, u_strlen(test1
), &status
);
612 if(U_FAILURE(status
)){
613 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
614 myErrorName(status
));
618 status
= U_ZERO_ERROR
;
620 ucol_setOffset(iter
, 0, &status
);
621 if (U_FAILURE(status
))
623 log_err("setOffset failed. %s\n", myErrorName(status
));
627 assertEqual(iter
, pristine
);
630 ucol_closeElements(pristine
);
631 ucol_closeElements(iter
);
634 /* testing offsets in normalization buffer */
640 ucol_setAttribute(en_us
, UCOL_NORMALIZATION_MODE
, UCOL_ON
, &status
);
641 iter
= ucol_openElements(en_us
, test1
, 4, &status
);
642 if(U_FAILURE(status
)){
643 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
644 myErrorName(status
));
650 while (ucol_next(iter
, &status
) != UCOL_NULLORDER
&&
654 if (ucol_getOffset(iter
) != 1) {
655 log_err("ERROR: Offset of iteration should be 1\n");
659 if (ucol_getOffset(iter
) != 4) {
660 log_err("ERROR: Offset of iteration should be 4\n");
664 if (ucol_getOffset(iter
) != 3) {
665 log_err("ERROR: Offset of iteration should be 3\n");
673 while (ucol_previous(iter
, &status
) != UCOL_NULLORDER
&&
678 if (ucol_getOffset(iter
) != 3) {
679 log_err("ERROR: Offset of iteration should be 3\n");
683 if (ucol_getOffset(iter
) != 1) {
684 log_err("ERROR: Offset of iteration should be 1\n");
688 if (ucol_getOffset(iter
) != 0) {
689 log_err("ERROR: Offset of iteration should be 0\n");
695 if(U_FAILURE(status
)){
696 log_err("ERROR: in iterating collation elements %s\n",
697 myErrorName(status
));
700 ucol_closeElements(iter
);
707 static void TestSetText()
710 UErrorCode status
= U_ZERO_ERROR
;
711 UCollator
*en_us
=NULL
;
712 UCollationElements
*iter1
, *iter2
;
716 u_uastrcpy(test1
, "What subset of all possible test cases?");
717 u_uastrcpy(test2
, "has the highest probability of detecting");
718 en_us
= ucol_open("en_US", &status
);
719 log_verbose("testing setText for Collation elements\n");
720 iter1
=ucol_openElements(en_us
, test1
, u_strlen(test1
), &status
);
721 if(U_FAILURE(status
)){
722 log_err_status(status
, "ERROR: in creation of collation element iterator1 using ucol_openElements()\n %s\n",
723 myErrorName(status
));
727 iter2
=ucol_openElements(en_us
, test2
, u_strlen(test2
), &status
);
728 if(U_FAILURE(status
)){
729 log_err("ERROR: in creation of collation element iterator2 using ucol_openElements()\n %s\n",
730 myErrorName(status
));
735 /* Run through the second iterator just to exercise it */
736 c
= ucol_next(iter2
, &status
);
739 while ( ++i
< 10 && (c
!= UCOL_NULLORDER
))
741 if (U_FAILURE(status
))
743 log_err("iter2->next() returned an error. %s\n", myErrorName(status
));
744 ucol_closeElements(iter2
);
745 ucol_closeElements(iter1
);
750 c
= ucol_next(iter2
, &status
);
753 /* Now set it to point to the same string as the first iterator */
754 ucol_setText(iter2
, test1
, u_strlen(test1
), &status
);
755 if (U_FAILURE(status
))
757 log_err("call to iter2->setText(test1) failed. %s\n", myErrorName(status
));
761 assertEqual(iter1
, iter2
);
764 /* Now set it to point to a null string with fake length*/
765 ucol_setText(iter2
, NULL
, 2, &status
);
766 if (U_FAILURE(status
))
768 log_err("call to iter2->setText(null) failed. %s\n", myErrorName(status
));
772 if (ucol_next(iter2
, &status
) != UCOL_NULLORDER
) {
773 log_err("iter2 with null text expected to return UCOL_NULLORDER\n");
777 ucol_closeElements(iter2
);
778 ucol_closeElements(iter1
);
783 * Test for getMaxExpansion()
785 static void TestMaxExpansion()
787 UErrorCode status
= U_ZERO_ERROR
;
788 UCollator
*coll
;/*= ucol_open("en_US", &status);*/
790 UChar32 unassigned
= 0xEFFFD;
791 UChar supplementary
[2];
792 uint32_t stringOffset
= 0;
793 UBool isError
= FALSE
;
795 UCollationElements
*iter
;/*= ucol_openElements(coll, &ch, 1, &status);*/
796 uint32_t temporder
= 0;
799 u_uastrcpy(rule
, "&a < ab < c/aba < d < z < ch");
800 coll
= ucol_openRules(rule
, u_strlen(rule
), UCOL_DEFAULT
,
801 UCOL_DEFAULT_STRENGTH
,NULL
, &status
);
802 if(U_SUCCESS(status
) && coll
) {
803 iter
= ucol_openElements(coll
, &ch
, 1, &status
);
805 while (ch
< 0xFFFF && U_SUCCESS(status
)) {
812 ucol_setText(iter
, &ch
, 1, &status
);
813 order
= ucol_previous(iter
, &status
);
815 /* thai management */
817 order
= ucol_previous(iter
, &status
);
819 while (U_SUCCESS(status
) &&
820 ucol_previous(iter
, &status
) != UCOL_NULLORDER
) {
824 size
= ucol_getMaxExpansion(iter
, order
);
825 if (U_FAILURE(status
) || size
< count
) {
826 log_err("Failure at codepoint %d, maximum expansion count < %d\n",
831 /* testing for exact max expansion */
836 ucol_setText(iter
, &ch
, 1, &status
);
837 order
= ucol_previous(iter
, &status
);
838 size
= ucol_getMaxExpansion(iter
, order
);
839 if (U_FAILURE(status
) || size
!= 1) {
840 log_err("Failure at codepoint %d, maximum expansion count < %d\n",
847 ucol_setText(iter
, &ch
, 1, &status
);
848 temporder
= ucol_previous(iter
, &status
);
850 if (U_FAILURE(status
) || ucol_getMaxExpansion(iter
, temporder
) != 3) {
851 log_err("Failure at codepoint %d, maximum expansion count != %d\n",
856 ucol_setText(iter
, &ch
, 1, &status
);
857 temporder
= ucol_previous(iter
, &status
);
859 if (U_FAILURE(status
) || ucol_getMaxExpansion(iter
, temporder
) != 1) {
860 log_err("Failure at codepoint %d, maximum expansion count != %d\n",
864 U16_APPEND(supplementary
, stringOffset
, 2, unassigned
, isError
);
865 ucol_setText(iter
, supplementary
, 2, &status
);
866 sorder
= ucol_previous(iter
, &status
);
868 if (U_FAILURE(status
) || ucol_getMaxExpansion(iter
, sorder
) != 2) {
869 log_err("Failure at codepoint %d, maximum expansion count < %d\n",
876 ucol_setText(iter
, &ch
, 1, &status
);
877 temporder
= ucol_previous(iter
, &status
);
878 if (U_FAILURE(status
) || ucol_getMaxExpansion(iter
, temporder
) > 3) {
879 log_err("Failure at codepoint %d, maximum expansion count > %d\n",
883 ucol_closeElements(iter
);
886 /* testing special jamo &a<\u1160 */
898 coll
= ucol_openRules(rule
, u_strlen(rule
), UCOL_DEFAULT
,
899 UCOL_DEFAULT_STRENGTH
,NULL
, &status
);
900 iter
= ucol_openElements(coll
, &ch
, 1, &status
);
902 temporder
= ucol_previous(iter
, &status
);
903 if (U_FAILURE(status
) || ucol_getMaxExpansion(iter
, temporder
) != 6) {
904 log_err("Failure at codepoint %d, maximum expansion count > %d\n",
908 ucol_closeElements(iter
);
911 log_err_status(status
, "Couldn't open collator -> %s\n", u_errorName(status
));
917 static void assertEqual(UCollationElements
*i1
, UCollationElements
*i2
)
921 UErrorCode status
= U_ZERO_ERROR
;
925 c1
= ucol_next(i1
, &status
);
926 c2
= ucol_next(i2
, &status
);
930 log_err("Error in iteration %d assetEqual between\n %d and %d, they are not equal\n", count
, c1
, c2
);
936 while (c1
!= UCOL_NULLORDER
);
940 * Testing iterators with extremely small buffers
942 static void TestSmallBuffer()
944 UErrorCode status
= U_ZERO_ERROR
;
946 UCollationElements
*testiter
,
949 OrderAndOffset
*testorders
,
953 UChar str
[] = {0x300, 0x31A, 0};
955 creating a long string of decomposable characters,
956 since by default the writable buffer is of size 256
958 while (count
< 500) {
959 if ((count
& 1) == 0) {
960 teststr
[count
++] = 0x300;
963 teststr
[count
++] = 0x31A;
967 coll
= ucol_open("th_TH", &status
);
968 if(U_SUCCESS(status
) && coll
) {
969 testiter
= ucol_openElements(coll
, teststr
, 500, &status
);
970 iter
= ucol_openElements(coll
, str
, 2, &status
);
972 orders
= getOrders(iter
, &count
);
974 log_err("Error collation elements size is not 2 for \\u0300\\u031A\n");
978 this will rearrange the string data to 250 characters of 0x300 first then
979 250 characters of 0x031A
981 testorders
= getOrders(testiter
, &count
);
984 log_err("Error decomposition does not give the right sized collation elements\n");
988 /* UCA collation element for 0x0F76 */
989 if ((count
> 250 && testorders
[-- count
].order
!= orders
[1].order
) ||
990 (count
<= 250 && testorders
[-- count
].order
!= orders
[0].order
)) {
991 log_err("Error decomposition does not give the right collation element at %d count\n", count
);
999 ucol_reset(testiter
);
1001 /* ensures closing of elements done properly to clear writable buffer */
1002 ucol_next(testiter
, &status
);
1003 ucol_next(testiter
, &status
);
1004 ucol_closeElements(testiter
);
1005 ucol_closeElements(iter
);
1008 log_err_status(status
, "Couldn't open collator -> %s\n", u_errorName(status
));
1013 * Snippets of code from genuca
1015 static int32_t hex2num(char hex
) {
1016 if(hex
>='0' && hex
<='9') {
1018 } else if(hex
>='a' && hex
<='f') {
1020 } else if(hex
>='A' && hex
<='F') {
1028 * Getting codepoints from a string
1029 * @param str character string containing codepoints separated by space and ended
1031 * @param codepoints array for storage, assuming size > 5
1032 * @return position at the end of the codepoint section
1034 static char *getCodePoints(char *str
, UChar
*codepoints
, UChar
*contextCPs
) {
1035 UErrorCode errorCode
= U_ZERO_ERROR
;
1036 char *semi
= uprv_strchr(str
, ';');
1037 char *pipe
= uprv_strchr(str
, '|');
1042 log_err("expected semicolon after code point string in FractionalUCA.txt %s\n", str
);
1046 int32_t contextLength
;
1048 contextLength
= u_parseString(str
, contextCPs
, 99, NULL
, &errorCode
);
1050 if(U_FAILURE(errorCode
)) {
1051 log_err("error parsing precontext string from FractionalUCA.txt %s\n", str
);
1054 /* prepend the precontext string to the codepoints */
1055 u_memcpy(codepoints
, contextCPs
, contextLength
);
1056 codepoints
+= contextLength
;
1057 /* start of the code point string */
1062 u_parseString(s
, codepoints
, 99, NULL
, &errorCode
);
1063 if(U_FAILURE(errorCode
)) {
1064 log_err("error parsing code point string from FractionalUCA.txt %s\n", str
);
1071 * Snippets of code from genuca
1074 readElement(char **from
, char *to
, char separator
, UErrorCode
*status
)
1076 if (U_SUCCESS(*status
)) {
1079 while (**from
!= separator
) {
1080 if (**from
!= ' ') {
1081 *(buffer
+i
++) = **from
;
1095 * Snippets of code from genuca
1098 getSingleCEValue(char *primary
, char *secondary
, char *tertiary
,
1101 if (U_SUCCESS(*status
)) {
1103 char primsave
= '\0';
1104 char secsave
= '\0';
1105 char tersave
= '\0';
1106 char *primend
= primary
+4;
1107 char *secend
= secondary
+2;
1108 char *terend
= tertiary
+2;
1113 if (uprv_strlen(primary
) > 4) {
1114 primsave
= *primend
;
1118 if (uprv_strlen(secondary
) > 2) {
1123 if (uprv_strlen(tertiary
) > 2) {
1128 primvalue
= (*primary
!='\0')?uprv_strtoul(primary
, &primend
, 16):0;
1129 secvalue
= (*secondary
!='\0')?uprv_strtoul(secondary
, &secend
, 16):0;
1130 tervalue
= (*tertiary
!='\0')?uprv_strtoul(tertiary
, &terend
, 16):0;
1131 if(primvalue
<= 0xFF) {
1135 value
= ((primvalue
<< UCOL_PRIMARYORDERSHIFT
) & UCOL_PRIMARYORDERMASK
)
1136 | ((secvalue
<< UCOL_SECONDARYORDERSHIFT
) & UCOL_SECONDARYORDERMASK
)
1137 | (tervalue
& UCOL_TERTIARYORDERMASK
);
1139 if(primsave
!='\0') {
1140 *primend
= primsave
;
1154 * Getting collation elements generated from a string
1155 * @param str character string containing collation elements enclosed in [] and
1156 * separated by spaces
1157 * @param ces array for storage, assuming size > 20
1158 * @param status error status
1159 * @return position at the end of the codepoint section
1161 static char * getCEs(char *str
, uint32_t *ces
, UErrorCode
*status
) {
1162 char *pStartCP
= uprv_strchr(str
, '[');
1166 char secondary
[100];
1169 while (*pStartCP
== '[') {
1170 uint32_t primarycount
= 0;
1171 uint32_t secondarycount
= 0;
1172 uint32_t tertiarycount
= 0;
1174 pEndCP
= strchr(pStartCP
, ']');
1175 if(pEndCP
== NULL
) {
1180 primarycount
= readElement(&pStartCP
, primary
, ',', status
);
1181 secondarycount
= readElement(&pStartCP
, secondary
, ',', status
);
1182 tertiarycount
= readElement(&pStartCP
, tertiary
, ']', status
);
1184 /* I want to get the CEs entered right here, including continuation */
1185 ces
[count
++] = getSingleCEValue(primary
, secondary
, tertiary
, status
);
1186 if (U_FAILURE(*status
)) {
1190 while (2 * CEi
< primarycount
|| CEi
< secondarycount
||
1191 CEi
< tertiarycount
) {
1192 uint32_t value
= UCOL_CONTINUATION_MARKER
; /* Continuation marker */
1193 if (2 * CEi
< primarycount
) {
1194 value
|= ((hex2num(*(primary
+ 4 * CEi
)) & 0xF) << 28);
1195 value
|= ((hex2num(*(primary
+ 4 * CEi
+ 1)) & 0xF) << 24);
1198 if (2 * CEi
+ 1 < primarycount
) {
1199 value
|= ((hex2num(*(primary
+ 4 * CEi
+ 2)) & 0xF) << 20);
1200 value
|= ((hex2num(*(primary
+ 4 * CEi
+ 3)) &0xF) << 16);
1203 if (CEi
< secondarycount
) {
1204 value
|= ((hex2num(*(secondary
+ 2 * CEi
)) & 0xF) << 12);
1205 value
|= ((hex2num(*(secondary
+ 2 * CEi
+ 1)) & 0xF) << 8);
1208 if (CEi
< tertiarycount
) {
1209 value
|= ((hex2num(*(tertiary
+ 2 * CEi
)) & 0x3) << 4);
1210 value
|= (hex2num(*(tertiary
+ 2 * CEi
+ 1)) & 0xF);
1214 ces
[count
++] = value
;
1217 pStartCP
= pEndCP
+ 1;
1224 * Getting the FractionalUCA.txt file stream
1226 static FileStream
* getFractionalUCA(void)
1229 char backupPath
[256];
1230 FileStream
*result
= NULL
;
1232 /* Look inside ICU_DATA first */
1233 uprv_strcpy(newPath
, ctest_dataSrcDir());
1234 uprv_strcat(newPath
, "unidata" U_FILE_SEP_STRING
);
1235 uprv_strcat(newPath
, "FractionalUCA.txt");
1237 /* As a fallback, try to guess where the source data was located
1238 * at the time ICU was built, and look there.
1240 #if defined (U_TOPSRCDIR)
1241 strcpy(backupPath
, U_TOPSRCDIR U_FILE_SEP_STRING
"data");
1244 UErrorCode errorCode
= U_ZERO_ERROR
;
1245 strcpy(backupPath
, loadTestData(&errorCode
));
1246 strcat(backupPath
, U_FILE_SEP_STRING
".." U_FILE_SEP_STRING
".." U_FILE_SEP_STRING
".." U_FILE_SEP_STRING
".." U_FILE_SEP_STRING
"data");
1249 strcat(backupPath
, U_FILE_SEP_STRING
"unidata" U_FILE_SEP_STRING
"FractionalUCA.txt");
1251 result
= T_FileStream_open(newPath
, "rb");
1253 if (result
== NULL
) {
1254 result
= T_FileStream_open(backupPath
, "rb");
1255 if (result
== NULL
) {
1256 log_err("Failed to open either %s or %s\n", newPath
, backupPath
);
1263 * Testing the CEs returned by the iterator
1265 static void TestCEs() {
1266 FileStream
*file
= NULL
;
1269 UChar codepoints
[10];
1271 UErrorCode status
= U_ZERO_ERROR
;
1272 UCollator
*coll
= ucol_open("", &status
);
1273 uint32_t lineNo
= 0;
1274 UChar contextCPs
[5];
1276 if (U_FAILURE(status
)) {
1277 log_err_status(status
, "Error in opening root collator -> %s\n", u_errorName(status
));
1281 file
= getFractionalUCA();
1284 log_err("*** unable to open input FractionalUCA.txt file ***\n");
1289 while (T_FileStream_readLine(file
, line
, sizeof(line
)) != NULL
) {
1291 UCollationElements
*iter
;
1292 int32_t preContextCeLen
=0;
1294 /* skip this line if it is empty or a comment or is a return value
1295 or start of some variable section */
1296 if(line
[0] == 0 || line
[0] == '#' || line
[0] == '\n' ||
1297 line
[0] == 0x000D || line
[0] == '[') {
1301 str
= getCodePoints(line
, codepoints
, contextCPs
);
1303 /* these are 'fake' codepoints in the fractional UCA, and are used just
1304 * for positioning of indirect values. They should not go through this
1307 if(*codepoints
== 0xFDD0) {
1310 if (*contextCPs
!= 0) {
1311 iter
= ucol_openElements(coll
, contextCPs
, -1, &status
);
1312 if (U_FAILURE(status
)) {
1313 log_err("Error in opening collation elements\n");
1316 while((ces
[preContextCeLen
] = ucol_next(iter
, &status
)) != (uint32_t)UCOL_NULLORDER
) {
1319 ucol_closeElements(iter
);
1322 getCEs(str
, ces
+preContextCeLen
, &status
);
1323 if (U_FAILURE(status
)) {
1324 log_err("Error in parsing collation elements in FractionalUCA.txt\n");
1327 iter
= ucol_openElements(coll
, codepoints
, -1, &status
);
1328 if (U_FAILURE(status
)) {
1329 log_err("Error in opening collation elements\n");
1333 uint32_t ce
= (uint32_t)ucol_next(iter
, &status
);
1334 if (ce
== 0xFFFFFFFF) {
1337 /* we now unconditionally reorder Thai/Lao prevowels, so this
1338 * test would fail if we don't skip here.
1340 if(UCOL_ISTHAIPREVOWEL(*codepoints
) && ce
== 0 && count
== 0) {
1343 if (ce
!= ces
[count
] || U_FAILURE(status
)) {
1344 log_err("Collation elements in FractionalUCA.txt and iterators do not match!\n");
1347 if (ces
[count
] == 0) {
1352 ucol_closeElements(iter
);
1355 T_FileStream_close(file
);
1360 * Testing the discontigous contractions
1362 static void TestDiscontiguos() {
1363 const char *rulestr
=
1364 "&z < AB < X\\u0300 < ABC < X\\u0300\\u0315";
1366 int rulelen
= u_unescape(rulestr
, rule
, 50);
1367 const char *src
[] = {
1368 "ADB", "ADBC", "A\\u0315B", "A\\u0315BC",
1369 /* base character blocked */
1370 "XD\\u0300", "XD\\u0300\\u0315",
1371 /* non blocking combining character */
1372 "X\\u0319\\u0300", "X\\u0319\\u0300\\u0315",
1373 /* blocking combining character */
1374 "X\\u0314\\u0300", "X\\u0314\\u0300\\u0315",
1375 /* contraction prefix */
1376 "ABDC", "AB\\u0315C","X\\u0300D\\u0315", "X\\u0300\\u0319\\u0315",
1377 "X\\u0300\\u031A\\u0315",
1378 /* ends not with a contraction character */
1379 "X\\u0319\\u0300D", "X\\u0319\\u0300\\u0315D", "X\\u0300D\\u0315D",
1380 "X\\u0300\\u0319\\u0315D", "X\\u0300\\u031A\\u0315D"
1382 const char *tgt
[] = {
1383 /* non blocking combining character */
1384 "A D B", "A D BC", "A \\u0315 B", "A \\u0315 BC",
1385 /* base character blocked */
1386 "X D \\u0300", "X D \\u0300\\u0315",
1387 /* non blocking combining character */
1388 "X\\u0300 \\u0319", "X\\u0300\\u0315 \\u0319",
1389 /* blocking combining character */
1390 "X \\u0314 \\u0300", "X \\u0314 \\u0300\\u0315",
1391 /* contraction prefix */
1392 "AB DC", "AB \\u0315 C","X\\u0300 D \\u0315", "X\\u0300\\u0315 \\u0319",
1393 "X\\u0300 \\u031A \\u0315",
1394 /* ends not with a contraction character */
1395 "X\\u0300 \\u0319D", "X\\u0300\\u0315 \\u0319D", "X\\u0300 D\\u0315D",
1396 "X\\u0300\\u0315 \\u0319D", "X\\u0300 \\u031A\\u0315D"
1400 UErrorCode status
= U_ZERO_ERROR
;
1402 UCollationElements
*iter
;
1403 UCollationElements
*resultiter
;
1405 coll
= ucol_openRules(rule
, rulelen
, UCOL_OFF
, UCOL_DEFAULT_STRENGTH
,NULL
, &status
);
1406 iter
= ucol_openElements(coll
, rule
, 1, &status
);
1407 resultiter
= ucol_openElements(coll
, rule
, 1, &status
);
1409 if (U_FAILURE(status
)) {
1410 log_err_status(status
, "Error opening collation rules -> %s\n", u_errorName(status
));
1414 while (count
< size
) {
1417 int strLen
= u_unescape(src
[count
], str
, 20);
1420 ucol_setText(iter
, str
, strLen
, &status
);
1421 if (U_FAILURE(status
)) {
1422 log_err("Error opening collation iterator\n");
1426 u_unescape(tgt
[count
], tstr
, 20);
1429 log_verbose("count %d\n", count
);
1433 UChar
*e
= u_strchr(s
, 0x20);
1437 ucol_setText(resultiter
, s
, (int32_t)(e
- s
), &status
);
1438 ce
= ucol_next(resultiter
, &status
);
1439 if (U_FAILURE(status
)) {
1440 log_err("Error manipulating collation iterator\n");
1443 while (ce
!= UCOL_NULLORDER
) {
1444 if (ce
!= (uint32_t)ucol_next(iter
, &status
) ||
1445 U_FAILURE(status
)) {
1446 log_err("Discontiguos contraction test mismatch\n");
1449 ce
= ucol_next(resultiter
, &status
);
1450 if (U_FAILURE(status
)) {
1451 log_err("Error getting next collation element\n");
1464 ucol_closeElements(resultiter
);
1465 ucol_closeElements(iter
);
1469 static void TestCEBufferOverflow()
1471 UChar str
[UCOL_EXPAND_CE_BUFFER_SIZE
+ 1];
1472 UErrorCode status
= U_ZERO_ERROR
;
1475 UCollationElements
*iter
;
1477 u_uastrcpy(rule
, "&z < AB");
1478 coll
= ucol_openRules(rule
, u_strlen(rule
), UCOL_OFF
, UCOL_DEFAULT_STRENGTH
, NULL
,&status
);
1479 if (U_FAILURE(status
)) {
1480 log_err_status(status
, "Rule based collator not created for testing ce buffer overflow -> %s\n", u_errorName(status
));
1484 /* 0xDCDC is a trail surrogate hence deemed unsafe by the heuristic
1485 test. this will cause an overflow in getPrev */
1486 str
[0] = 0x0041; /* 'A' */
1487 /*uprv_memset(str + 1, 0xE0, sizeof(UChar) * UCOL_EXPAND_CE_BUFFER_SIZE);*/
1488 uprv_memset(str
+ 1, 0xDC, sizeof(UChar
) * UCOL_EXPAND_CE_BUFFER_SIZE
);
1489 str
[UCOL_EXPAND_CE_BUFFER_SIZE
] = 0x0042; /* 'B' */
1490 iter
= ucol_openElements(coll
, str
, UCOL_EXPAND_CE_BUFFER_SIZE
+ 1,
1492 if (ucol_previous(iter
, &status
) == UCOL_NULLORDER
||
1493 status
== U_BUFFER_OVERFLOW_ERROR
) {
1494 log_err("CE buffer should not overflow with long string of trail surrogates\n");
1496 ucol_closeElements(iter
);
1501 * Checking collation element validity.
1503 #define MAX_CODEPOINTS_TO_SHOW 10
1504 static void showCodepoints(const UChar
*codepoints
, int length
, char * codepointText
) {
1505 int i
, lengthToUse
= length
;
1506 if (lengthToUse
> MAX_CODEPOINTS_TO_SHOW
) {
1507 lengthToUse
= MAX_CODEPOINTS_TO_SHOW
;
1509 for (i
= 0; i
< lengthToUse
; ++i
) {
1510 int bytesWritten
= sprintf(codepointText
, " %04X", *codepoints
++);
1511 if (bytesWritten
<= 0) {
1514 codepointText
+= bytesWritten
;
1517 sprintf(codepointText
, " ...");
1521 static UBool
checkCEValidity(const UCollator
*coll
, const UChar
*codepoints
,
1524 UErrorCode status
= U_ZERO_ERROR
;
1525 UCollationElements
*iter
= ucol_openElements(coll
, codepoints
, length
,
1527 UBool result
= FALSE
;
1528 UBool primaryDone
= FALSE
, secondaryDone
= FALSE
, tertiaryDone
= FALSE
;
1529 const char * collLocale
;
1531 if (U_FAILURE(status
)) {
1532 log_err("Error creating iterator for testing validity\n");
1535 collLocale
= ucol_getLocale(coll
, ULOC_VALID_LOCALE
, &status
);
1536 if (U_FAILURE(status
) || collLocale
==NULL
) {
1537 status
= U_ZERO_ERROR
;
1542 uint32_t ce
= ucol_next(iter
, &status
);
1543 uint32_t primary
, p1
, p2
, secondary
, tertiary
;
1544 if (ce
== UCOL_NULLORDER
) {
1551 if (ce
== 0x02000202) {
1552 /* special CE for merge-sort character */
1553 if (*codepoints
== 0xFFFE /* && length == 1 */) {
1555 * Note: We should check for length==1 but the token parser appears
1556 * to give us trailing NUL characters.
1557 * TODO: Ticket #8047: Change TestCEValidity to use ucol_getTailoredSet()
1558 * rather than the internal collation rule parser
1562 log_err("Special 02/02/02 weight for code point U+%04X [len %d] != U+FFFE\n",
1563 (int)*codepoints
, (int)length
);
1567 primary
= UCOL_PRIMARYORDER(ce
);
1569 p2
= primary
& 0xFF;
1570 secondary
= UCOL_SECONDARYORDER(ce
);
1571 tertiary
= UCOL_TERTIARYORDER(ce
) & UCOL_REMOVE_CONTINUATION
;
1573 if (!isContinuation(ce
)) {
1574 if ((ce
& UCOL_REMOVE_CONTINUATION
) == 0) {
1575 log_err("Empty CE %08lX except for case bits\n", (long)ce
);
1580 log_err("Primary 00 xx in %08lX\n", (long)ce
);
1585 if (p1
<= 2 || p1
>= 0xF0) {
1586 /* Primary first bytes F0..FF are specials. */
1587 log_err("Primary first byte of %08lX out of range\n", (long)ce
);
1593 if (p2
<= 3 || p2
>= 0xFF) {
1594 /* Primary second bytes 03 and FF are sort key compression terminators. */
1595 log_err("Primary second byte of %08lX out of range\n", (long)ce
);
1598 primaryDone
= FALSE
;
1601 if (secondary
== 0) {
1603 log_err("Primary!=0 secondary==0 in %08lX\n", (long)ce
);
1606 secondaryDone
= TRUE
;
1608 if (secondary
<= 2 ||
1609 (UCOL_BYTE_COMMON
< secondary
&& secondary
<= (UCOL_BYTE_COMMON
+ 0x80))
1611 /* Secondary first bytes common+1..+0x80 are used for sort key compression. */
1612 log_err("Secondary byte of %08lX out of range\n", (long)ce
);
1615 secondaryDone
= FALSE
;
1617 if (tertiary
== 0) {
1618 /* We know that ce != 0. */
1619 log_err("Primary!=0 or secondary!=0 but tertiary==0 in %08lX\n", (long)ce
);
1622 if (tertiary
<= 2) {
1623 log_err("Tertiary byte of %08lX out of range\n", (long)ce
);
1626 tertiaryDone
= FALSE
;
1628 if ((ce
& UCOL_REMOVE_CONTINUATION
) == 0) {
1629 log_err("Empty continuation %08lX\n", (long)ce
);
1632 if (primaryDone
&& primary
!= 0) {
1633 log_err("Primary was done but continues in %08lX\n", (long)ce
);
1638 log_err("Primary 00 xx in %08lX\n", (long)ce
);
1644 log_err("Primary first byte of %08lX out of range\n", (long)ce
);
1651 log_err("Primary second byte of %08lX out of range\n", (long)ce
);
1656 if (secondaryDone
&& secondary
!= 0) {
1657 log_err("Secondary was done but continues in %08lX\n", (long)ce
);
1660 if (secondary
== 0) {
1661 secondaryDone
= TRUE
;
1663 if (secondary
<= 2) {
1664 log_err("Secondary byte of %08lX out of range\n", (long)ce
);
1668 if (tertiaryDone
&& tertiary
!= 0) {
1669 log_err("Tertiary was done but continues in %08lX\n", (long)ce
);
1672 if (tertiary
== 0) {
1673 tertiaryDone
= TRUE
;
1674 } else if (tertiary
<= 2) {
1675 log_err("Tertiary byte of %08lX out of range\n", (long)ce
);
1681 char codepointText
[5*MAX_CODEPOINTS_TO_SHOW
+ 5];
1682 showCodepoints(codepoints
, length
, codepointText
);
1683 log_err("Locale: %s Code point string: %s\n", collLocale
, codepointText
);
1685 ucol_closeElements(iter
);
1689 static void TestCEValidity()
1691 /* testing UCA collation elements */
1692 UErrorCode status
= U_ZERO_ERROR
;
1693 /* en_US has no tailorings */
1694 UCollator
*coll
= ucol_open("root", &status
);
1695 /* tailored locales */
1696 char locale
[][11] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN", "zh__PINYIN"};
1698 FileStream
*file
= NULL
;
1700 UChar codepoints
[11];
1703 UChar contextCPs
[3];
1705 UParseError parseError
;
1706 if (U_FAILURE(status
)) {
1707 log_err_status(status
, "en_US collator creation failed -> %s\n", u_errorName(status
));
1710 log_verbose("Testing UCA elements\n");
1711 file
= getFractionalUCA();
1713 log_err("Fractional UCA data can not be opened\n");
1717 while (T_FileStream_readLine(file
, line
, sizeof(line
)) != NULL
) {
1718 if(line
[0] == 0 || line
[0] == '#' || line
[0] == '\n' ||
1719 line
[0] == 0x000D || line
[0] == '[') {
1723 getCodePoints(line
, codepoints
, contextCPs
);
1724 checkCEValidity(coll
, codepoints
, u_strlen(codepoints
));
1727 log_verbose("Testing UCA elements for the whole range of unicode characters\n");
1728 for (c
= 0; c
<= 0xffff; ++c
) {
1729 if (u_isdefined(c
)) {
1730 codepoints
[0] = (UChar
)c
;
1731 checkCEValidity(coll
, codepoints
, 1);
1734 for (; c
<= 0x10ffff; ++c
) {
1735 if (u_isdefined(c
)) {
1737 U16_APPEND_UNSAFE(codepoints
, i
, c
);
1738 checkCEValidity(coll
, codepoints
, i
);
1744 /* testing tailored collation elements */
1745 log_verbose("Testing tailored elements\n");
1746 if(getTestOption(QUICK_OPTION
)) {
1747 maxCount
= sizeof(locale
)/sizeof(locale
[0]);
1749 maxCount
= uloc_countAvailable();
1751 while (count
< maxCount
) {
1752 const UChar
*rules
= NULL
,
1754 UChar
*rulesCopy
= NULL
;
1755 int32_t ruleLen
= 0;
1757 uint32_t chOffset
= 0;
1759 uint32_t exOffset
= 0;
1761 uint32_t prefixOffset
= 0;
1762 uint32_t prefixLen
= 0;
1763 UBool startOfRules
= TRUE
;
1766 UColTokenParser src
;
1767 uint32_t strength
= 0;
1769 if(getTestOption(QUICK_OPTION
)) {
1770 loc
= locale
[count
];
1772 loc
= uloc_getAvailable(count
);
1773 if(!hasCollationElements(loc
)) {
1779 uprv_memset(&src
, 0, sizeof(UColTokenParser
));
1781 log_verbose("Testing CEs for %s\n", loc
);
1783 coll
= ucol_open(loc
, &status
);
1784 if (U_FAILURE(status
)) {
1785 log_err("%s collator creation failed\n", loc
);
1790 rules
= ucol_getRules(coll
, &ruleLen
);
1793 rulesCopy
= (UChar
*)uprv_malloc((ruleLen
+
1794 UCOL_TOK_EXTRA_RULE_SPACE_SIZE
) * sizeof(UChar
));
1795 uprv_memcpy(rulesCopy
, rules
, ruleLen
* sizeof(UChar
));
1796 src
.current
= src
.source
= rulesCopy
;
1797 src
.end
= rulesCopy
+ ruleLen
;
1798 src
.extraCurrent
= src
.end
;
1799 src
.extraEnd
= src
.end
+ UCOL_TOK_EXTRA_RULE_SPACE_SIZE
;
1801 /* Note that as a result of tickets 7015 or 6912, ucol_tok_parseNextToken can cause the pointer to
1802 the rules copy in src.source to get reallocated, freeing the original pointer in rulesCopy */
1803 while ((current
= ucol_tok_parseNextToken(&src
, startOfRules
, &parseError
,&status
)) != NULL
) {
1804 strength
= src
.parsedToken
.strength
;
1805 chOffset
= src
.parsedToken
.charsOffset
;
1806 chLen
= src
.parsedToken
.charsLen
;
1807 exOffset
= src
.parsedToken
.extensionOffset
;
1808 exLen
= src
.parsedToken
.extensionLen
;
1809 prefixOffset
= src
.parsedToken
.prefixOffset
;
1810 prefixLen
= src
.parsedToken
.prefixLen
;
1811 specs
= src
.parsedToken
.flags
;
1813 startOfRules
= FALSE
;
1814 uprv_memcpy(codepoints
, src
.source
+ chOffset
,
1815 chLen
* sizeof(UChar
));
1816 codepoints
[chLen
] = 0;
1817 checkCEValidity(coll
, codepoints
, chLen
);
1819 uprv_free(src
.source
);
1825 T_FileStream_close(file
);
1828 static void printSortKeyError(const UChar
*codepoints
, int length
,
1829 uint8_t *sortkey
, int sklen
)
1832 log_err("Sortkey not valid for ");
1833 while (length
> 0) {
1834 log_err("0x%04x ", *codepoints
);
1838 log_err("\nSortkey : ");
1839 while (count
< sklen
) {
1840 log_err("0x%02x ", sortkey
[count
]);
1847 * Checking sort key validity for all levels
1849 static UBool
checkSortKeyValidity(UCollator
*coll
,
1850 const UChar
*codepoints
,
1853 UErrorCode status
= U_ZERO_ERROR
;
1854 UCollationStrength strength
[5] = {UCOL_PRIMARY
, UCOL_SECONDARY
,
1855 UCOL_TERTIARY
, UCOL_QUATERNARY
,
1857 int strengthlen
= 5;
1858 int strengthIndex
= 0;
1861 while (caselevel
< 1) {
1862 if (caselevel
== 0) {
1863 ucol_setAttribute(coll
, UCOL_CASE_LEVEL
, UCOL_OFF
, &status
);
1866 ucol_setAttribute(coll
, UCOL_CASE_LEVEL
, UCOL_ON
, &status
);
1869 while (strengthIndex
< strengthlen
) {
1872 uint8_t sortkey
[128];
1875 ucol_setStrength(coll
, strength
[strengthIndex
]);
1876 sklen
= ucol_getSortKey(coll
, codepoints
, length
, sortkey
, 128);
1877 while (sortkey
[count
] != 0) {
1878 if (sortkey
[count
] == 2 || (sortkey
[count
] == 3 && count01
> 0 && strengthIndex
!= 4)) {
1879 printSortKeyError(codepoints
, length
, sortkey
, sklen
);
1882 if (sortkey
[count
] == 1) {
1888 if (count
+ 1 != sklen
|| (count01
!= strengthIndex
+ caselevel
)) {
1889 printSortKeyError(codepoints
, length
, sortkey
, sklen
);
1899 static void TestSortKeyValidity(void)
1901 /* testing UCA collation elements */
1902 UErrorCode status
= U_ZERO_ERROR
;
1903 /* en_US has no tailorings */
1904 UCollator
*coll
= ucol_open("en_US", &status
);
1905 /* tailored locales */
1906 char locale
[][6] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN"};
1907 FileStream
*file
= NULL
;
1909 UChar codepoints
[10];
1911 UChar contextCPs
[5];
1912 UParseError parseError
;
1913 if (U_FAILURE(status
)) {
1914 log_err_status(status
, "en_US collator creation failed -> %s\n", u_errorName(status
));
1917 log_verbose("Testing UCA elements\n");
1918 file
= getFractionalUCA();
1920 log_err("Fractional UCA data can not be opened\n");
1924 while (T_FileStream_readLine(file
, line
, sizeof(line
)) != NULL
) {
1925 if(line
[0] == 0 || line
[0] == '#' || line
[0] == '\n' ||
1926 line
[0] == 0x000D || line
[0] == '[') {
1930 getCodePoints(line
, codepoints
, contextCPs
);
1931 if(codepoints
[0] == 0xFFFE) {
1932 /* Skip special merge-sort character U+FFFE which has otherwise illegal 02 weight bytes. */
1935 checkSortKeyValidity(coll
, codepoints
, u_strlen(codepoints
));
1938 log_verbose("Testing UCA elements for the whole range of unicode characters\n");
1941 while (codepoints
[0] < 0xFFFF) {
1942 if (u_isdefined((UChar32
)codepoints
[0])) {
1943 checkSortKeyValidity(coll
, codepoints
, 1);
1950 /* testing tailored collation elements */
1951 log_verbose("Testing tailored elements\n");
1953 const UChar
*rules
= NULL
,
1955 UChar
*rulesCopy
= NULL
;
1956 int32_t ruleLen
= 0;
1958 uint32_t chOffset
= 0;
1960 uint32_t exOffset
= 0;
1962 uint32_t prefixOffset
= 0;
1963 uint32_t prefixLen
= 0;
1964 UBool startOfRules
= TRUE
;
1967 UColTokenParser src
;
1968 uint32_t strength
= 0;
1971 uprv_memset(&src
, 0, sizeof(UColTokenParser
));
1973 coll
= ucol_open(locale
[count
], &status
);
1974 if (U_FAILURE(status
)) {
1975 log_err("%s collator creation failed\n", locale
[count
]);
1980 rules
= ucol_getRules(coll
, &ruleLen
);
1983 rulesCopy
= (UChar
*)uprv_malloc((ruleLen
+
1984 UCOL_TOK_EXTRA_RULE_SPACE_SIZE
) * sizeof(UChar
));
1985 uprv_memcpy(rulesCopy
, rules
, ruleLen
* sizeof(UChar
));
1986 src
.current
= src
.source
= rulesCopy
;
1987 src
.end
= rulesCopy
+ ruleLen
;
1988 src
.extraCurrent
= src
.end
;
1989 src
.extraEnd
= src
.end
+ UCOL_TOK_EXTRA_RULE_SPACE_SIZE
;
1991 /* Note that as a result of tickets 7015 or 6912, ucol_tok_parseNextToken can cause the pointer to
1992 the rules copy in src.source to get reallocated, freeing the original pointer in rulesCopy */
1993 while ((current
= ucol_tok_parseNextToken(&src
, startOfRules
,&parseError
, &status
)) != NULL
) {
1994 strength
= src
.parsedToken
.strength
;
1995 chOffset
= src
.parsedToken
.charsOffset
;
1996 chLen
= src
.parsedToken
.charsLen
;
1997 exOffset
= src
.parsedToken
.extensionOffset
;
1998 exLen
= src
.parsedToken
.extensionLen
;
1999 prefixOffset
= src
.parsedToken
.prefixOffset
;
2000 prefixLen
= src
.parsedToken
.prefixLen
;
2001 specs
= src
.parsedToken
.flags
;
2003 startOfRules
= FALSE
;
2004 uprv_memcpy(codepoints
, src
.source
+ chOffset
,
2005 chLen
* sizeof(UChar
));
2006 codepoints
[chLen
] = 0;
2007 if(codepoints
[0] == 0xFFFE) {
2008 /* Skip special merge-sort character U+FFFE which has otherwise illegal 02 weight bytes. */
2011 checkSortKeyValidity(coll
, codepoints
, chLen
);
2013 uprv_free(src
.source
);
2019 T_FileStream_close(file
);
2023 * TestSearchCollatorElements tests iterator behavior (forwards and backwards) with
2024 * normalization on AND jamo tailoring, among other things.
/* Text that TestSearchCollatorElements passes to ucol_openElements(). */
2026 static const UChar tsceText
[] = { /* Nothing in here should be ignorable */
2027 0x0020, 0xAC00, /* simple LV Hangul */
2028 0x0020, 0xAC01, /* simple LVT Hangul */
2029 0x0020, 0xAC0F, /* LVTT, last jamo expands for search */
2030 0x0020, 0xAFFF, /* LLVVVTT, every jamo expands for search */
2031 0x0020, 0x1100, 0x1161, 0x11A8, /* 0xAC01 as conjoining jamo */
2032 0x0020, 0x3131, 0x314F, 0x3131, /* 0xAC01 as compatibility jamo */
2033 0x0020, 0x1100, 0x1161, 0x11B6, /* 0xAC0F as conjoining jamo; last expands for search */
2034 0x0020, 0x1101, 0x1170, 0x11B6, /* 0xAFFF as conjoining jamo; all expand for search */
2035 0x0020, 0x00E6, /* small letter ae, expands */
2036 0x0020, 0x1E4D, /* small letter o with tilde and acute, decomposes */
/* Number of code units in tsceText; used as the text length and as the
   offset from which the backwards pass starts. */
2039 enum { kLen_tsceText
= sizeof(tsceText
)/sizeof(tsceText
[0]) };
/* Expected ucol_getOffset() values, one per collation element returned for
   tsceText; the test walks the table forwards for ucol_next() and backwards
   for ucol_previous(). */
2041 static const int32_t rootStandardOffsets
[] = {
2055 enum { kLen_rootStandardOffsets
= sizeof(rootStandardOffsets
)/sizeof(rootStandardOffsets
[0]) };
/* Alternative offset table, referenced by the second
   "root@collation=search" entry of tsceItems. */
2057 static const int32_t rootSearchOffsets
[] = {
2065 20, 21,22,22,23,23,23,24,
2071 enum { kLen_rootSearchOffsets
= sizeof(rootSearchOffsets
)/sizeof(rootSearchOffsets
[0]) };
/* One test case: the locale to open a collator for and the offset table
   to compare against. */
2074 const char * locale
;
2075 const int32_t * offsets
;
/* Test cases for TestSearchCollatorElements; the test loop stops at the
   first entry whose locale is NULL. */
2079 static const TSCEItem tsceItems
[] = {
2080 { "root", rootStandardOffsets
, kLen_rootStandardOffsets
},
2082 /* No jamo tailorings in Apple version of search collator currently */
2083 { "root@collation=search", rootStandardOffsets
, kLen_rootStandardOffsets
},
2085 /* Use this when we do have jamo tailorings */
2086 { "root@collation=search", rootSearchOffsets
, kLen_rootSearchOffsets
},
2091 static void TestSearchCollatorElements(void)
2093 const TSCEItem
* tsceItemPtr
;
2094 for (tsceItemPtr
= tsceItems
; tsceItemPtr
->locale
!= NULL
; tsceItemPtr
++) {
2095 UErrorCode status
= U_ZERO_ERROR
;
2096 UCollator
* ucol
= ucol_open(tsceItemPtr
->locale
, &status
);
2097 if ( U_SUCCESS(status
) ) {
2098 UCollationElements
* uce
= ucol_openElements(ucol
, tsceText
, kLen_tsceText
, &status
);
2099 if ( U_SUCCESS(status
) ) {
2100 int32_t offset
, element
;
2101 const int32_t * nextOffsetPtr
;
2102 const int32_t * limitOffsetPtr
;
2104 nextOffsetPtr
= tsceItemPtr
->offsets
;
2105 limitOffsetPtr
= tsceItemPtr
->offsets
+ tsceItemPtr
->offsetsLen
;
2107 offset
= ucol_getOffset(uce
);
2108 element
= ucol_next(uce
, &status
);
2109 if ( element
== 0 ) {
2110 log_err("error, locale %s, ucol_next returned element 0\n", tsceItemPtr
->locale
);
2112 if ( nextOffsetPtr
< limitOffsetPtr
) {
2113 if (offset
!= *nextOffsetPtr
) {
2114 log_err("error, locale %s, expected ucol_next -> ucol_getOffset %d, got %d\n",
2115 tsceItemPtr
->locale
, *nextOffsetPtr
, offset
);
2116 nextOffsetPtr
= limitOffsetPtr
;
2121 log_err("error, locale %s, ucol_next returned more elements than expected\n", tsceItemPtr
->locale
);
2123 } while ( U_SUCCESS(status
) && element
!= UCOL_NULLORDER
);
2124 if ( nextOffsetPtr
< limitOffsetPtr
) {
2125 log_err("error, locale %s, ucol_next returned fewer elements than expected\n", tsceItemPtr
->locale
);
2128 ucol_setOffset(uce
, kLen_tsceText
, &status
);
2129 status
= U_ZERO_ERROR
;
2130 nextOffsetPtr
= tsceItemPtr
->offsets
+ tsceItemPtr
->offsetsLen
;
2131 limitOffsetPtr
= tsceItemPtr
->offsets
;
2133 offset
= ucol_getOffset(uce
);
2134 element
= ucol_previous(uce
, &status
);
2135 if ( element
== 0 ) {
2136 log_err("error, locale %s, ucol_previous returned element 0\n", tsceItemPtr
->locale
);
2138 if ( nextOffsetPtr
> limitOffsetPtr
) {
2140 if (offset
!= *nextOffsetPtr
) {
2141 log_err("error, locale %s, expected ucol_previous -> ucol_getOffset %d, got %d\n",
2142 tsceItemPtr
->locale
, *nextOffsetPtr
, offset
);
2143 nextOffsetPtr
= limitOffsetPtr
;
2147 log_err("error, locale %s, ucol_previous returned more elements than expected\n", tsceItemPtr
->locale
);
2149 } while ( U_SUCCESS(status
) && element
!= UCOL_NULLORDER
);
2150 if ( nextOffsetPtr
> limitOffsetPtr
) {
2151 log_err("error, locale %s, ucol_previous returned fewer elements than expected\n", tsceItemPtr
->locale
);
2154 ucol_closeElements(uce
);
2156 log_err("error, locale %s, ucol_openElements failed: %s\n", tsceItemPtr
->locale
, u_errorName(status
) );
2160 log_err("error, locale %s, ucol_open failed: %s\n", tsceItemPtr
->locale
, u_errorName(status
) );
2165 #endif /* #if !UCONFIG_NO_COLLATION */