]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/cintltst/citertst.c
ICU-8.11.4.tar.gz
[apple/icu.git] / icuSources / test / cintltst / citertst.c
1 /********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 1997-2004, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6 /********************************************************************************
7 *
8 * File CITERTST.C
9 *
10 * Modification History:
11 * Date Name Description
12 * Madhu Katragadda Ported for C API
13 * 02/19/01 synwee Modified test case for new collation iterator
14 *********************************************************************************/
15 /*
16 * Collation Iterator tests.
17 * (Let me reiterate my position...)
18 */
19
20 #include "unicode/utypes.h"
21
22 #if !UCONFIG_NO_COLLATION
23
24 #include "unicode/ucol.h"
25 #include "unicode/uloc.h"
26 #include "unicode/uchar.h"
27 #include "unicode/ustring.h"
28 #include "unicode/putil.h"
29 #include "callcoll.h"
30 #include "cmemory.h"
31 #include "cintltst.h"
32 #include "citertst.h"
33 #include "ccolltst.h"
34 #include "filestrm.h"
35 #include "cstring.h"
36 #include "ucol_imp.h"
37 #include "ucol_tok.h"
38 #include <stdio.h>
39
40 extern uint8_t ucol_uprv_getCaseBits(const UChar *, uint32_t, UErrorCode *);
41
42 void addCollIterTest(TestNode** root)
43 {
44 addTest(root, &TestPrevious, "tscoll/citertst/TestPrevious");
45 addTest(root, &TestOffset, "tscoll/citertst/TestOffset");
46 addTest(root, &TestSetText, "tscoll/citertst/TestSetText");
47 addTest(root, &TestMaxExpansion, "tscoll/citertst/TestMaxExpansion");
48 addTest(root, &TestUnicodeChar, "tscoll/citertst/TestUnicodeChar");
49 addTest(root, &TestNormalizedUnicodeChar,
50 "tscoll/citertst/TestNormalizedUnicodeChar");
51 addTest(root, &TestNormalization, "tscoll/citertst/TestNormalization");
52 addTest(root, &TestBug672, "tscoll/citertst/TestBug672");
53 addTest(root, &TestBug672Normalize, "tscoll/citertst/TestBug672Normalize");
54 addTest(root, &TestSmallBuffer, "tscoll/citertst/TestSmallBuffer");
55 addTest(root, &TestCEs, "tscoll/citertst/TestCEs");
56 addTest(root, &TestDiscontiguos, "tscoll/citertst/TestDiscontiguos");
57 addTest(root, &TestCEBufferOverflow, "tscoll/citertst/TestCEBufferOverflow");
58 addTest(root, &TestCEValidity, "tscoll/citertst/TestCEValidity");
59 addTest(root, &TestSortKeyValidity, "tscoll/citertst/TestSortKeyValidity");
60 }
61
/* Locale identifiers used by the bug 672 tests, one collator is opened per entry. */
static const char * LOCALES[] = {
    "en_AU",
    "en_BE",
    "en_CA"
};
65
66 static void TestBug672() {
67 UErrorCode status = U_ZERO_ERROR;
68 UChar pattern[20];
69 UChar text[50];
70 int i;
71 int result[3][3];
72
73 u_uastrcpy(pattern, "resume");
74 u_uastrcpy(text, "Time to resume updating my resume.");
75
76 for (i = 0; i < 3; ++ i) {
77 UCollator *coll = ucol_open(LOCALES[i], &status);
78 UCollationElements *pitr = ucol_openElements(coll, pattern, -1,
79 &status);
80 UCollationElements *titer = ucol_openElements(coll, text, -1,
81 &status);
82 if (U_FAILURE(status)) {
83 log_err("ERROR: in creation of either the collator or the collation iterator :%s\n",
84 myErrorName(status));
85 return;
86 }
87
88 log_verbose("locale tested %s\n", LOCALES[i]);
89
90 while (ucol_next(pitr, &status) != UCOL_NULLORDER &&
91 U_SUCCESS(status)) {
92 }
93 if (U_FAILURE(status)) {
94 log_err("ERROR: reversing collation iterator :%s\n",
95 myErrorName(status));
96 return;
97 }
98 ucol_reset(pitr);
99
100 ucol_setOffset(titer, u_strlen(pattern), &status);
101 if (U_FAILURE(status)) {
102 log_err("ERROR: setting offset in collator :%s\n",
103 myErrorName(status));
104 return;
105 }
106 result[i][0] = ucol_getOffset(titer);
107 log_verbose("Text iterator set to offset %d\n", result[i][0]);
108
109 /* Use previous() */
110 ucol_previous(titer, &status);
111 result[i][1] = ucol_getOffset(titer);
112 log_verbose("Current offset %d after previous\n", result[i][1]);
113
114 /* Add one to index */
115 log_verbose("Adding one to current offset...\n");
116 ucol_setOffset(titer, ucol_getOffset(titer) + 1, &status);
117 if (U_FAILURE(status)) {
118 log_err("ERROR: setting offset in collator :%s\n",
119 myErrorName(status));
120 return;
121 }
122 result[i][2] = ucol_getOffset(titer);
123 log_verbose("Current offset in text = %d\n", result[i][2]);
124 ucol_closeElements(pitr);
125 ucol_closeElements(titer);
126 ucol_close(coll);
127 }
128
129 if (uprv_memcmp(result[0], result[1], 3) != 0 ||
130 uprv_memcmp(result[1], result[2], 3) != 0) {
131 log_err("ERROR: Different locales have different offsets at the same character\n");
132 }
133 }
134
135
136
137 /* Running this test with normalization enabled showed up a bug in the incremental
138 normalization code. */
139 static void TestBug672Normalize() {
140 UErrorCode status = U_ZERO_ERROR;
141 UChar pattern[20];
142 UChar text[50];
143 int i;
144 int result[3][3];
145
146 u_uastrcpy(pattern, "resume");
147 u_uastrcpy(text, "Time to resume updating my resume.");
148
149 for (i = 0; i < 3; ++ i) {
150 UCollator *coll = ucol_open(LOCALES[i], &status);
151 UCollationElements *pitr = NULL;
152 UCollationElements *titer = NULL;
153
154 ucol_setAttribute(coll, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
155
156 pitr = ucol_openElements(coll, pattern, -1, &status);
157 titer = ucol_openElements(coll, text, -1, &status);
158 if (U_FAILURE(status)) {
159 log_err("ERROR: in creation of either the collator or the collation iterator :%s\n",
160 myErrorName(status));
161 return;
162 }
163
164 log_verbose("locale tested %s\n", LOCALES[i]);
165
166 while (ucol_next(pitr, &status) != UCOL_NULLORDER &&
167 U_SUCCESS(status)) {
168 }
169 if (U_FAILURE(status)) {
170 log_err("ERROR: reversing collation iterator :%s\n",
171 myErrorName(status));
172 return;
173 }
174 ucol_reset(pitr);
175
176 ucol_setOffset(titer, u_strlen(pattern), &status);
177 if (U_FAILURE(status)) {
178 log_err("ERROR: setting offset in collator :%s\n",
179 myErrorName(status));
180 return;
181 }
182 result[i][0] = ucol_getOffset(titer);
183 log_verbose("Text iterator set to offset %d\n", result[i][0]);
184
185 /* Use previous() */
186 ucol_previous(titer, &status);
187 result[i][1] = ucol_getOffset(titer);
188 log_verbose("Current offset %d after previous\n", result[i][1]);
189
190 /* Add one to index */
191 log_verbose("Adding one to current offset...\n");
192 ucol_setOffset(titer, ucol_getOffset(titer) + 1, &status);
193 if (U_FAILURE(status)) {
194 log_err("ERROR: setting offset in collator :%s\n",
195 myErrorName(status));
196 return;
197 }
198 result[i][2] = ucol_getOffset(titer);
199 log_verbose("Current offset in text = %d\n", result[i][2]);
200 ucol_closeElements(pitr);
201 ucol_closeElements(titer);
202 ucol_close(coll);
203 }
204
205 if (uprv_memcmp(result[0], result[1], 3) != 0 ||
206 uprv_memcmp(result[1], result[2], 3) != 0) {
207 log_err("ERROR: Different locales have different offsets at the same character\n");
208 }
209 }
210
211
212
213
214 /**
215 * Test for CollationElementIterator previous and next for the whole set of
216 * unicode characters.
217 */
218 static void TestUnicodeChar()
219 {
220 UChar source[0x100];
221 UCollator *en_us;
222 UCollationElements *iter;
223 UErrorCode status = U_ZERO_ERROR;
224 UChar codepoint;
225
226 UChar *test;
227 en_us = ucol_open("en_US", &status);
228 if (U_FAILURE(status)){
229 log_err("ERROR: in creation of collation data using ucol_open()\n %s\n",
230 myErrorName(status));
231 return;
232 }
233
234 for (codepoint = 1; codepoint < 0xFFFE;)
235 {
236 test = source;
237
238 while (codepoint % 0xFF != 0)
239 {
240 if (u_isdefined(codepoint))
241 *(test ++) = codepoint;
242 codepoint ++;
243 }
244
245 if (u_isdefined(codepoint))
246 *(test ++) = codepoint;
247
248 if (codepoint != 0xFFFF)
249 codepoint ++;
250
251 *test = 0;
252 iter=ucol_openElements(en_us, source, u_strlen(source), &status);
253 if(U_FAILURE(status)){
254 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
255 myErrorName(status));
256 ucol_close(en_us);
257 return;
258 }
259 /* A basic test to see if it's working at all */
260 log_verbose("codepoint testing %x\n", codepoint);
261 backAndForth(iter);
262 ucol_closeElements(iter);
263
264 /* null termination test */
265 iter=ucol_openElements(en_us, source, -1, &status);
266 if(U_FAILURE(status)){
267 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
268 myErrorName(status));
269 ucol_close(en_us);
270 return;
271 }
272 /* A basic test to see if it's working at all */
273 backAndForth(iter);
274 ucol_closeElements(iter);
275 }
276
277 ucol_close(en_us);
278 }
279
280 /**
281 * Test for CollationElementIterator previous and next for the whole set of
282 * unicode characters with normalization on.
283 */
284 static void TestNormalizedUnicodeChar()
285 {
286 UChar source[0x100];
287 UCollator *th_th;
288 UCollationElements *iter;
289 UErrorCode status = U_ZERO_ERROR;
290 UChar codepoint;
291
292 UChar *test;
293 /* thai should have normalization on */
294 th_th = ucol_open("th_TH", &status);
295 if (U_FAILURE(status)){
296 log_err("ERROR: in creation of thai collation using ucol_open()\n %s\n",
297 myErrorName(status));
298 return;
299 }
300
301 for (codepoint = 1; codepoint < 0xFFFE;)
302 {
303 test = source;
304
305 while (codepoint % 0xFF != 0)
306 {
307 if (u_isdefined(codepoint))
308 *(test ++) = codepoint;
309 codepoint ++;
310 }
311
312 if (u_isdefined(codepoint))
313 *(test ++) = codepoint;
314
315 if (codepoint != 0xFFFF)
316 codepoint ++;
317
318 *test = 0;
319 iter=ucol_openElements(th_th, source, u_strlen(source), &status);
320 if(U_FAILURE(status)){
321 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
322 myErrorName(status));
323 ucol_close(th_th);
324 return;
325 }
326
327 backAndForth(iter);
328 ucol_closeElements(iter);
329
330 iter=ucol_openElements(th_th, source, -1, &status);
331 if(U_FAILURE(status)){
332 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
333 myErrorName(status));
334 ucol_close(th_th);
335 return;
336 }
337
338 backAndForth(iter);
339 ucol_closeElements(iter);
340 }
341
342 ucol_close(th_th);
343 }
344
345 /**
346 * Test the incremental normalization
347 */
348 static void TestNormalization()
349 {
350 UErrorCode status = U_ZERO_ERROR;
351 const char *str =
352 "&a < \\u0300\\u0315 < A\\u0300\\u0315 < \\u0316\\u0315B < \\u0316\\u0300\\u0315";
353 UCollator *coll;
354 UChar rule[50];
355 int rulelen = u_unescape(str, rule, 50);
356 int count = 0;
357 const char *testdata[] =
358 {"\\u1ED9", "o\\u0323\\u0302",
359 "\\u0300\\u0315", "\\u0315\\u0300",
360 "A\\u0300\\u0315B", "A\\u0315\\u0300B",
361 "A\\u0316\\u0315B", "A\\u0315\\u0316B",
362 "\\u0316\\u0300\\u0315", "\\u0315\\u0300\\u0316",
363 "A\\u0316\\u0300\\u0315B", "A\\u0315\\u0300\\u0316B",
364 "\\u0316\\u0315\\u0300", "A\\u0316\\u0315\\u0300B"};
365 int32_t srclen;
366 UChar source[10];
367 UCollationElements *iter;
368
369 coll = ucol_openRules(rule, rulelen, UCOL_ON, UCOL_TERTIARY, NULL, &status);
370 ucol_setAttribute(coll, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
371 if (U_FAILURE(status)){
372 log_err("ERROR: in creation of collator using ucol_openRules()\n %s\n",
373 myErrorName(status));
374 return;
375 }
376
377 srclen = u_unescape(testdata[0], source, 10);
378 iter = ucol_openElements(coll, source, srclen, &status);
379 backAndForth(iter);
380 ucol_closeElements(iter);
381
382 srclen = u_unescape(testdata[1], source, 10);
383 iter = ucol_openElements(coll, source, srclen, &status);
384 backAndForth(iter);
385 ucol_closeElements(iter);
386
387 while (count < 12) {
388 srclen = u_unescape(testdata[count], source, 10);
389 iter = ucol_openElements(coll, source, srclen, &status);
390
391 if (U_FAILURE(status)){
392 log_err("ERROR: in creation of collator element iterator\n %s\n",
393 myErrorName(status));
394 return;
395 }
396 backAndForth(iter);
397 ucol_closeElements(iter);
398
399 iter = ucol_openElements(coll, source, -1, &status);
400
401 if (U_FAILURE(status)){
402 log_err("ERROR: in creation of collator element iterator\n %s\n",
403 myErrorName(status));
404 return;
405 }
406 backAndForth(iter);
407 ucol_closeElements(iter);
408 count ++;
409 }
410 ucol_close(coll);
411 }
412
413 /**
414 * Test for CollationElementIterator.previous()
415 *
416 * @bug 4108758 - Make sure it works with contracting characters
417 *
418 */
419 static void TestPrevious()
420 {
421 UCollator *coll=NULL;
422 UChar rule[50];
423 UChar *source;
424 UCollator *c1, *c2, *c3;
425 UCollationElements *iter;
426 UErrorCode status = U_ZERO_ERROR;
427
428 test1=(UChar*)malloc(sizeof(UChar) * 50);
429 test2=(UChar*)malloc(sizeof(UChar) * 50);
430 u_uastrcpy(test1, "What subset of all possible test cases?");
431 u_uastrcpy(test2, "has the highest probability of detecting");
432 coll = ucol_open("en_US", &status);
433
434 iter=ucol_openElements(coll, test1, u_strlen(test1), &status);
435 log_verbose("English locale testing back and forth\n");
436 if(U_FAILURE(status)){
437 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
438 myErrorName(status));
439 ucol_close(coll);
440 return;
441 }
442 /* A basic test to see if it's working at all */
443 backAndForth(iter);
444 ucol_closeElements(iter);
445 ucol_close(coll);
446
447 /* Test with a contracting character sequence */
448 u_uastrcpy(rule, "&a,A < b,B < c,C, d,D < z,Z < ch,cH,Ch,CH");
449 c1 = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL, &status);
450
451 log_verbose("Contraction rule testing back and forth with no normalization\n");
452
453 if (c1 == NULL || U_FAILURE(status))
454 {
455 log_err("Couldn't create a RuleBasedCollator with a contracting sequence\n %s\n",
456 myErrorName(status));
457 return;
458 }
459 source=(UChar*)malloc(sizeof(UChar) * 20);
460 u_uastrcpy(source, "abchdcba");
461 iter=ucol_openElements(c1, source, u_strlen(source), &status);
462 if(U_FAILURE(status)){
463 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
464 myErrorName(status));
465 return;
466 }
467 backAndForth(iter);
468 ucol_closeElements(iter);
469 ucol_close(c1);
470
471 /* Test with an expanding character sequence */
472 u_uastrcpy(rule, "&a < b < c/abd < d");
473 c2 = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL, &status);
474 log_verbose("Expansion rule testing back and forth with no normalization\n");
475 if (c2 == NULL || U_FAILURE(status))
476 {
477 log_err("Couldn't create a RuleBasedCollator with a contracting sequence.\n %s\n",
478 myErrorName(status));
479 return;
480 }
481 u_uastrcpy(source, "abcd");
482 iter=ucol_openElements(c2, source, u_strlen(source), &status);
483 if(U_FAILURE(status)){
484 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
485 myErrorName(status));
486 return;
487 }
488 backAndForth(iter);
489 ucol_closeElements(iter);
490 ucol_close(c2);
491 /* Now try both */
492 u_uastrcpy(rule, "&a < b < c/aba < d < z < ch");
493 c3 = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,NULL, &status);
494 log_verbose("Expansion/contraction rule testing back and forth with no normalization\n");
495
496 if (c3 == NULL || U_FAILURE(status))
497 {
498 log_err("Couldn't create a RuleBasedCollator with a contracting sequence.\n %s\n",
499 myErrorName(status));
500 return;
501 }
502 u_uastrcpy(source, "abcdbchdc");
503 iter=ucol_openElements(c3, source, u_strlen(source), &status);
504 if(U_FAILURE(status)){
505 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
506 myErrorName(status));
507 return;
508 }
509 backAndForth(iter);
510 ucol_closeElements(iter);
511 ucol_close(c3);
512 source[0] = 0x0e41;
513 source[1] = 0x0e02;
514 source[2] = 0x0e41;
515 source[3] = 0x0e02;
516 source[4] = 0x0e27;
517 source[5] = 0x61;
518 source[6] = 0x62;
519 source[7] = 0x63;
520 source[8] = 0;
521
522 coll = ucol_open("th_TH", &status);
523 log_verbose("Thai locale testing back and forth with normalization\n");
524 iter=ucol_openElements(coll, source, u_strlen(source), &status);
525 if(U_FAILURE(status)){
526 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
527 myErrorName(status));
528 return;
529 }
530 backAndForth(iter);
531 ucol_closeElements(iter);
532 ucol_close(coll);
533
534 /* prev test */
535 source[0] = 0x0061;
536 source[1] = 0x30CF;
537 source[2] = 0x3099;
538 source[3] = 0x30FC;
539 source[4] = 0;
540
541 coll = ucol_open("ja_JP", &status);
542 log_verbose("Japanese locale testing back and forth with normalization\n");
543 iter=ucol_openElements(coll, source, u_strlen(source), &status);
544 if(U_FAILURE(status)){
545 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
546 myErrorName(status));
547 return;
548 }
549 backAndForth(iter);
550 ucol_closeElements(iter);
551 ucol_close(coll);
552
553 free(source);
554 free(test1);
555 free(test2);
556 }
557
558 /**
559 * Test for getOffset() and setOffset()
560 */
561 static void TestOffset()
562 {
563 UErrorCode status= U_ZERO_ERROR;
564 UCollator *en_us=NULL;
565 UCollationElements *iter, *pristine;
566 int32_t offset;
567 int32_t *orders;
568 int32_t orderLength=0;
569 int count = 0;
570 test1=(UChar*)malloc(sizeof(UChar) * 50);
571 test2=(UChar*)malloc(sizeof(UChar) * 50);
572 u_uastrcpy(test1, "What subset of all possible test cases?");
573 u_uastrcpy(test2, "has the highest probability of detecting");
574 en_us = ucol_open("en_US", &status);
575 log_verbose("Testing getOffset and setOffset for collations\n");
576 iter = ucol_openElements(en_us, test1, u_strlen(test1), &status);
577 if(U_FAILURE(status)){
578 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
579 myErrorName(status));
580 ucol_close(en_us);
581 return;
582 }
583
584 /* testing boundaries */
585 ucol_setOffset(iter, 0, &status);
586 if (U_FAILURE(status) || ucol_previous(iter, &status) != UCOL_NULLORDER) {
587 log_err("Error: After setting offset to 0, we should be at the end "
588 "of the backwards iteration");
589 }
590 ucol_setOffset(iter, u_strlen(test1), &status);
591 if (U_FAILURE(status) || ucol_next(iter, &status) != UCOL_NULLORDER) {
592 log_err("Error: After setting offset to end of the string, we should "
593 "be at the end of the backwards iteration");
594 }
595
596 /* Run all the way through the iterator, then get the offset */
597
598 orders = getOrders(iter, &orderLength);
599
600 offset = ucol_getOffset(iter);
601
602 if (offset != u_strlen(test1))
603 {
604 log_err("offset at end != length %d vs %d\n", offset,
605 u_strlen(test1) );
606 }
607
608 /* Now set the offset back to the beginning and see if it works */
609 pristine=ucol_openElements(en_us, test1, u_strlen(test1), &status);
610 if(U_FAILURE(status)){
611 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
612 myErrorName(status));
613 ucol_close(en_us);
614 return;
615 }
616 status = U_ZERO_ERROR;
617
618 ucol_setOffset(iter, 0, &status);
619 if (U_FAILURE(status))
620 {
621 log_err("setOffset failed. %s\n", myErrorName(status));
622 }
623 else
624 {
625 assertEqual(iter, pristine);
626 }
627
628 ucol_closeElements(pristine);
629 ucol_closeElements(iter);
630 free(orders);
631
632 /* testing offsets in normalization buffer */
633 test1[0] = 0x61;
634 test1[1] = 0x300;
635 test1[2] = 0x316;
636 test1[3] = 0x62;
637 test1[4] = 0;
638 ucol_setAttribute(en_us, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
639 iter = ucol_openElements(en_us, test1, 4, &status);
640 if(U_FAILURE(status)){
641 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
642 myErrorName(status));
643 ucol_close(en_us);
644 return;
645 }
646
647 count = 0;
648 while (ucol_next(iter, &status) != UCOL_NULLORDER &&
649 U_SUCCESS(status)) {
650 switch (count) {
651 case 0:
652 if (ucol_getOffset(iter) != 1) {
653 log_err("ERROR: Offset of iteration should be 0\n");
654 }
655 break;
656 case 3:
657 if (ucol_getOffset(iter) != 4) {
658 log_err("ERROR: Offset of iteration should be 4\n");
659 }
660 break;
661 default:
662 if (ucol_getOffset(iter) != 3) {
663 log_err("ERROR: Offset of iteration should be 3\n");
664 }
665 }
666 count ++;
667 }
668
669 ucol_reset(iter);
670 count = 0;
671 while (ucol_previous(iter, &status) != UCOL_NULLORDER &&
672 U_SUCCESS(status)) {
673 switch (count) {
674 case 0:
675 if (ucol_getOffset(iter) != 3) {
676 log_err("ERROR: Offset of iteration should be 3\n");
677 }
678 break;
679 default:
680 if (ucol_getOffset(iter) != 0) {
681 log_err("ERROR: Offset of iteration should be 0\n");
682 }
683 }
684 count ++;
685 }
686
687 if(U_FAILURE(status)){
688 log_err("ERROR: in iterating collation elements %s\n",
689 myErrorName(status));
690 }
691
692 ucol_closeElements(iter);
693 ucol_close(en_us);
694 free(test1);
695 free(test2);
696 }
697
698 /**
699 * Test for setText()
700 */
701 static void TestSetText()
702 {
703 int32_t c,i;
704 UErrorCode status = U_ZERO_ERROR;
705 UCollator *en_us=NULL;
706 UCollationElements *iter1, *iter2;
707 test1=(UChar*)malloc(sizeof(UChar) * 50);
708 test2=(UChar*)malloc(sizeof(UChar) * 50);
709 u_uastrcpy(test1, "What subset of all possible test cases?");
710 u_uastrcpy(test2, "has the highest probability of detecting");
711 en_us = ucol_open("en_US", &status);
712 log_verbose("testing setText for Collation elements\n");
713 iter1=ucol_openElements(en_us, test1, u_strlen(test1), &status);
714 if(U_FAILURE(status)){
715 log_err("ERROR: in creation of collation element iterator1 using ucol_openElements()\n %s\n",
716 myErrorName(status));
717 ucol_close(en_us);
718 return;
719 }
720 iter2=ucol_openElements(en_us, test2, u_strlen(test2), &status);
721 if(U_FAILURE(status)){
722 log_err("ERROR: in creation of collation element iterator2 using ucol_openElements()\n %s\n",
723 myErrorName(status));
724 ucol_close(en_us);
725 return;
726 }
727
728 /* Run through the second iterator just to exercise it */
729 c = ucol_next(iter2, &status);
730 i = 0;
731
732 while ( ++i < 10 && (c != UCOL_NULLORDER))
733 {
734 if (U_FAILURE(status))
735 {
736 log_err("iter2->next() returned an error. %s\n", myErrorName(status));
737 ucol_closeElements(iter2);
738 ucol_closeElements(iter1);
739 ucol_close(en_us);
740 return;
741 }
742
743 c = ucol_next(iter2, &status);
744 }
745
746 /* Now set it to point to the same string as the first iterator */
747 ucol_setText(iter2, test1, u_strlen(test1), &status);
748 if (U_FAILURE(status))
749 {
750 log_err("call to iter2->setText(test1) failed. %s\n", myErrorName(status));
751 }
752 else
753 {
754 assertEqual(iter1, iter2);
755 }
756
757 /* Now set it to point to a null string with fake length*/
758 ucol_setText(iter2, NULL, 2, &status);
759 if (U_FAILURE(status))
760 {
761 log_err("call to iter2->setText(null) failed. %s\n", myErrorName(status));
762 }
763 else
764 {
765 if (ucol_next(iter2, &status) != UCOL_NULLORDER) {
766 log_err("iter2 with null text expected to return UCOL_NULLORDER\n");
767 }
768 }
769
770 ucol_closeElements(iter2);
771 ucol_closeElements(iter1);
772 ucol_close(en_us);
773 free(test1);
774 free(test2);
775 }
776
777 /** @bug 4108762
778 * Test for getMaxExpansion()
779 */
780 static void TestMaxExpansion()
781 {
782 UErrorCode status = U_ZERO_ERROR;
783 UCollator *coll ;/*= ucol_open("en_US", &status);*/
784 UChar ch = 0;
785 UChar32 unassigned = 0xEFFFD;
786 UChar supplementary[2];
787 uint32_t index = 0;
788 UBool isError = FALSE;
789 uint32_t sorder = 0;
790 UCollationElements *iter ;/*= ucol_openElements(coll, &ch, 1, &status);*/
791 uint32_t temporder = 0;
792
793 UChar rule[256];
794 u_uastrcpy(rule, "&a < ab < c/aba < d < z < ch");
795 coll = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT,
796 UCOL_DEFAULT_STRENGTH,NULL, &status);
797 if(U_SUCCESS(status) && coll) {
798 iter = ucol_openElements(coll, &ch, 1, &status);
799
800 while (ch < 0xFFFF && U_SUCCESS(status)) {
801 int count = 1;
802 uint32_t order;
803 int32_t size = 0;
804
805 ch ++;
806
807 ucol_setText(iter, &ch, 1, &status);
808 order = ucol_previous(iter, &status);
809
810 /* thai management */
811 if (order == 0)
812 order = ucol_previous(iter, &status);
813
814 while (U_SUCCESS(status) &&
815 ucol_previous(iter, &status) != UCOL_NULLORDER) {
816 count ++;
817 }
818
819 size = ucol_getMaxExpansion(iter, order);
820 if (U_FAILURE(status) || size < count) {
821 log_err("Failure at codepoint %d, maximum expansion count < %d\n",
822 ch, count);
823 }
824 }
825
826 /* testing for exact max expansion */
827 ch = 0;
828 while (ch < 0x61) {
829 uint32_t order;
830 int32_t size;
831 ucol_setText(iter, &ch, 1, &status);
832 order = ucol_previous(iter, &status);
833 size = ucol_getMaxExpansion(iter, order);
834 if (U_FAILURE(status) || size != 1) {
835 log_err("Failure at codepoint %d, maximum expansion count < %d\n",
836 ch, 1);
837 }
838 ch ++;
839 }
840
841 ch = 0x63;
842 ucol_setText(iter, &ch, 1, &status);
843 temporder = ucol_previous(iter, &status);
844
845 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) != 3) {
846 log_err("Failure at codepoint %d, maximum expansion count != %d\n",
847 ch, 3);
848 }
849
850 ch = 0x64;
851 ucol_setText(iter, &ch, 1, &status);
852 temporder = ucol_previous(iter, &status);
853
854 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) != 1) {
855 log_err("Failure at codepoint %d, maximum expansion count != %d\n",
856 ch, 3);
857 }
858
859 U16_APPEND(supplementary, index, 2, unassigned, isError);
860 ucol_setText(iter, supplementary, 2, &status);
861 sorder = ucol_previous(iter, &status);
862
863 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, sorder) != 2) {
864 log_err("Failure at codepoint %d, maximum expansion count < %d\n",
865 ch, 2);
866 }
867
868 /* testing jamo */
869 ch = 0x1165;
870
871 ucol_setText(iter, &ch, 1, &status);
872 temporder = ucol_previous(iter, &status);
873 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) > 3) {
874 log_err("Failure at codepoint %d, maximum expansion count > %d\n",
875 ch, 3);
876 }
877
878 ucol_closeElements(iter);
879 ucol_close(coll);
880
881 /* testing special jamo &a<\u1160 */
882 rule[0] = 0x26;
883 rule[1] = 0x71;
884 rule[2] = 0x3c;
885 rule[3] = 0x1165;
886 rule[4] = 0x2f;
887 rule[5] = 0x71;
888 rule[6] = 0x71;
889 rule[7] = 0x71;
890 rule[8] = 0x71;
891 rule[9] = 0;
892
893 coll = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT,
894 UCOL_DEFAULT_STRENGTH,NULL, &status);
895 iter = ucol_openElements(coll, &ch, 1, &status);
896
897 temporder = ucol_previous(iter, &status);
898 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) != 6) {
899 log_err("Failure at codepoint %d, maximum expansion count > %d\n",
900 ch, 5);
901 }
902
903 ucol_closeElements(iter);
904 ucol_close(coll);
905 } else {
906 log_data_err("Couldn't open collator\n");
907 }
908
909 }
910
911
912 static void assertEqual(UCollationElements *i1, UCollationElements *i2)
913 {
914 int32_t c1, c2;
915 int32_t count = 0;
916 UErrorCode status = U_ZERO_ERROR;
917
918 do
919 {
920 c1 = ucol_next(i1, &status);
921 c2 = ucol_next(i2, &status);
922
923 if (c1 != c2)
924 {
925 log_err("Error in iteration %d assetEqual between\n %d and %d, they are not equal\n", count, c1, c2);
926 break;
927 }
928
929 count += 1;
930 }
931 while (c1 != UCOL_NULLORDER);
932 }
933
934 /**
935 * Testing iterators with extremely small buffers
936 */
937 static void TestSmallBuffer()
938 {
939 UErrorCode status = U_ZERO_ERROR;
940 UCollator *coll;
941 UCollationElements *testiter,
942 *iter;
943 int32_t count = 0;
944 int32_t *testorders,
945 *orders;
946
947 UChar teststr[500];
948 UChar str[] = {0x300, 0x31A, 0};
949 /*
950 creating a long string of decomposable characters,
951 since by default the writable buffer is of size 256
952 */
953 while (count < 500) {
954 if ((count & 1) == 0) {
955 teststr[count ++] = 0x300;
956 }
957 else {
958 teststr[count ++] = 0x31A;
959 }
960 }
961
962 coll = ucol_open("th_TH", &status);
963 if(U_SUCCESS(status) && coll) {
964 testiter = ucol_openElements(coll, teststr, 500, &status);
965 iter = ucol_openElements(coll, str, 2, &status);
966
967 orders = getOrders(iter, &count);
968 if (count != 2) {
969 log_err("Error collation elements size is not 2 for \\u0300\\u031A\n");
970 }
971
972 /*
973 this will rearrange the string data to 250 characters of 0x300 first then
974 250 characters of 0x031A
975 */
976 testorders = getOrders(testiter, &count);
977
978 if (count != 500) {
979 log_err("Error decomposition does not give the right sized collation elements\n");
980 }
981
982 while (count != 0) {
983 /* UCA collation element for 0x0F76 */
984 if ((count > 250 && testorders[-- count] != orders[1]) ||
985 (count <= 250 && testorders[-- count] != orders[0])) {
986 log_err("Error decomposition does not give the right collation element at %d count\n", count);
987 break;
988 }
989 }
990
991 free(testorders);
992 free(orders);
993
994 ucol_reset(testiter);
995 /* ensures that the writable buffer was cleared */
996 if (testiter->iteratordata_.writableBuffer !=
997 testiter->iteratordata_.stackWritableBuffer) {
998 log_err("Error Writable buffer in collation element iterator not reset\n");
999 }
1000
1001 /* ensures closing of elements done properly to clear writable buffer */
1002 ucol_next(testiter, &status);
1003 ucol_next(testiter, &status);
1004 ucol_closeElements(testiter);
1005 ucol_closeElements(iter);
1006 ucol_close(coll);
1007 } else {
1008 log_data_err("Couldn't open collator\n");
1009 }
1010 }
1011
/**
 * Snippet of code from genuca.
 *
 * Converts one hexadecimal digit character to its numeric value.
 *
 * @param hex character to convert; '0'-'9', 'a'-'f' and 'A'-'F' are digits
 * @return the digit's value (0 to 15), or 0 for any other character
 */
static int32_t hex2num(char hex) {
    if ('0' <= hex && hex <= '9') {
        return hex - '0';
    }
    if ('a' <= hex && hex <= 'f') {
        return 10 + (hex - 'a');
    }
    if ('A' <= hex && hex <= 'F') {
        return 10 + (hex - 'A');
    }
    /* not a hexadecimal digit */
    return 0;
}
1026
1027 /**
1028 * Getting codepoints from a string
1029 * @param str character string contain codepoints seperated by space and ended
1030 * by a semicolon
1031 * @param codepoints array for storage, assuming size > 5
1032 * @return position at the end of the codepoint section
1033 */
1034 static char * getCodePoints(char *str, UChar *codepoints) {
1035 char *pStartCP = str;
1036 char *pEndCP = str + 4;
1037
1038 *codepoints = (UChar)((hex2num(*pStartCP) << 12) |
1039 (hex2num(*(pStartCP + 1)) << 8) |
1040 (hex2num(*(pStartCP + 2)) << 4) |
1041 (hex2num(*(pStartCP + 3))));
1042 codepoints ++;
1043 while (*pEndCP != ';') {
1044 pStartCP = pEndCP + 1;
1045 *codepoints = (UChar)((hex2num(*pStartCP) << 12) |
1046 (hex2num(*(pStartCP + 1)) << 8) |
1047 (hex2num(*(pStartCP + 2)) << 4) |
1048 (hex2num(*(pStartCP + 3))));
1049 codepoints ++;
1050 pEndCP = pStartCP + 4;
1051 }
1052 *codepoints = 0;
1053 return pEndCP + 1;
1054 }
1055
1056 /**
1057 * Sniplets of code from genuca
1058 */
1059 static int32_t
1060 readElement(char **from, char *to, char separator, UErrorCode *status)
1061 {
1062 if (U_SUCCESS(*status)) {
1063 char buffer[1024];
1064 int32_t i = 0;
1065 while (**from != separator) {
1066 if (**from != ' ') {
1067 *(buffer+i++) = **from;
1068 }
1069 (*from)++;
1070 }
1071 (*from)++;
1072 *(buffer + i) = 0;
1073 strcpy(to, buffer);
1074 return i/2;
1075 }
1076
1077 return 0;
1078 }
1079
1080 /**
1081 * Sniplets of code from genuca
1082 */
1083 static uint32_t
1084 getSingleCEValue(char *primary, char *secondary, char *tertiary,
1085 UErrorCode *status)
1086 {
1087 if (U_SUCCESS(*status)) {
1088 uint32_t value = 0;
1089 char primsave = '\0';
1090 char secsave = '\0';
1091 char tersave = '\0';
1092 char *primend = primary+4;
1093 char *secend = secondary+2;
1094 char *terend = tertiary+2;
1095 uint32_t primvalue;
1096 uint32_t secvalue;
1097 uint32_t tervalue;
1098
1099 if (uprv_strlen(primary) > 4) {
1100 primsave = *primend;
1101 *primend = '\0';
1102 }
1103
1104 if (uprv_strlen(secondary) > 2) {
1105 secsave = *secend;
1106 *secend = '\0';
1107 }
1108
1109 if (uprv_strlen(tertiary) > 2) {
1110 tersave = *terend;
1111 *terend = '\0';
1112 }
1113
1114 primvalue = (*primary!='\0')?uprv_strtoul(primary, &primend, 16):0;
1115 secvalue = (*secondary!='\0')?uprv_strtoul(secondary, &secend, 16):0;
1116 tervalue = (*tertiary!='\0')?uprv_strtoul(tertiary, &terend, 16):0;
1117 if(primvalue <= 0xFF) {
1118 primvalue <<= 8;
1119 }
1120
1121 value = ((primvalue << UCOL_PRIMARYORDERSHIFT) & UCOL_PRIMARYORDERMASK)
1122 | ((secvalue << UCOL_SECONDARYORDERSHIFT) & UCOL_SECONDARYORDERMASK)
1123 | (tervalue & UCOL_TERTIARYORDERMASK);
1124
1125 if(primsave!='\0') {
1126 *primend = primsave;
1127 }
1128 if(secsave!='\0') {
1129 *secend = secsave;
1130 }
1131 if(tersave!='\0') {
1132 *terend = tersave;
1133 }
1134 return value;
1135 }
1136 return 0;
1137 }
1138
1139 /**
1140 * Getting collation elements generated from a string
1141 * @param str character string contain collation elements contained in [] and
1142 * seperated by space
1143 * @param ce array for storage, assuming size > 20
1144 * @param status error status
1145 * @return position at the end of the codepoint section
1146 */
1147 static char * getCEs(char *str, uint32_t *ces, UErrorCode *status) {
1148 char *pStartCP = uprv_strchr(str, '[');
1149 int count = 0;
1150 char *pEndCP;
1151 char primary[100];
1152 char secondary[100];
1153 char tertiary[100];
1154
1155 while (*pStartCP == '[') {
1156 uint32_t primarycount = 0;
1157 uint32_t secondarycount = 0;
1158 uint32_t tertiarycount = 0;
1159 uint32_t CEi = 1;
1160 pEndCP = strchr(pStartCP, ']');
1161 if(pEndCP == NULL) {
1162 break;
1163 }
1164 pStartCP ++;
1165
1166 primarycount = readElement(&pStartCP, primary, ',', status);
1167 secondarycount = readElement(&pStartCP, secondary, ',', status);
1168 tertiarycount = readElement(&pStartCP, tertiary, ']', status);
1169
1170 /* I want to get the CEs entered right here, including continuation */
1171 ces[count ++] = getSingleCEValue(primary, secondary, tertiary, status);
1172 if (U_FAILURE(*status)) {
1173 break;
1174 }
1175
1176 while (2 * CEi < primarycount || CEi < secondarycount ||
1177 CEi < tertiarycount) {
1178 uint32_t value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
1179 if (2 * CEi < primarycount) {
1180 value |= ((hex2num(*(primary + 4 * CEi)) & 0xF) << 28);
1181 value |= ((hex2num(*(primary + 4 * CEi + 1)) & 0xF) << 24);
1182 }
1183
1184 if (2 * CEi + 1 < primarycount) {
1185 value |= ((hex2num(*(primary + 4 * CEi + 2)) & 0xF) << 20);
1186 value |= ((hex2num(*(primary + 4 * CEi + 3)) &0xF) << 16);
1187 }
1188
1189 if (CEi < secondarycount) {
1190 value |= ((hex2num(*(secondary + 2 * CEi)) & 0xF) << 12);
1191 value |= ((hex2num(*(secondary + 2 * CEi + 1)) & 0xF) << 8);
1192 }
1193
1194 if (CEi < tertiarycount) {
1195 value |= ((hex2num(*(tertiary + 2 * CEi)) & 0x3) << 4);
1196 value |= (hex2num(*(tertiary + 2 * CEi + 1)) & 0xF);
1197 }
1198
1199 CEi ++;
1200 ces[count ++] = value;
1201 }
1202
1203 pStartCP = pEndCP + 1;
1204 }
1205 ces[count] = 0;
1206 return pStartCP;
1207 }
1208
1209 /**
1210 * Getting the FractionalUCA.txt file stream
1211 */
1212 static FileStream * getFractionalUCA(void)
1213 {
1214 char newPath[256];
1215 char backupPath[256];
1216 FileStream *result = NULL;
1217
1218 /* Look inside ICU_DATA first */
1219 uprv_strcpy(newPath, ctest_dataSrcDir());
1220 uprv_strcat(newPath, "unidata" U_FILE_SEP_STRING );
1221 uprv_strcat(newPath, "FractionalUCA.txt");
1222
1223 /* As a fallback, try to guess where the source data was located
1224 * at the time ICU was built, and look there.
1225 */
1226 #if defined (U_TOPSRCDIR)
1227 strcpy(backupPath, U_TOPSRCDIR U_FILE_SEP_STRING "data");
1228 #else
1229 {
1230 UErrorCode errorCode = U_ZERO_ERROR;
1231 strcpy(backupPath, loadTestData(&errorCode));
1232 strcat(backupPath, U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING "data");
1233 }
1234 #endif
1235 strcat(backupPath, U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING "FractionalUCA.txt");
1236
1237 result = T_FileStream_open(newPath, "rb");
1238
1239 if (result == NULL) {
1240 result = T_FileStream_open(backupPath, "rb");
1241 if (result == NULL) {
1242 log_err("Failed to open either %s or %s\n", newPath, backupPath);
1243 }
1244 }
1245 return result;
1246 }
1247
1248 /**
1249 * Testing the CEs returned by the iterator
1250 */
1251 static void TestCEs() {
1252 FileStream *file = NULL;
1253 char line[1024];
1254 char *str;
1255 UChar codepoints[5];
1256 uint32_t ces[20];
1257 UErrorCode status = U_ZERO_ERROR;
1258 UCollator *coll = ucol_open("", &status);
1259 uint32_t lineNo = 0;
1260
1261 if (U_FAILURE(status)) {
1262 log_err("Error in opening root collator\n");
1263 return;
1264 }
1265
1266 file = getFractionalUCA();
1267
1268 if (file == NULL) {
1269 log_err("*** unable to open input FractionalUCA.txt file ***\n");
1270 return;
1271 }
1272
1273
1274 while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) {
1275 int count = 0;
1276 UCollationElements *iter;
1277 lineNo++;
1278 /* skip this line if it is empty or a comment or is a return value
1279 or start of some variable section */
1280 if(line[0] == 0 || line[0] == '#' || line[0] == '\n' ||
1281 line[0] == 0x000D || line[0] == '[') {
1282 continue;
1283 }
1284
1285 str = getCodePoints(line, codepoints);
1286
1287 /* these are 'fake' codepoints in the fractional UCA, and are used just
1288 * for positioning of indirect values. They should not go through this
1289 * test.
1290 */
1291 if(*codepoints == 0xFDD0) {
1292 continue;
1293 }
1294
1295 getCEs(str, ces, &status);
1296 if (U_FAILURE(status)) {
1297 log_err("Error in parsing collation elements in FractionalUCA.txt\n");
1298 break;
1299 }
1300 iter = ucol_openElements(coll, codepoints, -1, &status);
1301 if (U_FAILURE(status)) {
1302 log_err("Error in opening collation elements\n");
1303 break;
1304 }
1305 for (;;) {
1306 uint32_t ce = (uint32_t)ucol_next(iter, &status);
1307 if (ce == 0xFFFFFFFF) {
1308 ce = 0;
1309 }
1310 /* we now unconditionally reorder Thai/Lao prevowels, so this
1311 * test would fail if we don't skip here.
1312 */
1313 if(UCOL_ISTHAIPREVOWEL(*codepoints) && ce == 0 && count == 0) {
1314 continue;
1315 }
1316 if (ce != ces[count] || U_FAILURE(status)) {
1317 log_err("Collation elements in FractionalUCA.txt and iterators do not match!\n");
1318 break;
1319 }
1320 if (ces[count] == 0) {
1321 break;
1322 }
1323 count ++;
1324 }
1325 ucol_closeElements(iter);
1326 }
1327
1328 T_FileStream_close(file);
1329 ucol_close(coll);
1330 }
1331
1332 /**
1333 * Testing the discontigous contractions
1334 */
1335 static void TestDiscontiguos() {
1336 const char *rulestr =
1337 "&z < AB < X\\u0300 < ABC < X\\u0300\\u0315";
1338 UChar rule[50];
1339 int rulelen = u_unescape(rulestr, rule, 50);
1340 const char *src[] = {
1341 "ADB", "ADBC", "A\\u0315B", "A\\u0315BC",
1342 /* base character blocked */
1343 "XD\\u0300", "XD\\u0300\\u0315",
1344 /* non blocking combining character */
1345 "X\\u0319\\u0300", "X\\u0319\\u0300\\u0315",
1346 /* blocking combining character */
1347 "X\\u0314\\u0300", "X\\u0314\\u0300\\u0315",
1348 /* contraction prefix */
1349 "ABDC", "AB\\u0315C","X\\u0300D\\u0315", "X\\u0300\\u0319\\u0315",
1350 "X\\u0300\\u031A\\u0315",
1351 /* ends not with a contraction character */
1352 "X\\u0319\\u0300D", "X\\u0319\\u0300\\u0315D", "X\\u0300D\\u0315D",
1353 "X\\u0300\\u0319\\u0315D", "X\\u0300\\u031A\\u0315D"
1354 };
1355 const char *tgt[] = {
1356 /* non blocking combining character */
1357 "A D B", "A D BC", "A \\u0315 B", "A \\u0315 BC",
1358 /* base character blocked */
1359 "X D \\u0300", "X D \\u0300\\u0315",
1360 /* non blocking combining character */
1361 "X\\u0300 \\u0319", "X\\u0300\\u0315 \\u0319",
1362 /* blocking combining character */
1363 "X \\u0314 \\u0300", "X \\u0314 \\u0300\\u0315",
1364 /* contraction prefix */
1365 "AB DC", "AB \\u0315 C","X\\u0300 D \\u0315", "X\\u0300\\u0315 \\u0319",
1366 "X\\u0300 \\u031A \\u0315",
1367 /* ends not with a contraction character */
1368 "X\\u0300 \\u0319D", "X\\u0300\\u0315 \\u0319D", "X\\u0300 D\\u0315D",
1369 "X\\u0300\\u0315 \\u0319D", "X\\u0300 \\u031A\\u0315D"
1370 };
1371 int size = 20;
1372 UCollator *coll;
1373 UErrorCode status = U_ZERO_ERROR;
1374 int count = 0;
1375 UCollationElements *iter;
1376 UCollationElements *resultiter;
1377
1378 coll = ucol_openRules(rule, rulelen, UCOL_OFF, UCOL_DEFAULT_STRENGTH,NULL, &status);
1379 iter = ucol_openElements(coll, rule, 1, &status);
1380 resultiter = ucol_openElements(coll, rule, 1, &status);
1381
1382 if (U_FAILURE(status)) {
1383 log_err("Error opening collation rules\n");
1384 return;
1385 }
1386
1387 while (count < size) {
1388 UChar str[20];
1389 UChar tstr[20];
1390 int strLen = u_unescape(src[count], str, 20);
1391 UChar *s;
1392
1393 ucol_setText(iter, str, strLen, &status);
1394 if (U_FAILURE(status)) {
1395 log_err("Error opening collation iterator\n");
1396 return;
1397 }
1398
1399 u_unescape(tgt[count], tstr, 20);
1400 s = tstr;
1401
1402 log_verbose("count %d\n", count);
1403
1404 for (;;) {
1405 uint32_t ce;
1406 UChar *e = u_strchr(s, 0x20);
1407 if (e == 0) {
1408 e = u_strchr(s, 0);
1409 }
1410 ucol_setText(resultiter, s, (int32_t)(e - s), &status);
1411 ce = ucol_next(resultiter, &status);
1412 if (U_FAILURE(status)) {
1413 log_err("Error manipulating collation iterator\n");
1414 return;
1415 }
1416 while (ce != UCOL_NULLORDER) {
1417 if (ce != (uint32_t)ucol_next(iter, &status) ||
1418 U_FAILURE(status)) {
1419 log_err("Discontiguos contraction test mismatch\n");
1420 return;
1421 }
1422 ce = ucol_next(resultiter, &status);
1423 if (U_FAILURE(status)) {
1424 log_err("Error getting next collation element\n");
1425 return;
1426 }
1427 }
1428 s = e + 1;
1429 if (*e == 0) {
1430 break;
1431 }
1432 }
1433 ucol_reset(iter);
1434 backAndForth(iter);
1435 count ++;
1436 }
1437 ucol_closeElements(resultiter);
1438 ucol_closeElements(iter);
1439 ucol_close(coll);
1440 }
1441
1442 static void TestCEBufferOverflow()
1443 {
1444 UChar str[UCOL_EXPAND_CE_BUFFER_SIZE + 1];
1445 UErrorCode status = U_ZERO_ERROR;
1446 UChar rule[10];
1447 UCollator *coll;
1448 UCollationElements *iter;
1449
1450 u_uastrcpy(rule, "&z < AB");
1451 coll = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL,&status);
1452 if (U_FAILURE(status)) {
1453 log_err("Rule based collator not created for testing ce buffer overflow\n");
1454 return;
1455 }
1456
1457 /* 0xDCDC is a trail surrogate hence deemed unsafe by the heuristic
1458 test. this will cause an overflow in getPrev */
1459 str[0] = 0x0041; /* 'A' */
1460 /*uprv_memset(str + 1, 0xE0, sizeof(UChar) * UCOL_EXPAND_CE_BUFFER_SIZE);*/
1461 uprv_memset(str + 1, 0xDC, sizeof(UChar) * UCOL_EXPAND_CE_BUFFER_SIZE);
1462 str[UCOL_EXPAND_CE_BUFFER_SIZE] = 0x0042; /* 'B' */
1463 iter = ucol_openElements(coll, str, UCOL_EXPAND_CE_BUFFER_SIZE + 1,
1464 &status);
1465 if (ucol_previous(iter, &status) != UCOL_NULLORDER ||
1466 status != U_BUFFER_OVERFLOW_ERROR) {
1467 log_err("CE buffer expected to overflow with long string of trail surrogates\n");
1468 }
1469 ucol_closeElements(iter);
1470 ucol_close(coll);
1471 }
1472
1473 /**
1474 * Byte bounds checks. Checks if each byte in data is between upper and lower
1475 * inclusive.
1476 */
1477 static UBool checkByteBounds(uint32_t data, char upper, char lower)
1478 {
1479 int count = 4;
1480 while (count > 0) {
1481 char b = (char)(data & 0xFF);
1482 if (b > upper || b < lower) {
1483 return FALSE;
1484 }
1485 data = data >> 8;
1486 count --;
1487 }
1488 return TRUE;
1489 }
1490
/**
 * Determines case of the string of codepoints.
 * If it is a multiple codepoints it has to treated as a contraction.
 * This helper is currently compiled out.
 * @param s code units to classify
 * @param len number of code units in s
 * @return UCOL_LOWER_CASE, UCOL_UPPER_CASE or UCOL_MIXED_CASE
 */
#if 0
static uint8_t getCase(const UChar *s, uint32_t len) {
    UBool lower = FALSE;
    UBool upper = FALSE;
    UBool title = FALSE;
    uint32_t i;

    if (len == 0) {
        return UCOL_LOWER_CASE;
    }

    /* len is kept intact: it is needed again for the mixed-case decision */
    for (i = 0; i < len; i++) {
        UChar c = s[i];

        if (u_islower(c)) {
            lower = TRUE;
        }
        if (u_isupper(c)) {
            upper = TRUE;
        }
        if (u_istitle(c)) {
            title = TRUE;
        }
    }

    /* only lowercase, or no cased characters at all */
    if (!upper && !title) {
        return UCOL_LOWER_CASE;
    }
    if (upper && !lower && !title) {
        return UCOL_UPPER_CASE;
    }

    /* mix of cases here; an NFKD normalization of the string before this
     * decision was tried earlier and is not performed */
    if ((title && len >= 2) || (lower && upper)) {
        return UCOL_MIXED_CASE;
    }
    if (u_isupper(s[0])) {
        return UCOL_UPPER_CASE;
    }
    return UCOL_LOWER_CASE;
}
#endif
1545
1546 /**
1547 * Checking collation element validity given the boundary arguments.
1548 */
1549 static UBool checkCEValidity(const UCollator *coll, const UChar *codepoints,
1550 int length, uint32_t primarymax,
1551 uint32_t secondarymax)
1552 {
1553 UErrorCode status = U_ZERO_ERROR;
1554 UCollationElements *iter = ucol_openElements(coll, codepoints, length,
1555 &status);
1556 uint32_t ce;
1557 UBool first = TRUE;
1558 /*
1559 UBool upper = FALSE;
1560 UBool lower = FALSE;
1561 */
1562
1563 if (U_FAILURE(status)) {
1564 log_err("Error creating iterator for testing validity\n");
1565 }
1566
1567 ce = ucol_next(iter, &status);
1568
1569 while (ce != UCOL_NULLORDER) {
1570 if (ce != 0) {
1571 uint32_t primary = UCOL_PRIMARYORDER(ce);
1572 uint32_t secondary = UCOL_SECONDARYORDER(ce);
1573 uint32_t tertiary = UCOL_TERTIARYORDER(ce);
1574 /* uint32_t scasebits = tertiary & 0xC0;*/
1575
1576 if ((tertiary == 0 && secondary != 0) ||
1577 (tertiary < 0xC0 && secondary == 0 && primary != 0)) {
1578 /* n-1th level is not zero when the nth level is
1579 except for continuations, this is wrong */
1580 log_err("Lower level weight not 0 when high level weight is 0\n");
1581 goto fail;
1582 }
1583 else {
1584 /* checks if any byte is illegal ie = 01 02 03. */
1585 if (checkByteBounds(ce, 0x3, 0x1)) {
1586 log_err("Byte range in CE lies in illegal bounds 0x1 - 0x3\n");
1587 goto fail;
1588 }
1589 }
1590 if ((primary != 0 && primary < primarymax)
1591 || ((primary & 0xFF) == 0xFF) || (((primary>>8) & 0xFF) == 0xFF)
1592 || ((primary & 0xFF) && ((primary & 0xFF) <= 0x03))
1593 || (((primary>>8) & 0xFF) && ((primary>>8) & 0xFF) <= 0x03)
1594 || (primary >= 0xFE00 && !isContinuation(ce))) {
1595 log_err("UCA primary weight out of bounds: %04X for string starting with %04X\n",
1596 primary, codepoints[0]);
1597 goto fail;
1598 }
1599 /* case matching not done since data generated by ken */
1600 if (first) {
1601 if (secondary >= 6 && secondary <= secondarymax) {
1602 log_err("Secondary weight out of range\n");
1603 goto fail;
1604 }
1605 first = FALSE;
1606 }
1607 }
1608 ce = ucol_next(iter, &status);
1609 }
1610 ucol_closeElements(iter);
1611 return TRUE;
1612 fail :
1613 ucol_closeElements(iter);
1614 return FALSE;
1615 }
1616
1617 static void TestCEValidity()
1618 {
1619 /* testing UCA collation elements */
1620 UErrorCode status = U_ZERO_ERROR;
1621 /* en_US has no tailorings */
1622 UCollator *coll = ucol_open("root", &status);
1623 /* tailored locales */
1624 char locale[][11] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN", "zh__PINYIN"};
1625 const char *loc;
1626 FileStream *file = getFractionalUCA();
1627 char line[1024];
1628 UChar codepoints[10];
1629 int count = 0;
1630 int maxCount = 0;
1631 UParseError parseError;
1632 if (U_FAILURE(status)) {
1633 log_err("en_US collator creation failed\n");
1634 return;
1635 }
1636 log_verbose("Testing UCA elements\n");
1637 if (file == NULL) {
1638 log_err("Fractional UCA data can not be opened\n");
1639 return;
1640 }
1641
1642 while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) {
1643 if(line[0] == 0 || line[0] == '#' || line[0] == '\n' ||
1644 line[0] == 0x000D || line[0] == '[') {
1645 continue;
1646 }
1647
1648 getCodePoints(line, codepoints);
1649 checkCEValidity(coll, codepoints, u_strlen(codepoints), 5, 86);
1650 }
1651
1652 log_verbose("Testing UCA elements for the whole range of unicode characters\n");
1653 codepoints[0] = 0;
1654 while (codepoints[0] < 0xFFFF) {
1655 if (u_isdefined((UChar32)codepoints[0])) {
1656 checkCEValidity(coll, codepoints, 1, 5, 86);
1657 }
1658 codepoints[0] ++;
1659 }
1660
1661 ucol_close(coll);
1662
1663 /* testing tailored collation elements */
1664 log_verbose("Testing tailored elements\n");
1665 if(QUICK) {
1666 maxCount = sizeof(locale)/sizeof(locale[0]);
1667 } else {
1668 maxCount = uloc_countAvailable();
1669 }
1670 while (count < maxCount) {
1671 const UChar *rules = NULL,
1672 *current = NULL;
1673 UChar *rulesCopy = NULL;
1674 int32_t ruleLen = 0;
1675
1676 uint32_t chOffset = 0;
1677 uint32_t chLen = 0;
1678 uint32_t exOffset = 0;
1679 uint32_t exLen = 0;
1680 uint32_t prefixOffset = 0;
1681 uint32_t prefixLen = 0;
1682 UBool startOfRules = TRUE;
1683 UColOptionSet opts;
1684
1685 UColTokenParser src;
1686 uint32_t strength = 0;
1687 uint16_t specs = 0;
1688 if(QUICK) {
1689 loc = locale[count];
1690 } else {
1691 loc = uloc_getAvailable(count);
1692 if(!hasCollationElements(loc)) {
1693 count++;
1694 continue;
1695 }
1696 }
1697
1698 log_verbose("Testing CEs for %s\n", loc);
1699
1700 coll = ucol_open(loc, &status);
1701 if (U_FAILURE(status)) {
1702 log_err("%s collator creation failed\n", loc);
1703 return;
1704 }
1705
1706 src.opts = &opts;
1707 rules = ucol_getRules(coll, &ruleLen);
1708
1709 if (ruleLen > 0) {
1710 rulesCopy = (UChar *)malloc((ruleLen +
1711 UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar));
1712 uprv_memcpy(rulesCopy, rules, ruleLen * sizeof(UChar));
1713 src.current = src.source = rulesCopy;
1714 src.end = rulesCopy + ruleLen;
1715 src.extraCurrent = src.end;
1716 src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
1717
1718 while ((current = ucol_tok_parseNextToken(&src, startOfRules, &parseError,&status)) != NULL) {
1719 strength = src.parsedToken.strength;
1720 chOffset = src.parsedToken.charsOffset;
1721 chLen = src.parsedToken.charsLen;
1722 exOffset = src.parsedToken.extensionOffset;
1723 exLen = src.parsedToken.extensionLen;
1724 prefixOffset = src.parsedToken.prefixOffset;
1725 prefixLen = src.parsedToken.prefixLen;
1726 specs = src.parsedToken.flags;
1727
1728 startOfRules = FALSE;
1729 uprv_memcpy(codepoints, src.source + chOffset,
1730 chLen * sizeof(UChar));
1731 codepoints[chLen] = 0;
1732 checkCEValidity(coll, codepoints, chLen, 4, 85);
1733 }
1734 free(rulesCopy);
1735 }
1736
1737 ucol_close(coll);
1738 count ++;
1739 }
1740 T_FileStream_close(file);
1741 }
1742
1743 static void printSortKeyError(const UChar *codepoints, int length,
1744 uint8_t *sortkey, int sklen)
1745 {
1746 int count = 0;
1747 log_err("Sortkey not valid for ");
1748 while (length > 0) {
1749 log_err("0x%04x ", *codepoints);
1750 length --;
1751 codepoints ++;
1752 }
1753 log_err("\nSortkey : ");
1754 while (count < sklen) {
1755 log_err("0x%02x ", sortkey[count]);
1756 count ++;
1757 }
1758 log_err("\n");
1759 }
1760
1761 /**
1762 * Checking sort key validity for all levels
1763 */
1764 static UBool checkSortKeyValidity(UCollator *coll,
1765 const UChar *codepoints,
1766 int length)
1767 {
1768 UErrorCode status = U_ZERO_ERROR;
1769 UCollationStrength strength[5] = {UCOL_PRIMARY, UCOL_SECONDARY,
1770 UCOL_TERTIARY, UCOL_QUATERNARY,
1771 UCOL_IDENTICAL};
1772 int strengthlen = 5;
1773 int index = 0;
1774 int caselevel = 0;
1775
1776 while (caselevel < 1) {
1777 if (caselevel == 0) {
1778 ucol_setAttribute(coll, UCOL_CASE_LEVEL, UCOL_OFF, &status);
1779 }
1780 else {
1781 ucol_setAttribute(coll, UCOL_CASE_LEVEL, UCOL_ON, &status);
1782 }
1783
1784 while (index < strengthlen) {
1785 int count01 = 0;
1786 uint32_t count = 0;
1787 uint8_t sortkey[128];
1788 uint32_t sklen;
1789
1790 ucol_setStrength(coll, strength[index]);
1791 sklen = ucol_getSortKey(coll, codepoints, length, sortkey, 128);
1792 while (sortkey[count] != 0) {
1793 if (sortkey[count] == 2 || (sortkey[count] == 3 && count01 > 0 && index != 4)) {
1794 printSortKeyError(codepoints, length, sortkey, sklen);
1795 return FALSE;
1796 }
1797 if (sortkey[count] == 1) {
1798 count01 ++;
1799 }
1800 count ++;
1801 }
1802
1803 if (count + 1 != sklen || (count01 != index + caselevel)) {
1804 printSortKeyError(codepoints, length, sortkey, sklen);
1805 return FALSE;
1806 }
1807 index ++;
1808 }
1809 caselevel ++;
1810 }
1811 return TRUE;
1812 }
1813
1814 static void TestSortKeyValidity(void)
1815 {
1816 /* testing UCA collation elements */
1817 UErrorCode status = U_ZERO_ERROR;
1818 /* en_US has no tailorings */
1819 UCollator *coll = ucol_open("en_US", &status);
1820 /* tailored locales */
1821 char locale[][6] = {"fr_FR\0", "ko_KR\0", "sh_YU\0", "th_TH\0", "zh_CN\0"};
1822 FileStream *file = getFractionalUCA();
1823 char line[1024];
1824 UChar codepoints[10];
1825 int count = 0;
1826 UParseError parseError;
1827 if (U_FAILURE(status)) {
1828 log_err("en_US collator creation failed\n");
1829 return;
1830 }
1831 log_verbose("Testing UCA elements\n");
1832 if (file == NULL) {
1833 log_err("Fractional UCA data can not be opened\n");
1834 return;
1835 }
1836
1837 while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) {
1838 if(line[0] == 0 || line[0] == '#' || line[0] == '\n' ||
1839 line[0] == 0x000D || line[0] == '[') {
1840 continue;
1841 }
1842
1843 getCodePoints(line, codepoints);
1844 checkSortKeyValidity(coll, codepoints, u_strlen(codepoints));
1845 }
1846
1847 log_verbose("Testing UCA elements for the whole range of unicode characters\n");
1848 codepoints[0] = 0;
1849
1850 while (codepoints[0] < 0xFFFF) {
1851 if (u_isdefined((UChar32)codepoints[0])) {
1852 checkSortKeyValidity(coll, codepoints, 1);
1853 }
1854 codepoints[0] ++;
1855 }
1856
1857 ucol_close(coll);
1858
1859 /* testing tailored collation elements */
1860 log_verbose("Testing tailored elements\n");
1861 while (count < 5) {
1862 const UChar *rules = NULL,
1863 *current = NULL;
1864 UChar *rulesCopy = NULL;
1865 int32_t ruleLen = 0;
1866
1867 uint32_t chOffset = 0;
1868 uint32_t chLen = 0;
1869 uint32_t exOffset = 0;
1870 uint32_t exLen = 0;
1871 uint32_t prefixOffset = 0;
1872 uint32_t prefixLen = 0;
1873 UBool startOfRules = TRUE;
1874 UColOptionSet opts;
1875
1876 UColTokenParser src;
1877 uint32_t strength = 0;
1878 uint16_t specs = 0;
1879
1880 coll = ucol_open(locale[count], &status);
1881 if (U_FAILURE(status)) {
1882 log_err("%s collator creation failed\n", locale[count]);
1883 return;
1884 }
1885
1886 src.opts = &opts;
1887 rules = ucol_getRules(coll, &ruleLen);
1888
1889 if (ruleLen > 0) {
1890 rulesCopy = (UChar *)malloc((ruleLen +
1891 UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar));
1892 uprv_memcpy(rulesCopy, rules, ruleLen * sizeof(UChar));
1893 src.current = src.source = rulesCopy;
1894 src.end = rulesCopy + ruleLen;
1895 src.extraCurrent = src.end;
1896 src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
1897
1898 while ((current = ucol_tok_parseNextToken(&src, startOfRules,&parseError, &status)) != NULL) {
1899 strength = src.parsedToken.strength;
1900 chOffset = src.parsedToken.charsOffset;
1901 chLen = src.parsedToken.charsLen;
1902 exOffset = src.parsedToken.extensionOffset;
1903 exLen = src.parsedToken.extensionLen;
1904 prefixOffset = src.parsedToken.prefixOffset;
1905 prefixLen = src.parsedToken.prefixLen;
1906 specs = src.parsedToken.flags;
1907
1908 startOfRules = FALSE;
1909 uprv_memcpy(codepoints, src.source + chOffset,
1910 chLen * sizeof(UChar));
1911 codepoints[chLen] = 0;
1912 checkSortKeyValidity(coll, codepoints, chLen);
1913 }
1914 free(rulesCopy);
1915 }
1916
1917 ucol_close(coll);
1918 count ++;
1919 }
1920 T_FileStream_close(file);
1921 }
1922
1923 #endif /* #if !UCONFIG_NO_COLLATION */