]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/cintltst/citertst.c
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / test / cintltst / citertst.c
CommitLineData
b75a7d8f
A
1/********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 1997-2003, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6/********************************************************************************
7*
8* File CITERTST.C
9*
10* Modification History:
11* Date Name Description
12* Madhu Katragadda Ported for C API
13* 02/19/01 synwee Modified test case for new collation iterator
14*********************************************************************************/
15/*
16 * Collation Iterator tests.
17 * (Let me reiterate my position...)
18 */
19
20#include "unicode/utypes.h"
21
22#if !UCONFIG_NO_COLLATION
23
24#include "unicode/ucol.h"
25#include "unicode/uloc.h"
26#include "unicode/uchar.h"
27#include "unicode/ustring.h"
28#include "cmemory.h"
29#include "cintltst.h"
30#include "citertst.h"
31#include "ccolltst.h"
32#include "filestrm.h"
33#include "cstring.h"
34#include "ucol_imp.h"
35#include "ucol_tok.h"
36#include <stdio.h>
37
38extern uint8_t ucol_uprv_getCaseBits(const UChar *, uint32_t, UErrorCode *);
39
40void addCollIterTest(TestNode** root)
41{
42 addTest(root, &TestPrevious, "tscoll/citertst/TestPrevious");
43 addTest(root, &TestOffset, "tscoll/citertst/TestOffset");
44 addTest(root, &TestSetText, "tscoll/citertst/TestSetText");
45 addTest(root, &TestMaxExpansion, "tscoll/citertst/TestMaxExpansion");
46 addTest(root, &TestUnicodeChar, "tscoll/citertst/TestUnicodeChar");
47 addTest(root, &TestNormalizedUnicodeChar,
48 "tscoll/citertst/TestNormalizedUnicodeChar");
49 addTest(root, &TestNormalization, "tscoll/citertst/TestNormalization");
50 addTest(root, &TestBug672, "tscoll/citertst/TestBug672");
51 addTest(root, &TestBug672Normalize, "tscoll/citertst/TestBug672Normalize");
52 addTest(root, &TestSmallBuffer, "tscoll/citertst/TestSmallBuffer");
53 addTest(root, &TestCEs, "tscoll/citertst/TestCEs");
54 addTest(root, &TestDiscontiguos, "tscoll/citertst/TestDiscontiguos");
55 addTest(root, &TestCEBufferOverflow, "tscoll/citertst/TestCEBufferOverflow");
56 addTest(root, &TestCEValidity, "tscoll/citertst/TestCEValidity");
57 addTest(root, &TestSortKeyValidity, "tscoll/citertst/TestSortKeyValidity");
58}
59
/* Locales iterated over by TestBug672 and TestBug672Normalize. */
static const char * LOCALES[] = {
    "en_AU",
    "en_BE",
    "en_CA"
};
63
64static void TestBug672() {
65 UErrorCode status = U_ZERO_ERROR;
66 UChar pattern[20];
67 UChar text[50];
68 int i;
69 int result[3][3];
70
71 u_uastrcpy(pattern, "resume");
72 u_uastrcpy(text, "Time to resume updating my resume.");
73
74 for (i = 0; i < 3; ++ i) {
75 UCollator *coll = ucol_open(LOCALES[i], &status);
76 UCollationElements *pitr = ucol_openElements(coll, pattern, -1,
77 &status);
78 UCollationElements *titer = ucol_openElements(coll, text, -1,
79 &status);
80 if (U_FAILURE(status)) {
81 log_err("ERROR: in creation of either the collator or the collation iterator :%s\n",
82 myErrorName(status));
83 return;
84 }
85
86 log_verbose("locale tested %s\n", LOCALES[i]);
87
88 while (ucol_next(pitr, &status) != UCOL_NULLORDER &&
89 U_SUCCESS(status)) {
90 }
91 if (U_FAILURE(status)) {
92 log_err("ERROR: reversing collation iterator :%s\n",
93 myErrorName(status));
94 return;
95 }
96 ucol_reset(pitr);
97
98 ucol_setOffset(titer, u_strlen(pattern), &status);
99 if (U_FAILURE(status)) {
100 log_err("ERROR: setting offset in collator :%s\n",
101 myErrorName(status));
102 return;
103 }
104 result[i][0] = ucol_getOffset(titer);
105 log_verbose("Text iterator set to offset %d\n", result[i][0]);
106
107 /* Use previous() */
108 ucol_previous(titer, &status);
109 result[i][1] = ucol_getOffset(titer);
110 log_verbose("Current offset %d after previous\n", result[i][1]);
111
112 /* Add one to index */
113 log_verbose("Adding one to current offset...\n");
114 ucol_setOffset(titer, ucol_getOffset(titer) + 1, &status);
115 if (U_FAILURE(status)) {
116 log_err("ERROR: setting offset in collator :%s\n",
117 myErrorName(status));
118 return;
119 }
120 result[i][2] = ucol_getOffset(titer);
121 log_verbose("Current offset in text = %d\n", result[i][2]);
122 ucol_closeElements(pitr);
123 ucol_closeElements(titer);
124 ucol_close(coll);
125 }
126
127 if (uprv_memcmp(result[0], result[1], 3) != 0 ||
128 uprv_memcmp(result[1], result[2], 3) != 0) {
129 log_err("ERROR: Different locales have different offsets at the same character\n");
130 }
131}
132
133
134
135/* Running this test with normalization enabled showed up a bug in the incremental
136 normalization code. */
137static void TestBug672Normalize() {
138 UErrorCode status = U_ZERO_ERROR;
139 UChar pattern[20];
140 UChar text[50];
141 int i;
142 int result[3][3];
143
144 u_uastrcpy(pattern, "resume");
145 u_uastrcpy(text, "Time to resume updating my resume.");
146
147 for (i = 0; i < 3; ++ i) {
148 UCollator *coll = ucol_open(LOCALES[i], &status);
149 UCollationElements *pitr = NULL;
150 UCollationElements *titer = NULL;
151
152 ucol_setAttribute(coll, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
153
154 pitr = ucol_openElements(coll, pattern, -1, &status);
155 titer = ucol_openElements(coll, text, -1, &status);
156 if (U_FAILURE(status)) {
157 log_err("ERROR: in creation of either the collator or the collation iterator :%s\n",
158 myErrorName(status));
159 return;
160 }
161
162 log_verbose("locale tested %s\n", LOCALES[i]);
163
164 while (ucol_next(pitr, &status) != UCOL_NULLORDER &&
165 U_SUCCESS(status)) {
166 }
167 if (U_FAILURE(status)) {
168 log_err("ERROR: reversing collation iterator :%s\n",
169 myErrorName(status));
170 return;
171 }
172 ucol_reset(pitr);
173
174 ucol_setOffset(titer, u_strlen(pattern), &status);
175 if (U_FAILURE(status)) {
176 log_err("ERROR: setting offset in collator :%s\n",
177 myErrorName(status));
178 return;
179 }
180 result[i][0] = ucol_getOffset(titer);
181 log_verbose("Text iterator set to offset %d\n", result[i][0]);
182
183 /* Use previous() */
184 ucol_previous(titer, &status);
185 result[i][1] = ucol_getOffset(titer);
186 log_verbose("Current offset %d after previous\n", result[i][1]);
187
188 /* Add one to index */
189 log_verbose("Adding one to current offset...\n");
190 ucol_setOffset(titer, ucol_getOffset(titer) + 1, &status);
191 if (U_FAILURE(status)) {
192 log_err("ERROR: setting offset in collator :%s\n",
193 myErrorName(status));
194 return;
195 }
196 result[i][2] = ucol_getOffset(titer);
197 log_verbose("Current offset in text = %d\n", result[i][2]);
198 ucol_closeElements(pitr);
199 ucol_closeElements(titer);
200 ucol_close(coll);
201 }
202
203 if (uprv_memcmp(result[0], result[1], 3) != 0 ||
204 uprv_memcmp(result[1], result[2], 3) != 0) {
205 log_err("ERROR: Different locales have different offsets at the same character\n");
206 }
207}
208
209
210
211
212/**
213 * Test for CollationElementIterator previous and next for the whole set of
214 * unicode characters.
215 */
216static void TestUnicodeChar()
217{
218 UChar source[0x100];
219 UCollator *en_us;
220 UCollationElements *iter;
221 UErrorCode status = U_ZERO_ERROR;
222 UChar codepoint;
223
224 UChar *test;
225 en_us = ucol_open("en_US", &status);
226 if (U_FAILURE(status)){
227 log_err("ERROR: in creation of collation data using ucol_open()\n %s\n",
228 myErrorName(status));
229 return;
230 }
231
232 for (codepoint = 1; codepoint < 0xFFFE;)
233 {
234 test = source;
235
236 while (codepoint % 0xFF != 0)
237 {
238 if (u_isdefined(codepoint))
239 *(test ++) = codepoint;
240 codepoint ++;
241 }
242
243 if (u_isdefined(codepoint))
244 *(test ++) = codepoint;
245
246 if (codepoint != 0xFFFF)
247 codepoint ++;
248
249 *test = 0;
250 iter=ucol_openElements(en_us, source, u_strlen(source), &status);
251 if(U_FAILURE(status)){
252 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
253 myErrorName(status));
254 ucol_close(en_us);
255 return;
256 }
257 /* A basic test to see if it's working at all */
258 log_verbose("codepoint testing %x\n", codepoint);
259 backAndForth(iter);
260 ucol_closeElements(iter);
261
262 /* null termination test */
263 iter=ucol_openElements(en_us, source, -1, &status);
264 if(U_FAILURE(status)){
265 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
266 myErrorName(status));
267 ucol_close(en_us);
268 return;
269 }
270 /* A basic test to see if it's working at all */
271 backAndForth(iter);
272 ucol_closeElements(iter);
273 }
274
275 ucol_close(en_us);
276}
277
278/**
279 * Test for CollationElementIterator previous and next for the whole set of
280 * unicode characters with normalization on.
281 */
282static void TestNormalizedUnicodeChar()
283{
284 UChar source[0x100];
285 UCollator *th_th;
286 UCollationElements *iter;
287 UErrorCode status = U_ZERO_ERROR;
288 UChar codepoint;
289
290 UChar *test;
291 /* thai should have normalization on */
292 th_th = ucol_open("th_TH", &status);
293 if (U_FAILURE(status)){
294 log_err("ERROR: in creation of thai collation using ucol_open()\n %s\n",
295 myErrorName(status));
296 return;
297 }
298
299 for (codepoint = 1; codepoint < 0xFFFE;)
300 {
301 test = source;
302
303 while (codepoint % 0xFF != 0)
304 {
305 if (u_isdefined(codepoint))
306 *(test ++) = codepoint;
307 codepoint ++;
308 }
309
310 if (u_isdefined(codepoint))
311 *(test ++) = codepoint;
312
313 if (codepoint != 0xFFFF)
314 codepoint ++;
315
316 *test = 0;
317 iter=ucol_openElements(th_th, source, u_strlen(source), &status);
318 if(U_FAILURE(status)){
319 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
320 myErrorName(status));
321 ucol_close(th_th);
322 return;
323 }
324
325 backAndForth(iter);
326 ucol_closeElements(iter);
327
328 iter=ucol_openElements(th_th, source, -1, &status);
329 if(U_FAILURE(status)){
330 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
331 myErrorName(status));
332 ucol_close(th_th);
333 return;
334 }
335
336 backAndForth(iter);
337 ucol_closeElements(iter);
338 }
339
340 ucol_close(th_th);
341}
342
343/**
344* Test the incremental normalization
345*/
346static void TestNormalization()
347{
348 UErrorCode status = U_ZERO_ERROR;
349 const char *str =
350 "&a < \\u0300\\u0315 < A\\u0300\\u0315 < \\u0316\\u0315B < \\u0316\\u0300\\u0315";
351 UCollator *coll;
352 UChar rule[50];
353 int rulelen = u_unescape(str, rule, 50);
354 int count = 0;
355 const char *testdata[] =
356 {"\\u1ED9", "o\\u0323\\u0302",
357 "\\u0300\\u0315", "\\u0315\\u0300",
358 "A\\u0300\\u0315B", "A\\u0315\\u0300B",
359 "A\\u0316\\u0315B", "A\\u0315\\u0316B",
360 "\\u0316\\u0300\\u0315", "\\u0315\\u0300\\u0316",
361 "A\\u0316\\u0300\\u0315B", "A\\u0315\\u0300\\u0316B",
362 "\\u0316\\u0315\\u0300", "A\\u0316\\u0315\\u0300B"};
363 int32_t srclen;
364 UChar source[10];
365 UCollationElements *iter;
366
367 coll = ucol_openRules(rule, rulelen, UCOL_ON, UCOL_TERTIARY, NULL, &status);
368 ucol_setAttribute(coll, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
369 if (U_FAILURE(status)){
370 log_err("ERROR: in creation of collator using ucol_openRules()\n %s\n",
371 myErrorName(status));
372 return;
373 }
374
375 srclen = u_unescape(testdata[0], source, 10);
376 iter = ucol_openElements(coll, source, srclen, &status);
377 backAndForth(iter);
378 ucol_closeElements(iter);
379
380 srclen = u_unescape(testdata[1], source, 10);
381 iter = ucol_openElements(coll, source, srclen, &status);
382 backAndForth(iter);
383 ucol_closeElements(iter);
384
385 while (count < 12) {
386 srclen = u_unescape(testdata[count], source, 10);
387 iter = ucol_openElements(coll, source, srclen, &status);
388
389 if (U_FAILURE(status)){
390 log_err("ERROR: in creation of collator element iterator\n %s\n",
391 myErrorName(status));
392 return;
393 }
394 backAndForth(iter);
395 ucol_closeElements(iter);
396
397 iter = ucol_openElements(coll, source, -1, &status);
398
399 if (U_FAILURE(status)){
400 log_err("ERROR: in creation of collator element iterator\n %s\n",
401 myErrorName(status));
402 return;
403 }
404 backAndForth(iter);
405 ucol_closeElements(iter);
406 count ++;
407 }
408 ucol_close(coll);
409}
410
411/**
412 * Test for CollationElementIterator.previous()
413 *
414 * @bug 4108758 - Make sure it works with contracting characters
415 *
416 */
417static void TestPrevious()
418{
419 UCollator *coll=NULL;
420 UChar rule[50];
421 UChar *source;
422 UCollator *c1, *c2, *c3;
423 UCollationElements *iter;
424 UErrorCode status = U_ZERO_ERROR;
425
426 test1=(UChar*)malloc(sizeof(UChar) * 50);
427 test2=(UChar*)malloc(sizeof(UChar) * 50);
428 u_uastrcpy(test1, "What subset of all possible test cases?");
429 u_uastrcpy(test2, "has the highest probability of detecting");
430 coll = ucol_open("en_US", &status);
431
432 iter=ucol_openElements(coll, test1, u_strlen(test1), &status);
433 log_verbose("English locale testing back and forth\n");
434 if(U_FAILURE(status)){
435 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
436 myErrorName(status));
437 ucol_close(coll);
438 return;
439 }
440 /* A basic test to see if it's working at all */
441 backAndForth(iter);
442 ucol_closeElements(iter);
443 ucol_close(coll);
444
445 /* Test with a contracting character sequence */
446 u_uastrcpy(rule, "&a,A < b,B < c,C, d,D < z,Z < ch,cH,Ch,CH");
447 c1 = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL, &status);
448
449 log_verbose("Contraction rule testing back and forth with no normalization\n");
450
451 if (c1 == NULL || U_FAILURE(status))
452 {
453 log_err("Couldn't create a RuleBasedCollator with a contracting sequence\n %s\n",
454 myErrorName(status));
455 return;
456 }
457 source=(UChar*)malloc(sizeof(UChar) * 20);
458 u_uastrcpy(source, "abchdcba");
459 iter=ucol_openElements(c1, source, u_strlen(source), &status);
460 if(U_FAILURE(status)){
461 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
462 myErrorName(status));
463 return;
464 }
465 backAndForth(iter);
466 ucol_closeElements(iter);
467 ucol_close(c1);
468
469 /* Test with an expanding character sequence */
470 u_uastrcpy(rule, "&a < b < c/abd < d");
471 c2 = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL, &status);
472 log_verbose("Expansion rule testing back and forth with no normalization\n");
473 if (c2 == NULL || U_FAILURE(status))
474 {
475 log_err("Couldn't create a RuleBasedCollator with a contracting sequence.\n %s\n",
476 myErrorName(status));
477 return;
478 }
479 u_uastrcpy(source, "abcd");
480 iter=ucol_openElements(c2, source, u_strlen(source), &status);
481 if(U_FAILURE(status)){
482 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
483 myErrorName(status));
484 return;
485 }
486 backAndForth(iter);
487 ucol_closeElements(iter);
488 ucol_close(c2);
489 /* Now try both */
490 u_uastrcpy(rule, "&a < b < c/aba < d < z < ch");
491 c3 = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,NULL, &status);
492 log_verbose("Expansion/contraction rule testing back and forth with no normalization\n");
493
494 if (c3 == NULL || U_FAILURE(status))
495 {
496 log_err("Couldn't create a RuleBasedCollator with a contracting sequence.\n %s\n",
497 myErrorName(status));
498 return;
499 }
500 u_uastrcpy(source, "abcdbchdc");
501 iter=ucol_openElements(c3, source, u_strlen(source), &status);
502 if(U_FAILURE(status)){
503 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
504 myErrorName(status));
505 return;
506 }
507 backAndForth(iter);
508 ucol_closeElements(iter);
509 ucol_close(c3);
510 source[0] = 0x0e41;
511 source[1] = 0x0e02;
512 source[2] = 0x0e41;
513 source[3] = 0x0e02;
514 source[4] = 0x0e27;
515 source[5] = 0x61;
516 source[6] = 0x62;
517 source[7] = 0x63;
518 source[8] = 0;
519
520 coll = ucol_open("th_TH", &status);
521 log_verbose("Thai locale testing back and forth with normalization\n");
522 iter=ucol_openElements(coll, source, u_strlen(source), &status);
523 if(U_FAILURE(status)){
524 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
525 myErrorName(status));
526 return;
527 }
528 backAndForth(iter);
529 ucol_closeElements(iter);
530 ucol_close(coll);
531
532 /* prev test */
533 source[0] = 0x0061;
534 source[1] = 0x30CF;
535 source[2] = 0x3099;
536 source[3] = 0x30FC;
537 source[4] = 0;
538
539 coll = ucol_open("ja_JP", &status);
540 log_verbose("Japanese locale testing back and forth with normalization\n");
541 iter=ucol_openElements(coll, source, u_strlen(source), &status);
542 if(U_FAILURE(status)){
543 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
544 myErrorName(status));
545 return;
546 }
547 backAndForth(iter);
548 ucol_closeElements(iter);
549 ucol_close(coll);
550
551 free(source);
552 free(test1);
553 free(test2);
554}
555
556/**
557 * Test for getOffset() and setOffset()
558 */
559static void TestOffset()
560{
561 UErrorCode status= U_ZERO_ERROR;
562 UCollator *en_us=NULL;
563 UCollationElements *iter, *pristine;
564 int32_t offset;
565 int32_t *orders;
566 int32_t orderLength=0;
567 int count = 0;
568 test1=(UChar*)malloc(sizeof(UChar) * 50);
569 test2=(UChar*)malloc(sizeof(UChar) * 50);
570 u_uastrcpy(test1, "What subset of all possible test cases?");
571 u_uastrcpy(test2, "has the highest probability of detecting");
572 en_us = ucol_open("en_US", &status);
573 log_verbose("Testing getOffset and setOffset for CollationElements\n");
574 iter = ucol_openElements(en_us, test1, u_strlen(test1), &status);
575 if(U_FAILURE(status)){
576 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
577 myErrorName(status));
578 ucol_close(en_us);
579 return;
580 }
581 /* Run all the way through the iterator, then get the offset */
582
583 orders = getOrders(iter, &orderLength);
584
585 offset = ucol_getOffset(iter);
586
587 if (offset != u_strlen(test1))
588 {
589 log_err("offset at end != length %d vs %d\n", offset,
590 u_strlen(test1) );
591 }
592
593 /* Now set the offset back to the beginning and see if it works */
594 pristine=ucol_openElements(en_us, test1, u_strlen(test1), &status);
595 if(U_FAILURE(status)){
596 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
597 myErrorName(status));
598 ucol_close(en_us);
599 return;
600 }
601 status = U_ZERO_ERROR;
602
603 ucol_setOffset(iter, 0, &status);
604 if (U_FAILURE(status))
605 {
606 log_err("setOffset failed. %s\n", myErrorName(status));
607 }
608 else
609 {
610 assertEqual(iter, pristine);
611 }
612
613 ucol_closeElements(pristine);
614 ucol_closeElements(iter);
615 free(orders);
616
617 /* testing offsets in normalization buffer */
618 test1[0] = 0x61;
619 test1[1] = 0x300;
620 test1[2] = 0x316;
621 test1[3] = 0x62;
622 test1[4] = 0;
623 ucol_setAttribute(en_us, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
624 iter = ucol_openElements(en_us, test1, 4, &status);
625 if(U_FAILURE(status)){
626 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
627 myErrorName(status));
628 ucol_close(en_us);
629 return;
630 }
631
632 count = 0;
633 while (ucol_next(iter, &status) != UCOL_NULLORDER &&
634 U_SUCCESS(status)) {
635 switch (count) {
636 case 0:
637 if (ucol_getOffset(iter) != 1) {
638 log_err("ERROR: Offset of iteration should be 0\n");
639 }
640 break;
641 case 3:
642 if (ucol_getOffset(iter) != 4) {
643 log_err("ERROR: Offset of iteration should be 4\n");
644 }
645 break;
646 default:
647 if (ucol_getOffset(iter) != 3) {
648 log_err("ERROR: Offset of iteration should be 3\n");
649 }
650 }
651 count ++;
652 }
653
654 ucol_reset(iter);
655 count = 0;
656 while (ucol_previous(iter, &status) != UCOL_NULLORDER &&
657 U_SUCCESS(status)) {
658 switch (count) {
659 case 0:
660 if (ucol_getOffset(iter) != 3) {
661 log_err("ERROR: Offset of iteration should be 3\n");
662 }
663 break;
664 default:
665 if (ucol_getOffset(iter) != 0) {
666 log_err("ERROR: Offset of iteration should be 0\n");
667 }
668 }
669 count ++;
670 }
671
672 if(U_FAILURE(status)){
673 log_err("ERROR: in iterating collation elements %s\n",
674 myErrorName(status));
675 }
676
677 ucol_closeElements(iter);
678 ucol_close(en_us);
679 free(test1);
680 free(test2);
681}
682
683/**
684 * Test for setText()
685 */
686static void TestSetText()
687{
688 int32_t c,i;
689 UErrorCode status = U_ZERO_ERROR;
690 UCollator *en_us=NULL;
691 UCollationElements *iter1, *iter2;
692 test1=(UChar*)malloc(sizeof(UChar) * 50);
693 test2=(UChar*)malloc(sizeof(UChar) * 50);
694 u_uastrcpy(test1, "What subset of all possible test cases?");
695 u_uastrcpy(test2, "has the highest probability of detecting");
696 en_us = ucol_open("en_US", &status);
697 log_verbose("testing setText for Collation elements\n");
698 iter1=ucol_openElements(en_us, test1, u_strlen(test1), &status);
699 if(U_FAILURE(status)){
700 log_err("ERROR: in creation of collation element iterator1 using ucol_openElements()\n %s\n",
701 myErrorName(status));
702 ucol_close(en_us);
703 return;
704 }
705 iter2=ucol_openElements(en_us, test2, u_strlen(test2), &status);
706 if(U_FAILURE(status)){
707 log_err("ERROR: in creation of collation element iterator2 using ucol_openElements()\n %s\n",
708 myErrorName(status));
709 ucol_close(en_us);
710 return;
711 }
712
713 /* Run through the second iterator just to exercise it */
714 c = ucol_next(iter2, &status);
715 i = 0;
716
717 while ( ++i < 10 && (c != UCOL_NULLORDER))
718 {
719 if (U_FAILURE(status))
720 {
721 log_err("iter2->next() returned an error. %s\n", myErrorName(status));
722 ucol_closeElements(iter2);
723 ucol_closeElements(iter1);
724 ucol_close(en_us);
725 return;
726 }
727
728 c = ucol_next(iter2, &status);
729 }
730
731 /* Now set it to point to the same string as the first iterator */
732 ucol_setText(iter2, test1, u_strlen(test1), &status);
733 if (U_FAILURE(status))
734 {
735 log_err("call to iter2->setText(test1) failed. %s\n", myErrorName(status));
736 }
737 else
738 {
739 assertEqual(iter1, iter2);
740 }
741
742 /* Now set it to point to a null string with fake length*/
743 ucol_setText(iter2, NULL, 2, &status);
744 if (U_FAILURE(status))
745 {
746 log_err("call to iter2->setText(null) failed. %s\n", myErrorName(status));
747 }
748 else
749 {
750 if (ucol_next(iter2, &status) != UCOL_NULLORDER) {
751 log_err("iter2 with null text expected to return UCOL_NULLORDER\n");
752 }
753 }
754
755 ucol_closeElements(iter2);
756 ucol_closeElements(iter1);
757 ucol_close(en_us);
758 free(test1);
759 free(test2);
760}
761
762
763
764static void backAndForth(UCollationElements *iter)
765{
766 /* Run through the iterator forwards and stick it into an array */
767 int32_t index, o;
768 UErrorCode status = U_ZERO_ERROR;
769 int32_t orderLength = 0;
770 int32_t *orders;
771 orders= getOrders(iter, &orderLength);
772
773
774 /* Now go through it backwards and make sure we get the same values */
775 index = orderLength;
776 ucol_reset(iter);
777
778 /* synwee : changed */
779 while ((o = ucol_previous(iter, &status)) != UCOL_NULLORDER)
780 {
781 if (o != orders[-- index])
782 {
783 if (o == 0)
784 index ++;
785 else
786 {
787 while (index > 0 && orders[-- index] == 0)
788 {
789 }
790 if (o != orders[index])
791 {
792 log_err("Mismatch at index : 0x%x\n", index);
793 return;
794 }
795
796 }
797 }
798 }
799
800 while (index != 0 && orders[index - 1] == 0) {
801 index --;
802 }
803
804 if (index != 0)
805 {
806 log_err("Didn't get back to beginning - index is %d\n", index);
807
808 ucol_reset(iter);
809 log_err("\nnext: ");
810 if ((o = ucol_next(iter, &status)) != UCOL_NULLORDER)
811 {
812 log_err("Error at %x\n", o);
813 }
814 log_err("\nprev: ");
815 if ((o = ucol_previous(iter, &status)) != UCOL_NULLORDER)
816 {
817 log_err("Error at %x\n", o);
818 }
819 log_verbose("\n");
820 }
821
822 free(orders);
823}
824
825/** @bug 4108762
826 * Test for getMaxExpansion()
827 */
828static void TestMaxExpansion()
829{
830 UErrorCode status = U_ZERO_ERROR;
831 UCollator *coll ;/*= ucol_open("en_US", &status);*/
832 UChar ch = 0;
833 UChar supplementary[2] = {0xD800, 0xDC00};
834 uint32_t sorder = 0;
835 UCollationElements *iter ;/*= ucol_openElements(coll, &ch, 1, &status);*/
836 uint32_t temporder = 0;
837
838 UChar rule[256];
839 u_uastrcpy(rule, "&a < ab < c/aba < d < z < ch");
840 coll = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT,
841 UCOL_DEFAULT_STRENGTH,NULL, &status);
842 if(U_SUCCESS(status) && coll) {
843 iter = ucol_openElements(coll, &ch, 1, &status);
844
845 while (ch < 0xFFFF && U_SUCCESS(status)) {
846 int count = 1;
847 uint32_t order;
848 int32_t size = 0;
849
850 ch ++;
851
852 ucol_setText(iter, &ch, 1, &status);
853 order = ucol_previous(iter, &status);
854
855 /* thai management */
856 if (order == 0)
857 order = ucol_previous(iter, &status);
858
859 while (U_SUCCESS(status) &&
860 ucol_previous(iter, &status) != UCOL_NULLORDER) {
861 count ++;
862 }
863
864 size = ucol_getMaxExpansion(iter, order);
865 if (U_FAILURE(status) || size < count) {
866 log_err("Failure at codepoint %d, maximum expansion count < %d\n",
867 ch, count);
868 }
869 }
870
871 /* testing for exact max expansion */
872 ch = 0;
873 while (ch < 0x61) {
874 uint32_t order;
875 int32_t size;
876 ucol_setText(iter, &ch, 1, &status);
877 order = ucol_previous(iter, &status);
878 size = ucol_getMaxExpansion(iter, order);
879 if (U_FAILURE(status) || size != 1) {
880 log_err("Failure at codepoint %d, maximum expansion count < %d\n",
881 ch, 1);
882 }
883 ch ++;
884 }
885
886 ch = 0x63;
887 ucol_setText(iter, &ch, 1, &status);
888 temporder = ucol_previous(iter, &status);
889
890 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) != 3) {
891 log_err("Failure at codepoint %d, maximum expansion count != %d\n",
892 ch, 3);
893 }
894
895 ch = 0x64;
896 ucol_setText(iter, &ch, 1, &status);
897 temporder = ucol_previous(iter, &status);
898
899 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) != 1) {
900 log_err("Failure at codepoint %d, maximum expansion count != %d\n",
901 ch, 3);
902 }
903
904 ucol_setText(iter, supplementary, 2, &status);
905 sorder = ucol_previous(iter, &status);
906
907 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, sorder) != 2) {
908 log_err("Failure at codepoint %d, maximum expansion count < %d\n",
909 ch, 2);
910 }
911
912 /* testing jamo */
913 ch = 0x1165;
914
915 ucol_setText(iter, &ch, 1, &status);
916 temporder = ucol_previous(iter, &status);
917 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) > 3) {
918 log_err("Failure at codepoint %d, maximum expansion count > %d\n",
919 ch, 3);
920 }
921
922 ucol_closeElements(iter);
923 ucol_close(coll);
924
925 /* testing special jamo &a<\u1160 */
926 rule[0] = 0x26;
927 rule[1] = 0x71;
928 rule[2] = 0x3c;
929 rule[3] = 0x1165;
930 rule[4] = 0x2f;
931 rule[5] = 0x71;
932 rule[6] = 0x71;
933 rule[7] = 0x71;
934 rule[8] = 0x71;
935 rule[9] = 0;
936
937 coll = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT,
938 UCOL_DEFAULT_STRENGTH,NULL, &status);
939 iter = ucol_openElements(coll, &ch, 1, &status);
940
941 temporder = ucol_previous(iter, &status);
942 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) != 6) {
943 log_err("Failure at codepoint %d, maximum expansion count > %d\n",
944 ch, 5);
945 }
946
947 ucol_closeElements(iter);
948 ucol_close(coll);
949 } else {
950 log_data_err("Couldn't open collator\n");
951 }
952
953}
954
955/**
956 * Return an integer array containing all of the collation orders
957 * returned by calls to next on the specified iterator
958 */
959static int32_t* getOrders(UCollationElements *iter, int32_t *orderLength)
960{
961 UErrorCode status;
962 int32_t order;
963 int32_t maxSize = 100;
964 int32_t size = 0;
965 int32_t *temp;
966 int32_t *orders =(int32_t*)malloc(sizeof(int32_t) * maxSize);
967 status= U_ZERO_ERROR;
968
969
970 while ((order=ucol_next(iter, &status)) != UCOL_NULLORDER)
971 {
972 if (size == maxSize)
973 {
974 maxSize *= 2;
975 temp = (int32_t*)malloc(sizeof(int32_t) * maxSize);
976
977 memcpy(temp, orders, size * sizeof(int32_t));
978 free(orders);
979 orders = temp;
980
981 }
982
983 orders[size++] = order;
984 }
985
986 if (maxSize > size)
987 {
988 if (size == 0) {
989 size = 1;
990 temp = (int32_t*)malloc(sizeof(int32_t) * size);
991 temp[0] = 0;
992 }
993 else {
994 temp = (int32_t*)malloc(sizeof(int32_t) * size);
995 memcpy(temp, orders, size * sizeof(int32_t));
996 }
997
998 free(orders);
999 orders = temp;
1000 }
1001
1002 *orderLength = size;
1003 return orders;
1004}
1005
1006
1007static void assertEqual(UCollationElements *i1, UCollationElements *i2)
1008{
1009 int32_t c1, c2;
1010 int32_t count = 0;
1011 UErrorCode status = U_ZERO_ERROR;
1012
1013 do
1014 {
1015 c1 = ucol_next(i1, &status);
1016 c2 = ucol_next(i2, &status);
1017
1018 if (c1 != c2)
1019 {
1020 log_err("Error in iteration %d assetEqual between\n %d and %d, they are not equal\n", count, c1, c2);
1021 break;
1022 }
1023
1024 count += 1;
1025 }
1026 while (c1 != UCOL_NULLORDER);
1027}
1028
1029/**
1030 * Testing iterators with extremely small buffers
1031 */
1032static void TestSmallBuffer()
1033{
1034 UErrorCode status = U_ZERO_ERROR;
1035 UCollator *coll;
1036 UCollationElements *testiter,
1037 *iter;
1038 int32_t count = 0;
1039 int32_t *testorders,
1040 *orders;
1041
1042 UChar teststr[500];
1043 UChar str[] = {0x300, 0x31A, 0};
1044 /*
1045 creating a long string of decomposable characters,
1046 since by default the writable buffer is of size 256
1047 */
1048 while (count < 500) {
1049 if ((count & 1) == 0) {
1050 teststr[count ++] = 0x300;
1051 }
1052 else {
1053 teststr[count ++] = 0x31A;
1054 }
1055 }
1056
1057 coll = ucol_open("th_TH", &status);
1058 if(U_SUCCESS(status) && coll) {
1059 testiter = ucol_openElements(coll, teststr, 500, &status);
1060 iter = ucol_openElements(coll, str, 2, &status);
1061
1062 orders = getOrders(iter, &count);
1063 if (count != 2) {
1064 log_err("Error collation elements size is not 2 for \\u0300\\u031A\n");
1065 }
1066
1067 /*
1068 this will rearrange the string data to 250 characters of 0x300 first then
1069 250 characters of 0x031A
1070 */
1071 testorders = getOrders(testiter, &count);
1072
1073 if (count != 500) {
1074 log_err("Error decomposition does not give the right sized collation elements\n");
1075 }
1076
1077 while (count != 0) {
1078 /* UCA collation element for 0x0F76 */
1079 if ((count > 250 && testorders[-- count] != orders[1]) ||
1080 (count <= 250 && testorders[-- count] != orders[0])) {
1081 log_err("Error decomposition does not give the right collation element at %d count\n", count);
1082 break;
1083 }
1084 }
1085
1086 free(testorders);
1087 free(orders);
1088
1089 ucol_reset(testiter);
1090 /* ensures that the writable buffer was cleared */
1091 if (testiter->iteratordata_.writableBuffer !=
1092 testiter->iteratordata_.stackWritableBuffer) {
1093 log_err("Error Writable buffer in collation element iterator not reset\n");
1094 }
1095
1096 /* ensures closing of elements done properly to clear writable buffer */
1097 ucol_next(testiter, &status);
1098 ucol_next(testiter, &status);
1099 ucol_closeElements(testiter);
1100 ucol_closeElements(iter);
1101 ucol_close(coll);
1102 } else {
1103 log_data_err("Couldn't open collator\n");
1104 }
1105}
1106
/**
 * Snippet of code from genuca.
 * Converts one hexadecimal digit character to its numeric value.
 * @param hex character '0'-'9', 'a'-'f' or 'A'-'F'
 * @return value 0..15 of the digit; 0 for any other character
 */
static int32_t hex2num(char hex) {
    int32_t value = 0;

    if ('0' <= hex && hex <= '9') {
        value = hex - '0';
    } else if ('a' <= hex && hex <= 'f') {
        value = 10 + (hex - 'a');
    } else if ('A' <= hex && hex <= 'F') {
        value = 10 + (hex - 'A');
    }

    return value;
}
1121
1122/**
1123* Getting codepoints from a string
1124* @param str character string contain codepoints seperated by space and ended
1125* by a semicolon
1126* @param codepoints array for storage, assuming size > 5
1127* @return position at the end of the codepoint section
1128*/
1129static char * getCodePoints(char *str, UChar *codepoints) {
1130 char *pStartCP = str;
1131 char *pEndCP = str + 4;
1132
1133 *codepoints = (UChar)((hex2num(*pStartCP) << 12) |
1134 (hex2num(*(pStartCP + 1)) << 8) |
1135 (hex2num(*(pStartCP + 2)) << 4) |
1136 (hex2num(*(pStartCP + 3))));
1137 codepoints ++;
1138 while (*pEndCP != ';') {
1139 pStartCP = pEndCP + 1;
1140 *codepoints = (UChar)((hex2num(*pStartCP) << 12) |
1141 (hex2num(*(pStartCP + 1)) << 8) |
1142 (hex2num(*(pStartCP + 2)) << 4) |
1143 (hex2num(*(pStartCP + 3))));
1144 codepoints ++;
1145 pEndCP = pStartCP + 4;
1146 }
1147 *codepoints = 0;
1148 return pEndCP + 1;
1149}
1150
1151/**
1152* Sniplets of code from genuca
1153*/
1154static int32_t
1155readElement(char **from, char *to, char separator, UErrorCode *status)
1156{
1157 if (U_SUCCESS(*status)) {
1158 char buffer[1024];
1159 int32_t i = 0;
1160 while (**from != separator) {
1161 if (**from != ' ') {
1162 *(buffer+i++) = **from;
1163 }
1164 (*from)++;
1165 }
1166 (*from)++;
1167 *(buffer + i) = 0;
1168 strcpy(to, buffer);
1169 return i/2;
1170 }
1171
1172 return 0;
1173}
1174
1175/**
1176* Sniplets of code from genuca
1177*/
1178static uint32_t
1179getSingleCEValue(char *primary, char *secondary, char *tertiary,
1180 UErrorCode *status)
1181{
1182 if (U_SUCCESS(*status)) {
1183 uint32_t value = 0;
1184 char primsave = '\0';
1185 char secsave = '\0';
1186 char tersave = '\0';
1187 char *primend = primary+4;
1188 char *secend = secondary+2;
1189 char *terend = tertiary+2;
1190 uint32_t primvalue;
1191 uint32_t secvalue;
1192 uint32_t tervalue;
1193
1194 if (uprv_strlen(primary) > 4) {
1195 primsave = *primend;
1196 *primend = '\0';
1197 }
1198
1199 if (uprv_strlen(secondary) > 2) {
1200 secsave = *secend;
1201 *secend = '\0';
1202 }
1203
1204 if (uprv_strlen(tertiary) > 2) {
1205 tersave = *terend;
1206 *terend = '\0';
1207 }
1208
1209 primvalue = (*primary!='\0')?uprv_strtoul(primary, &primend, 16):0;
1210 secvalue = (*secondary!='\0')?uprv_strtoul(secondary, &secend, 16):0;
1211 tervalue = (*tertiary!='\0')?uprv_strtoul(tertiary, &terend, 16):0;
1212 if(primvalue <= 0xFF) {
1213 primvalue <<= 8;
1214 }
1215
1216 value = ((primvalue << UCOL_PRIMARYORDERSHIFT) & UCOL_PRIMARYORDERMASK)
1217 | ((secvalue << UCOL_SECONDARYORDERSHIFT) & UCOL_SECONDARYORDERMASK)
1218 | (tervalue & UCOL_TERTIARYORDERMASK);
1219
1220 if(primsave!='\0') {
1221 *primend = primsave;
1222 }
1223 if(secsave!='\0') {
1224 *secend = secsave;
1225 }
1226 if(tersave!='\0') {
1227 *terend = tersave;
1228 }
1229 return value;
1230 }
1231 return 0;
1232}
1233
1234/**
1235* Getting collation elements generated from a string
1236* @param str character string contain collation elements contained in [] and
1237* seperated by space
1238* @param ce array for storage, assuming size > 20
1239* @param status error status
1240* @return position at the end of the codepoint section
1241*/
1242static char * getCEs(char *str, uint32_t *ces, UErrorCode *status) {
1243 char *pStartCP = uprv_strchr(str, '[');
1244 int count = 0;
1245 char *pEndCP;
1246 char primary[100];
1247 char secondary[100];
1248 char tertiary[100];
1249
1250 while (*pStartCP == '[') {
1251 uint32_t primarycount = 0;
1252 uint32_t secondarycount = 0;
1253 uint32_t tertiarycount = 0;
1254 uint32_t CEi = 1;
1255 pEndCP = strchr(pStartCP, ']');
1256 if(pEndCP == NULL) {
1257 break;
1258 }
1259 pStartCP ++;
1260
1261 primarycount = readElement(&pStartCP, primary, ',', status);
1262 secondarycount = readElement(&pStartCP, secondary, ',', status);
1263 tertiarycount = readElement(&pStartCP, tertiary, ']', status);
1264
1265 /* I want to get the CEs entered right here, including continuation */
1266 ces[count ++] = getSingleCEValue(primary, secondary, tertiary, status);
1267 if (U_FAILURE(*status)) {
1268 break;
1269 }
1270
1271 while (2 * CEi < primarycount || CEi < secondarycount ||
1272 CEi < tertiarycount) {
1273 uint32_t value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
1274 if (2 * CEi < primarycount) {
1275 value |= ((hex2num(*(primary + 4 * CEi)) & 0xF) << 28);
1276 value |= ((hex2num(*(primary + 4 * CEi + 1)) & 0xF) << 24);
1277 }
1278
1279 if (2 * CEi + 1 < primarycount) {
1280 value |= ((hex2num(*(primary + 4 * CEi + 2)) & 0xF) << 20);
1281 value |= ((hex2num(*(primary + 4 * CEi + 3)) &0xF) << 16);
1282 }
1283
1284 if (CEi < secondarycount) {
1285 value |= ((hex2num(*(secondary + 2 * CEi)) & 0xF) << 12);
1286 value |= ((hex2num(*(secondary + 2 * CEi + 1)) & 0xF) << 8);
1287 }
1288
1289 if (CEi < tertiarycount) {
1290 value |= ((hex2num(*(tertiary + 2 * CEi)) & 0x3) << 4);
1291 value |= (hex2num(*(tertiary + 2 * CEi + 1)) & 0xF);
1292 }
1293
1294 CEi ++;
1295 ces[count ++] = value;
1296 }
1297
1298 pStartCP = pEndCP + 1;
1299 }
1300 ces[count] = 0;
1301 return pStartCP;
1302}
1303
1304/**
1305* Getting the FractionalUCA.txt file stream
1306*/
1307static FileStream * getFractionalUCA(void)
1308{
1309 char newPath[256];
1310 char backupPath[256];
1311 FileStream *result = NULL;
1312
1313 /* Look inside ICU_DATA first */
1314 uprv_strcpy(newPath, u_getDataDirectory());
1315 uprv_strcat(newPath, "unidata" U_FILE_SEP_STRING );
1316 uprv_strcat(newPath, "FractionalUCA.txt");
1317
1318 /* As a fallback, try to guess where the source data was located
1319 * at the time ICU was built, and look there.
1320 */
1321#if defined (U_TOPSRCDIR)
1322 strcpy(backupPath, U_TOPSRCDIR U_FILE_SEP_STRING "data");
1323#else
1324 {
1325 UErrorCode errorCode = U_ZERO_ERROR;
1326 strcpy(backupPath, loadTestData(&errorCode));
1327 strcat(backupPath, U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING "data");
1328 }
1329#endif
1330 strcat(backupPath, U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING "FractionalUCA.txt");
1331
1332 result = T_FileStream_open(newPath, "rb");
1333
1334 if (result == NULL) {
1335 result = T_FileStream_open(backupPath, "rb");
1336 if (result == NULL) {
1337 log_err("Failed to open either %s or %s\n", newPath, backupPath);
1338 }
1339 }
1340 return result;
1341}
1342
1343/**
1344* Testing the CEs returned by the iterator
1345*/
1346static void TestCEs() {
1347 FileStream *file = NULL;
1348 char line[1024];
1349 char *str;
1350 UChar codepoints[5];
1351 uint32_t ces[20];
1352 UErrorCode status = U_ZERO_ERROR;
1353 UCollator *coll = ucol_open("", &status);
1354 uint32_t lineNo = 0;
1355
1356 if (U_FAILURE(status)) {
1357 log_err("Error in opening root collator\n");
1358 return;
1359 }
1360
1361 file = getFractionalUCA();
1362
1363 if (file == NULL) {
1364 log_err("*** unable to open input FractionalUCA.txt file ***\n");
1365 return;
1366 }
1367
1368
1369 while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) {
1370 int count = 0;
1371 UCollationElements *iter;
1372 lineNo++;
1373 /* skip this line if it is empty or a comment or is a return value
1374 or start of some variable section */
1375 if(line[0] == 0 || line[0] == '#' || line[0] == '\n' ||
1376 line[0] == 0x000D || line[0] == '[') {
1377 continue;
1378 }
1379
1380 str = getCodePoints(line, codepoints);
1381
1382 /* these are 'fake' codepoints in the fractional UCA, and are used just
1383 * for positioning of indirect values. They should not go through this
1384 * test.
1385 */
1386 if(*codepoints == 0xFDD0) {
1387 continue;
1388 }
1389
1390 getCEs(str, ces, &status);
1391 if (U_FAILURE(status)) {
1392 log_err("Error in parsing collation elements in FractionalUCA.txt\n");
1393 break;
1394 }
1395 iter = ucol_openElements(coll, codepoints, -1, &status);
1396 if (U_FAILURE(status)) {
1397 log_err("Error in opening collation elements\n");
1398 break;
1399 }
1400 for (;;) {
1401 uint32_t ce = (uint32_t)ucol_next(iter, &status);
1402 if (ce == 0xFFFFFFFF) {
1403 ce = 0;
1404 }
1405 /* we now unconditionally reorder Thai/Lao prevowels, so this
1406 * test would fail if we don't skip here.
1407 */
1408 if(UCOL_ISTHAIPREVOWEL(*codepoints) && ce == 0 && count == 0) {
1409 continue;
1410 }
1411 if (ce != ces[count] || U_FAILURE(status)) {
1412 log_err("Collation elements in FractionalUCA.txt and iterators do not match!\n");
1413 break;
1414 }
1415 if (ces[count] == 0) {
1416 break;
1417 }
1418 count ++;
1419 }
1420 ucol_closeElements(iter);
1421 }
1422
1423 T_FileStream_close(file);
1424 ucol_close(coll);
1425}
1426
1427/**
1428* Testing the discontigous contractions
1429*/
1430static void TestDiscontiguos() {
1431 const char *rulestr =
1432 "&z < AB < X\\u0300 < ABC < X\\u0300\\u0315";
1433 UChar rule[50];
1434 int rulelen = u_unescape(rulestr, rule, 50);
1435 const char *src[] = {
1436 "ADB", "ADBC", "A\\u0315B", "A\\u0315BC",
1437 /* base character blocked */
1438 "XD\\u0300", "XD\\u0300\\u0315",
1439 /* non blocking combining character */
1440 "X\\u0319\\u0300", "X\\u0319\\u0300\\u0315",
1441 /* blocking combining character */
1442 "X\\u0314\\u0300", "X\\u0314\\u0300\\u0315",
1443 /* contraction prefix */
1444 "ABDC", "AB\\u0315C","X\\u0300D\\u0315", "X\\u0300\\u0319\\u0315",
1445 "X\\u0300\\u031A\\u0315",
1446 /* ends not with a contraction character */
1447 "X\\u0319\\u0300D", "X\\u0319\\u0300\\u0315D", "X\\u0300D\\u0315D",
1448 "X\\u0300\\u0319\\u0315D", "X\\u0300\\u031A\\u0315D"
1449 };
1450 const char *tgt[] = {
1451 /* non blocking combining character */
1452 "A D B", "A D BC", "A \\u0315 B", "A \\u0315 BC",
1453 /* base character blocked */
1454 "X D \\u0300", "X D \\u0300\\u0315",
1455 /* non blocking combining character */
1456 "X\\u0300 \\u0319", "X\\u0300\\u0315 \\u0319",
1457 /* blocking combining character */
1458 "X \\u0314 \\u0300", "X \\u0314 \\u0300\\u0315",
1459 /* contraction prefix */
1460 "AB DC", "AB \\u0315 C","X\\u0300 D \\u0315", "X\\u0300\\u0315 \\u0319",
1461 "X\\u0300 \\u031A \\u0315",
1462 /* ends not with a contraction character */
1463 "X\\u0300 \\u0319D", "X\\u0300\\u0315 \\u0319D", "X\\u0300 D\\u0315D",
1464 "X\\u0300\\u0315 \\u0319D", "X\\u0300 \\u031A\\u0315D"
1465 };
1466 int size = 20;
1467 UCollator *coll;
1468 UErrorCode status = U_ZERO_ERROR;
1469 int count = 0;
1470 UCollationElements *iter;
1471 UCollationElements *resultiter;
1472
1473 coll = ucol_openRules(rule, rulelen, UCOL_OFF, UCOL_DEFAULT_STRENGTH,NULL, &status);
1474 iter = ucol_openElements(coll, rule, 1, &status);
1475 resultiter = ucol_openElements(coll, rule, 1, &status);
1476
1477 if (U_FAILURE(status)) {
1478 log_err("Error opening collation rules\n");
1479 return;
1480 }
1481
1482 while (count < size) {
1483 UChar str[20];
1484 UChar tstr[20];
1485 int strLen = u_unescape(src[count], str, 20);
1486 UChar *s;
1487
1488 ucol_setText(iter, str, strLen, &status);
1489 if (U_FAILURE(status)) {
1490 log_err("Error opening collation iterator\n");
1491 return;
1492 }
1493
1494 u_unescape(tgt[count], tstr, 20);
1495 s = tstr;
1496
1497 log_verbose("count %d\n", count);
1498
1499 for (;;) {
1500 uint32_t ce;
1501 UChar *e = u_strchr(s, 0x20);
1502 if (e == 0) {
1503 e = u_strchr(s, 0);
1504 }
1505 ucol_setText(resultiter, s, (int32_t)(e - s), &status);
1506 ce = ucol_next(resultiter, &status);
1507 if (U_FAILURE(status)) {
1508 log_err("Error manipulating collation iterator\n");
1509 return;
1510 }
1511 while (ce != UCOL_NULLORDER) {
1512 if (ce != (uint32_t)ucol_next(iter, &status) ||
1513 U_FAILURE(status)) {
1514 log_err("Discontiguos contraction test mismatch\n");
1515 return;
1516 }
1517 ce = ucol_next(resultiter, &status);
1518 if (U_FAILURE(status)) {
1519 log_err("Error getting next collation element\n");
1520 return;
1521 }
1522 }
1523 s = e + 1;
1524 if (*e == 0) {
1525 break;
1526 }
1527 }
1528 ucol_reset(iter);
1529 backAndForth(iter);
1530 count ++;
1531 }
1532 ucol_closeElements(resultiter);
1533 ucol_closeElements(iter);
1534 ucol_close(coll);
1535}
1536
1537static void TestCEBufferOverflow()
1538{
1539 UChar str[UCOL_EXPAND_CE_BUFFER_SIZE + 1];
1540 UErrorCode status = U_ZERO_ERROR;
1541 UChar rule[10];
1542 UCollator *coll;
1543 UCollationElements *iter;
1544
1545 u_uastrcpy(rule, "&z < AB");
1546 coll = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL,&status);
1547 if (U_FAILURE(status)) {
1548 log_err("Rule based collator not created for testing ce buffer overflow\n");
1549 return;
1550 }
1551
1552 /* 0xDCDC is a trail surrogate hence deemed unsafe by the heuristic
1553 test. this will cause an overflow in getPrev */
1554 str[0] = 0x0041; /* 'A' */
1555 /*uprv_memset(str + 1, 0xE0, sizeof(UChar) * UCOL_EXPAND_CE_BUFFER_SIZE);*/
1556 uprv_memset(str + 1, 0xDC, sizeof(UChar) * UCOL_EXPAND_CE_BUFFER_SIZE);
1557 str[UCOL_EXPAND_CE_BUFFER_SIZE] = 0x0042; /* 'B' */
1558 iter = ucol_openElements(coll, str, UCOL_EXPAND_CE_BUFFER_SIZE + 1,
1559 &status);
1560 if (ucol_previous(iter, &status) != UCOL_NULLORDER ||
1561 status != U_BUFFER_OVERFLOW_ERROR) {
1562 log_err("CE buffer expected to overflow with long string of trail surrogates\n");
1563 }
1564 ucol_closeElements(iter);
1565 ucol_close(coll);
1566}
1567
1568/**
1569* Byte bounds checks. Checks if each byte in data is between upper and lower
1570* inclusive.
1571*/
1572static UBool checkByteBounds(uint32_t data, char upper, char lower)
1573{
1574 int count = 4;
1575 while (count > 0) {
1576 char b = (char)(data & 0xFF);
1577 if (b > upper || b < lower) {
1578 return FALSE;
1579 }
1580 data = data >> 8;
1581 count --;
1582 }
1583 return TRUE;
1584}
1585
/**
 * Determines case of the string of codepoints.
 * If it is a multiple codepoints it has to treated as a contraction.
 * NOTE: this function is compiled out (#if 0) and kept for reference only.
 * @param s code units to inspect
 * @param len number of code units in s
 * @return UCOL_LOWER_CASE, UCOL_UPPER_CASE or UCOL_MIXED_CASE
 */
#if 0
static uint8_t getCase(const UChar *s, uint32_t len) {
    UBool lower = FALSE;
    UBool upper = FALSE;
    UBool title = FALSE;
    uint32_t i;

    if (len == 0) {
        return UCOL_LOWER_CASE;
    }

    /* len is left untouched by the scan because the mixed case decision
       below depends on the length of the string */
    for (i = 0; i < len; i++) {
        UChar c = s[i];

        if (u_islower(c)) {
            lower = TRUE;
        }
        if (u_isupper(c)) {
            upper = TRUE;
        }
        if (u_istitle(c)) {
            title = TRUE;
        }
    }

    /* nothing but lower case, or no cased character at all */
    if (!upper && !title) {
        return UCOL_LOWER_CASE;
    }
    if (upper && !lower && !title) {
        return UCOL_UPPER_CASE;
    }

    /* mix of cases here */
    if ((title && len >= 2) || (lower && upper)) {
        return UCOL_MIXED_CASE;
    }
    if (u_isupper(s[0])) {
        return UCOL_UPPER_CASE;
    }
    return UCOL_LOWER_CASE;
}
#endif
1640
1641/**
1642* Checking collation element validity given the boundary arguments.
1643*/
1644static UBool checkCEValidity(const UCollator *coll, const UChar *codepoints,
1645 int length, uint32_t primarymax,
1646 uint32_t secondarymax)
1647{
1648 UErrorCode status = U_ZERO_ERROR;
1649 UCollationElements *iter = ucol_openElements(coll, codepoints, length,
1650 &status);
1651 uint32_t ce;
1652 UBool first = TRUE;
1653/*
1654 UBool upper = FALSE;
1655 UBool lower = FALSE;
1656*/
1657
1658 if (U_FAILURE(status)) {
1659 log_err("Error creating iterator for testing validity\n");
1660 }
1661
1662 ce = ucol_next(iter, &status);
1663
1664 while (ce != UCOL_NULLORDER) {
1665 if (ce != 0) {
1666 uint32_t primary = UCOL_PRIMARYORDER(ce);
1667 uint32_t secondary = UCOL_SECONDARYORDER(ce);
1668 uint32_t tertiary = UCOL_TERTIARYORDER(ce);
1669/* uint32_t scasebits = tertiary & 0xC0;*/
1670
1671 if ((tertiary == 0 && secondary != 0) ||
1672 (tertiary < 0xC0 && secondary == 0 && primary != 0)) {
1673 /* n-1th level is not zero when the nth level is
1674 except for continuations, this is wrong */
1675 log_err("Lower level weight not 0 when high level weight is 0\n");
1676 goto fail;
1677 }
1678 else {
1679 /* checks if any byte is illegal ie = 01 02 03. */
1680 if (checkByteBounds(ce, 0x3, 0x1)) {
1681 log_err("Byte range in CE lies in illegal bounds 0x1 - 0x3\n");
1682 goto fail;
1683 }
1684 }
1685 if ((primary != 0 && primary < primarymax) || (primary >= 0xFF00 && !isContinuation(ce))) {
1686 log_err("UCA primary weight out of bounds\n");
1687 goto fail;
1688 }
1689 /* case matching not done since data generated by ken */
1690 if (first) {
1691 if (secondary >= 6 && secondary <= secondarymax) {
1692 log_err("Secondary weight out of range\n");
1693 goto fail;
1694 }
1695 first = FALSE;
1696 }
1697 }
1698 ce = ucol_next(iter, &status);
1699 }
1700 ucol_closeElements(iter);
1701 return TRUE;
1702fail :
1703 ucol_closeElements(iter);
1704 return FALSE;
1705}
1706
1707static void TestCEValidity()
1708{
1709 /* testing UCA collation elements */
1710 UErrorCode status = U_ZERO_ERROR;
1711 /* en_US has no tailorings */
1712 UCollator *coll = ucol_open("en_US", &status);
1713 /* tailored locales */
1714 char locale[][6] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN"};
1715 FileStream *file = getFractionalUCA();
1716 char line[1024];
1717 UChar codepoints[10];
1718 int count = 0;
1719 UParseError parseError;
1720 if (U_FAILURE(status)) {
1721 log_err("en_US collator creation failed\n");
1722 return;
1723 }
1724 log_verbose("Testing UCA elements\n");
1725 if (file == NULL) {
1726 log_err("Fractional UCA data can not be opened\n");
1727 return;
1728 }
1729
1730 while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) {
1731 if(line[0] == 0 || line[0] == '#' || line[0] == '\n' ||
1732 line[0] == 0x000D || line[0] == '[') {
1733 continue;
1734 }
1735
1736 getCodePoints(line, codepoints);
1737 checkCEValidity(coll, codepoints, u_strlen(codepoints), 5, 86);
1738 }
1739
1740 log_verbose("Testing UCA elements for the whole range of unicode characters\n");
1741 codepoints[0] = 0;
1742 while (codepoints[0] < 0xFFFF) {
1743 if (u_isdefined((UChar32)codepoints[0])) {
1744 checkCEValidity(coll, codepoints, 1, 5, 86);
1745 }
1746 codepoints[0] ++;
1747 }
1748
1749 ucol_close(coll);
1750
1751 /* testing tailored collation elements */
1752 log_verbose("Testing tailored elements\n");
1753 while (count < 5) {
1754 const UChar *rules = NULL,
1755 *current = NULL;
1756 UChar *rulesCopy = NULL;
1757 int32_t ruleLen = 0;
1758
1759 uint32_t chOffset = 0;
1760 uint32_t chLen = 0;
1761 uint32_t exOffset = 0;
1762 uint32_t exLen = 0;
1763 uint32_t prefixOffset = 0;
1764 uint32_t prefixLen = 0;
1765 UBool startOfRules = TRUE;
1766 UColOptionSet opts;
1767
1768 UColTokenParser src;
1769 uint32_t strength = 0;
1770 uint16_t specs = 0;
1771
1772 coll = ucol_open(locale[count], &status);
1773 if (U_FAILURE(status)) {
1774 log_err("%s collator creation failed\n", locale[count]);
1775 return;
1776 }
1777
1778 src.opts = &opts;
1779 rules = ucol_getRules(coll, &ruleLen);
1780
1781 if (ruleLen > 0) {
1782 rulesCopy = (UChar *)malloc((ruleLen +
1783 UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar));
1784 uprv_memcpy(rulesCopy, rules, ruleLen * sizeof(UChar));
1785 src.current = src.source = rulesCopy;
1786 src.end = rulesCopy + ruleLen;
1787 src.extraCurrent = src.end;
1788 src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
1789
1790 while ((current = ucol_tok_parseNextToken(&src, startOfRules, &parseError,&status)) != NULL) {
1791 strength = src.parsedToken.strength;
1792 chOffset = src.parsedToken.charsOffset;
1793 chLen = src.parsedToken.charsLen;
1794 exOffset = src.parsedToken.extensionOffset;
1795 exLen = src.parsedToken.extensionLen;
1796 prefixOffset = src.parsedToken.prefixOffset;
1797 prefixLen = src.parsedToken.prefixLen;
1798 specs = src.parsedToken.flags;
1799
1800 startOfRules = FALSE;
1801 uprv_memcpy(codepoints, src.source + chOffset,
1802 chLen * sizeof(UChar));
1803 codepoints[chLen] = 0;
1804 checkCEValidity(coll, codepoints, chLen, 4, 85);
1805 }
1806 free(rulesCopy);
1807 }
1808
1809 ucol_close(coll);
1810 count ++;
1811 }
1812 T_FileStream_close(file);
1813}
1814
1815static void printSortKeyError(const UChar *codepoints, int length,
1816 uint8_t *sortkey, int sklen)
1817{
1818 int count = 0;
1819 log_err("Sortkey not valid for ");
1820 while (length > 0) {
1821 log_err("0x%04x ", *codepoints);
1822 length --;
1823 codepoints ++;
1824 }
1825 log_err("\nSortkey : ");
1826 while (count < sklen) {
1827 log_err("0x%02x ", sortkey[count]);
1828 count ++;
1829 }
1830 log_err("\n");
1831}
1832
1833/**
1834* Checking sort key validity for all levels
1835*/
1836static UBool checkSortKeyValidity(UCollator *coll,
1837 const UChar *codepoints,
1838 int length)
1839{
1840 UErrorCode status = U_ZERO_ERROR;
1841 UCollationStrength strength[5] = {UCOL_PRIMARY, UCOL_SECONDARY,
1842 UCOL_TERTIARY, UCOL_QUATERNARY,
1843 UCOL_IDENTICAL};
1844 int strengthlen = 5;
1845 int index = 0;
1846 int caselevel = 0;
1847
1848 while (caselevel < 1) {
1849 if (caselevel == 0) {
1850 ucol_setAttribute(coll, UCOL_CASE_LEVEL, UCOL_OFF, &status);
1851 }
1852 else {
1853 ucol_setAttribute(coll, UCOL_CASE_LEVEL, UCOL_ON, &status);
1854 }
1855
1856 while (index < strengthlen) {
1857 int count01 = 0;
1858 uint32_t count = 0;
1859 uint8_t sortkey[128];
1860 uint32_t sklen;
1861
1862 ucol_setStrength(coll, strength[index]);
1863 sklen = ucol_getSortKey(coll, codepoints, length, sortkey, 128);
1864 while (sortkey[count] != 0) {
1865 if (sortkey[count] == 2 || (sortkey[count] == 3 && count01 > 0 && index != 4)) {
1866 printSortKeyError(codepoints, length, sortkey, sklen);
1867 return FALSE;
1868 }
1869 if (sortkey[count] == 1) {
1870 count01 ++;
1871 }
1872 count ++;
1873 }
1874
1875 if (count + 1 != sklen || (count01 != index + caselevel)) {
1876 printSortKeyError(codepoints, length, sortkey, sklen);
1877 return FALSE;
1878 }
1879 index ++;
1880 }
1881 caselevel ++;
1882 }
1883 return TRUE;
1884}
1885
1886static void TestSortKeyValidity(void)
1887{
1888 /* testing UCA collation elements */
1889 UErrorCode status = U_ZERO_ERROR;
1890 /* en_US has no tailorings */
1891 UCollator *coll = ucol_open("en_US", &status);
1892 /* tailored locales */
1893 char locale[][6] = {"fr_FR\0", "ko_KR\0", "sh_YU\0", "th_TH\0", "zh_CN\0"};
1894 FileStream *file = getFractionalUCA();
1895 char line[1024];
1896 UChar codepoints[10];
1897 int count = 0;
1898 UParseError parseError;
1899 if (U_FAILURE(status)) {
1900 log_err("en_US collator creation failed\n");
1901 return;
1902 }
1903 log_verbose("Testing UCA elements\n");
1904 if (file == NULL) {
1905 log_err("Fractional UCA data can not be opened\n");
1906 return;
1907 }
1908
1909 while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) {
1910 if(line[0] == 0 || line[0] == '#' || line[0] == '\n' ||
1911 line[0] == 0x000D || line[0] == '[') {
1912 continue;
1913 }
1914
1915 getCodePoints(line, codepoints);
1916 checkSortKeyValidity(coll, codepoints, u_strlen(codepoints));
1917 }
1918
1919 log_verbose("Testing UCA elements for the whole range of unicode characters\n");
1920 codepoints[0] = 0;
1921
1922 while (codepoints[0] < 0xFFFF) {
1923 if (u_isdefined((UChar32)codepoints[0])) {
1924 checkSortKeyValidity(coll, codepoints, 1);
1925 }
1926 codepoints[0] ++;
1927 }
1928
1929 ucol_close(coll);
1930
1931 /* testing tailored collation elements */
1932 log_verbose("Testing tailored elements\n");
1933 while (count < 5) {
1934 const UChar *rules = NULL,
1935 *current = NULL;
1936 UChar *rulesCopy = NULL;
1937 int32_t ruleLen = 0;
1938
1939 uint32_t chOffset = 0;
1940 uint32_t chLen = 0;
1941 uint32_t exOffset = 0;
1942 uint32_t exLen = 0;
1943 uint32_t prefixOffset = 0;
1944 uint32_t prefixLen = 0;
1945 UBool startOfRules = TRUE;
1946 UColOptionSet opts;
1947
1948 UColTokenParser src;
1949 uint32_t strength = 0;
1950 uint16_t specs = 0;
1951
1952 coll = ucol_open(locale[count], &status);
1953 if (U_FAILURE(status)) {
1954 log_err("%s collator creation failed\n", locale[count]);
1955 return;
1956 }
1957
1958 src.opts = &opts;
1959 rules = ucol_getRules(coll, &ruleLen);
1960
1961 if (ruleLen > 0) {
1962 rulesCopy = (UChar *)malloc((ruleLen +
1963 UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar));
1964 uprv_memcpy(rulesCopy, rules, ruleLen * sizeof(UChar));
1965 src.current = src.source = rulesCopy;
1966 src.end = rulesCopy + ruleLen;
1967 src.extraCurrent = src.end;
1968 src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
1969
1970 while ((current = ucol_tok_parseNextToken(&src, startOfRules,&parseError, &status)) != NULL) {
1971 strength = src.parsedToken.strength;
1972 chOffset = src.parsedToken.charsOffset;
1973 chLen = src.parsedToken.charsLen;
1974 exOffset = src.parsedToken.extensionOffset;
1975 exLen = src.parsedToken.extensionLen;
1976 prefixOffset = src.parsedToken.prefixOffset;
1977 prefixLen = src.parsedToken.prefixLen;
1978 specs = src.parsedToken.flags;
1979
1980 startOfRules = FALSE;
1981 uprv_memcpy(codepoints, src.source + chOffset,
1982 chLen * sizeof(UChar));
1983 codepoints[chLen] = 0;
1984 checkSortKeyValidity(coll, codepoints, chLen);
1985 }
1986 free(rulesCopy);
1987 }
1988
1989 ucol_close(coll);
1990 count ++;
1991 }
1992 T_FileStream_close(file);
1993}
1994
1995#endif /* #if !UCONFIG_NO_COLLATION */