]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/usettest.cpp
ICU-8.11.2.tar.gz
[apple/icu.git] / icuSources / test / intltest / usettest.cpp
1 /*
2 **************************************************************************************
3 * Copyright (C) 1999-2006 International Business Machines Corporation and
4 * others. All Rights Reserved.
5 **************************************************************************************
6 * Date Name Description
7 * 10/20/99 alan Creation.
8 * 03/22/2000 Madhu Added additional tests
9 **************************************************************************************
10 */
11
12 #include "unicode/utypes.h"
13 #include "usettest.h"
14 #include "unicode/uniset.h"
15 #include "unicode/uchar.h"
16 #include "unicode/usetiter.h"
17 #include "unicode/ustring.h"
18 #include "unicode/parsepos.h"
19 #include "unicode/symtable.h"
20 #include "unicode/uversion.h"
21 #include "hash.h"
22
23
// Reports a test failure (file, line and ICU error name) through errln()
// when `status` holds a failing UErrorCode. Execution of the calling test
// continues afterwards.
#define TEST_ASSERT_SUCCESS(status) { \
    if (U_FAILURE(status)) { \
        errln("fail in file \"%s\", line %d: \"%s\"", \
              __FILE__, __LINE__, u_errorName(status)); \
    } \
}
27
// Reports a test failure (file and line) through errln() when `expr`
// evaluates to false. Execution of the calling test continues afterwards.
#define TEST_ASSERT(expr) { \
    if (!(expr)) { \
        errln("fail in file \"%s\", line %d", __FILE__, __LINE__); \
    } \
}
30
31 UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
32 UnicodeString pat;
33 set.toPattern(pat);
34 return left + UnicodeSetTest::escape(pat);
35 }
36
// Expands to one `case` of the runIndexedTest() switch: publishes the test
// name and, when `exec` is set, logs a header and runs the test method.
// The trailing `break` deliberately has no semicolon; the caller supplies it.
#define CASE(id,test) \
    case id: \
        name = #test; \
        if (exec) { \
            logln(#test "---"); \
            logln((UnicodeString)""); \
            test(); \
        } \
        break
45
46 void
47 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
48 const char* &name, char* /*par*/) {
49 // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
50 switch (index) {
51 CASE(0,TestPatterns);
52 CASE(1,TestAddRemove);
53 CASE(2,TestCategories);
54 CASE(3,TestCloneEqualHash);
55 CASE(4,TestMinimalRep);
56 CASE(5,TestAPI);
57 CASE(6,TestScriptSet);
58 CASE(7,TestPropertySet);
59 CASE(8,TestClone);
60 CASE(9,TestExhaustive);
61 CASE(10,TestToPattern);
62 CASE(11,TestIndexOf);
63 CASE(12,TestStrings);
64 CASE(13,Testj2268);
65 CASE(14,TestCloseOver);
66 CASE(15,TestEscapePattern);
67 CASE(16,TestInvalidCodePoint);
68 CASE(17,TestSymbolTable);
69 CASE(18,TestSurrogate);
70 CASE(19,TestPosixClasses);
71 CASE(20,TestIteration);
72 default: name = ""; break;
73 }
74 }
75
// Marker placed inside the string lists handed to expectToPattern().
// NOTE(review): it appears to separate strings the set must contain
// (before the marker) from strings it must not contain (after it);
// confirm against expectToPattern().
static const char NOT[] = "%%%%";
77
78 /**
79 * UVector was improperly copying contents
80 * This code will crash this is still true
81 */
82 void UnicodeSetTest::Testj2268() {
83 UnicodeSet t;
84 t.add(UnicodeString("abc"));
85 UnicodeSet test(t);
86 UnicodeString ustrPat;
87 test.toPattern(ustrPat, TRUE);
88 }
89
90 /**
91 * Test toPattern().
92 */
93 void UnicodeSetTest::TestToPattern() {
94 UErrorCode ec = U_ZERO_ERROR;
95
96 // Test that toPattern() round trips with syntax characters and
97 // whitespace.
98 {
99 static const char* OTHER_TOPATTERN_TESTS[] = {
100 "[[:latin:]&[:greek:]]",
101 "[[:latin:]-[:greek:]]",
102 "[:nonspacing mark:]",
103 NULL
104 };
105
106 for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
107 ec = U_ZERO_ERROR;
108 UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
109 if (U_FAILURE(ec)) {
110 errln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j]);
111 continue;
112 }
113 checkPat(OTHER_TOPATTERN_TESTS[j], s);
114 }
115
116 for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
117 if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
118
119 // check various combinations to make sure they all work.
120 if (i != 0 && !toPatternAux(i, i)){
121 continue;
122 }
123 if (!toPatternAux(0, i)){
124 continue;
125 }
126 if (!toPatternAux(i, 0xFFFF)){
127 continue;
128 }
129 }
130 }
131 }
132
133 // Test pattern behavior of multicharacter strings.
134 {
135 ec = U_ZERO_ERROR;
136 UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
137
138 // This loop isn't a loop. It's here to make the compiler happy.
139 // If you're curious, try removing it and changing the 'break'
140 // statements (except for the last) to goto's.
141 for (;;) {
142 if (U_FAILURE(ec)) break;
143 const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
144 expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
145
146 s->add("ac");
147 const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
148 expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
149
150 s->applyPattern("[a-z {\\{l} {r\\}}]", ec);
151 if (U_FAILURE(ec)) break;
152 const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
153 expectToPattern(*s, "[a-z{r\\}}{\\{l}]", exp3);
154
155 s->add("[]");
156 const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
157 expectToPattern(*s, "[a-z{\\[\\]}{r\\}}{\\{l}]", exp4);
158
159 s->applyPattern("[a-z {\\u4E01\\u4E02}{\\n\\r}]", ec);
160 if (U_FAILURE(ec)) break;
161 const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
162 expectToPattern(*s, "[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]", exp5);
163
164 // j2189
165 s->clear();
166 s->add(UnicodeString("abc", ""));
167 s->add(UnicodeString("abc", ""));
168 const char* exp6[] = {"abc", NOT, "ab", NULL};
169 expectToPattern(*s, "[{abc}]", exp6);
170
171 break;
172 }
173
174 if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
175 delete s;
176 }
177
178 // JB#3400: For 2 character ranges prefer [ab] to [a-b]
179 UnicodeSet s;
180 s.add((UChar)97, (UChar)98); // 'a', 'b'
181 expectToPattern(s, "[ab]", NULL);
182 }
183
184 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
185
186 // use Integer.toString because Utility.hex doesn't handle ints
187 UnicodeString pat = "";
188 // TODO do these in hex
189 //String source = "0x" + Integer.toString(start,16).toUpperCase();
190 //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
191 UnicodeString source;
192 source = source + (uint32_t)start;
193 if (start != end)
194 source = source + ".." + (uint32_t)end;
195 UnicodeSet testSet;
196 testSet.add(start, end);
197 return checkPat(source, testSet);
198 }
199
200 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
201 const UnicodeSet& testSet) {
202 // What we want to make sure of is that a pattern generated
203 // by toPattern(), with or without escaped unprintables, can
204 // be passed back into the UnicodeSet constructor.
205 UnicodeString pat0;
206
207 testSet.toPattern(pat0, TRUE);
208
209 if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
210
211 //String pat1 = unescapeLeniently(pat0);
212 //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
213
214 UnicodeString pat2;
215 testSet.toPattern(pat2, FALSE);
216 if (!checkPat(source, testSet, pat2)) return FALSE;
217
218 //String pat3 = unescapeLeniently(pat2);
219 // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
220
221 //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
222 logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
223 return TRUE;
224 }
225
226 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
227 const UnicodeSet& testSet,
228 const UnicodeString& pat) {
229 UErrorCode ec = U_ZERO_ERROR;
230 UnicodeSet testSet2(pat, ec);
231 if (testSet2 != testSet) {
232 errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
233 return FALSE;
234 }
235 return TRUE;
236 }
237
238 void
239 UnicodeSetTest::TestPatterns(void) {
240 UnicodeSet set;
241 expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km");
242 expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz");
243 expectPattern(set, UnicodeString("[a\\-z]", ""), "--aazz");
244 expectPattern(set, UnicodeString("[-az]", ""), "--aazz");
245 expectPattern(set, UnicodeString("[az-]", ""), "--aazz");
246 expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
247
248 // Throw in a test of complement
249 set.complement();
250 UnicodeString exp;
251 exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
252 expectPairs(set, exp);
253 }
254
255 void
256 UnicodeSetTest::TestCategories(void) {
257 UErrorCode status = U_ZERO_ERROR;
258 const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
259 UnicodeSet set(pat, status);
260 if (U_FAILURE(status)) {
261 errln((UnicodeString)"Fail: Can't construct set with " + pat);
262 } else {
263 expectContainment(set, pat, "ABC", "abc");
264 }
265
266 UChar32 i;
267 int32_t failures = 0;
268 // Make sure generation of L doesn't pollute cached Lu set
269 // First generate L, then Lu
270 set.applyPattern("[:L:]", status);
271 if (U_FAILURE(status)) { errln("FAIL"); return; }
272 for (i=0; i<0x200; ++i) {
273 UBool l = u_isalpha((UChar)i);
274 if (l != set.contains(i)) {
275 errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
276 set.contains(i));
277 if (++failures == 10) break;
278 }
279 }
280
281 set.applyPattern("[:Lu:]", status);
282 if (U_FAILURE(status)) { errln("FAIL"); return; }
283 for (i=0; i<0x200; ++i) {
284 UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
285 if (lu != set.contains(i)) {
286 errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
287 set.contains(i));
288 if (++failures == 20) break;
289 }
290 }
291 }
292 void
293 UnicodeSetTest::TestCloneEqualHash(void) {
294 UErrorCode status = U_ZERO_ERROR;
295 // set1 and set2 used to be built with the obsolete constructor taking
296 // UCharCategory values; replaced with pattern constructors
297 // markus 20030502
298 UnicodeSet *set1=new UnicodeSet("\\p{Lowercase Letter}", status); // :Ll: Letter, lowercase
299 UnicodeSet *set1a=new UnicodeSet("[:Ll:]", status); // Letter, lowercase
300 if (U_FAILURE(status)){
301 errln((UnicodeString)"FAIL: Can't construst set with category->Ll");
302 return;
303 }
304 UnicodeSet *set2=new UnicodeSet("\\p{Decimal Number}", status); //Number, Decimal digit
305 UnicodeSet *set2a=new UnicodeSet("[:Nd:]", status); //Number, Decimal digit
306 if (U_FAILURE(status)){
307 errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
308 return;
309 }
310
311 if (*set1 != *set1a) {
312 errln("FAIL: category constructor for Ll broken");
313 }
314 if (*set2 != *set2a) {
315 errln("FAIL: category constructor for Nd broken");
316 }
317 delete set1a;
318 delete set2a;
319
320 logln("Testing copy construction");
321 UnicodeSet *set1copy=new UnicodeSet(*set1);
322 if(*set1 != *set1copy || *set1 == *set2 ||
323 getPairs(*set1) != getPairs(*set1copy) ||
324 set1->hashCode() != set1copy->hashCode()){
325 errln("FAIL : Error in copy construction");
326 return;
327 }
328
329 logln("Testing =operator");
330 UnicodeSet set1equal=*set1;
331 UnicodeSet set2equal=*set2;
332 if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
333 set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
334 errln("FAIL: Error in =operator");
335 }
336
337 logln("Testing clone()");
338 UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
339 UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
340 if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
341 *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
342 *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
343 errln("FAIL: Error in clone");
344 }
345
346 logln("Testing hashcode");
347 if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
348 set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
349 set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
350 set1->hashCode() == set2->hashCode() || set1copy->hashCode() == set2->hashCode() ||
351 set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
352 errln("FAIL: Error in hashCode()");
353 }
354
355 delete set1;
356 delete set1copy;
357 delete set2;
358 delete set1clone;
359 delete set2clone;
360
361
362 }
363 void
364 UnicodeSetTest::TestAddRemove(void) {
365 UnicodeSet set; // Construct empty set
366 doAssert(set.isEmpty() == TRUE, "set should be empty");
367 doAssert(set.size() == 0, "size should be 0");
368 set.complement();
369 doAssert(set.size() == 0x110000, "size should be 0x110000");
370 set.clear();
371 set.add(0x0061, 0x007a);
372 expectPairs(set, "az");
373 doAssert(set.isEmpty() == FALSE, "set should not be empty");
374 doAssert(set.size() != 0, "size should not be equal to 0");
375 doAssert(set.size() == 26, "size should be equal to 26");
376 set.remove(0x006d, 0x0070);
377 expectPairs(set, "alqz");
378 doAssert(set.size() == 22, "size should be equal to 22");
379 set.remove(0x0065, 0x0067);
380 expectPairs(set, "adhlqz");
381 doAssert(set.size() == 19, "size should be equal to 19");
382 set.remove(0x0064, 0x0069);
383 expectPairs(set, "acjlqz");
384 doAssert(set.size() == 16, "size should be equal to 16");
385 set.remove(0x0063, 0x0072);
386 expectPairs(set, "absz");
387 doAssert(set.size() == 10, "size should be equal to 10");
388 set.add(0x0066, 0x0071);
389 expectPairs(set, "abfqsz");
390 doAssert(set.size() == 22, "size should be equal to 22");
391 set.remove(0x0061, 0x0067);
392 expectPairs(set, "hqsz");
393 set.remove(0x0061, 0x007a);
394 expectPairs(set, "");
395 doAssert(set.isEmpty() == TRUE, "set should be empty");
396 doAssert(set.size() == 0, "size should be 0");
397 set.add(0x0061);
398 doAssert(set.isEmpty() == FALSE, "set should not be empty");
399 doAssert(set.size() == 1, "size should not be equal to 1");
400 set.add(0x0062);
401 set.add(0x0063);
402 expectPairs(set, "ac");
403 doAssert(set.size() == 3, "size should not be equal to 3");
404 set.add(0x0070);
405 set.add(0x0071);
406 expectPairs(set, "acpq");
407 doAssert(set.size() == 5, "size should not be equal to 5");
408 set.clear();
409 expectPairs(set, "");
410 doAssert(set.isEmpty() == TRUE, "set should be empty");
411 doAssert(set.size() == 0, "size should be 0");
412
413 // Try removing an entire set from another set
414 expectPattern(set, "[c-x]", "cx");
415 UnicodeSet set2;
416 expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
417 set.removeAll(set2);
418 expectPairs(set, "deluxx");
419
420 // Try adding an entire set to another set
421 expectPattern(set, "[jackiemclean]", "aacceein");
422 expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
423 set.addAll(set2);
424 expectPairs(set, "aacehort");
425 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
426
427 // Try retaining an set of elements contained in another set (intersection)
428 UnicodeSet set3;
429 expectPattern(set3, "[a-c]", "ac");
430 doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
431 set3.remove(0x0062);
432 expectPairs(set3, "aacc");
433 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
434 set.retainAll(set3);
435 expectPairs(set, "aacc");
436 doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
437 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
438 set.clear();
439 doAssert(set.size() != set3.size(), "set.size() != set3.size()");
440
441 // Test commutativity
442 expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
443 expectPattern(set2, "[jackiemclean]", "aacceein");
444 set.addAll(set2);
445 expectPairs(set, "aacehort");
446 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
447
448
449
450
451 }
452
453 /**
454 * Make sure minimal representation is maintained.
455 */
456 void UnicodeSetTest::TestMinimalRep() {
457 UErrorCode status = U_ZERO_ERROR;
458 // This is pretty thoroughly tested by checkCanonicalRep()
459 // run against the exhaustive operation results. Use the code
460 // here for debugging specific spot problems.
461
462 // 1 overlap against 2
463 UnicodeSet set("[h-km-q]", status);
464 if (U_FAILURE(status)) { errln("FAIL"); return; }
465 UnicodeSet set2("[i-o]", status);
466 if (U_FAILURE(status)) { errln("FAIL"); return; }
467 set.addAll(set2);
468 expectPairs(set, "hq");
469 // right
470 set.applyPattern("[a-m]", status);
471 if (U_FAILURE(status)) { errln("FAIL"); return; }
472 set2.applyPattern("[e-o]", status);
473 if (U_FAILURE(status)) { errln("FAIL"); return; }
474 set.addAll(set2);
475 expectPairs(set, "ao");
476 // left
477 set.applyPattern("[e-o]", status);
478 if (U_FAILURE(status)) { errln("FAIL"); return; }
479 set2.applyPattern("[a-m]", status);
480 if (U_FAILURE(status)) { errln("FAIL"); return; }
481 set.addAll(set2);
482 expectPairs(set, "ao");
483 // 1 overlap against 3
484 set.applyPattern("[a-eg-mo-w]", status);
485 if (U_FAILURE(status)) { errln("FAIL"); return; }
486 set2.applyPattern("[d-q]", status);
487 if (U_FAILURE(status)) { errln("FAIL"); return; }
488 set.addAll(set2);
489 expectPairs(set, "aw");
490 }
491
492 void UnicodeSetTest::TestAPI() {
493 UErrorCode status = U_ZERO_ERROR;
494 // default ct
495 UnicodeSet set;
496 if (!set.isEmpty() || set.getRangeCount() != 0) {
497 errln((UnicodeString)"FAIL, set should be empty but isn't: " +
498 set);
499 }
500
501 // clear(), isEmpty()
502 set.add(0x0061);
503 if (set.isEmpty()) {
504 errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
505 set);
506 }
507 set.clear();
508 if (!set.isEmpty()) {
509 errln((UnicodeString)"FAIL, set should be empty but isn't: " +
510 set);
511 }
512
513 // size()
514 set.clear();
515 if (set.size() != 0) {
516 errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
517 ": " + set);
518 }
519 set.add(0x0061);
520 if (set.size() != 1) {
521 errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
522 ": " + set);
523 }
524 set.add(0x0031, 0x0039);
525 if (set.size() != 10) {
526 errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
527 ": " + set);
528 }
529
530 // contains(first, last)
531 set.clear();
532 set.applyPattern("[A-Y 1-8 b-d l-y]", status);
533 if (U_FAILURE(status)) { errln("FAIL"); return; }
534 for (int32_t i = 0; i<set.getRangeCount(); ++i) {
535 UChar32 a = set.getRangeStart(i);
536 UChar32 b = set.getRangeEnd(i);
537 if (!set.contains(a, b)) {
538 errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
539 " but doesn't: " + set);
540 }
541 if (set.contains((UChar32)(a-1), b)) {
542 errln((UnicodeString)"FAIL, shouldn't contain " +
543 (unsigned short)(a-1) + '-' + (unsigned short)b +
544 " but does: " + set);
545 }
546 if (set.contains(a, (UChar32)(b+1))) {
547 errln((UnicodeString)"FAIL, shouldn't contain " +
548 (unsigned short)a + '-' + (unsigned short)(b+1) +
549 " but does: " + set);
550 }
551 }
552
553 // Ported InversionList test.
554 UnicodeSet a((UChar32)3,(UChar32)10);
555 UnicodeSet b((UChar32)7,(UChar32)15);
556 UnicodeSet c;
557
558 logln((UnicodeString)"a [3-10]: " + a);
559 logln((UnicodeString)"b [7-15]: " + b);
560 c = a;
561 c.addAll(b);
562 UnicodeSet exp((UChar32)3,(UChar32)15);
563 if (c == exp) {
564 logln((UnicodeString)"c.set(a).add(b): " + c);
565 } else {
566 errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
567 }
568 c.complement();
569 exp.set((UChar32)0, (UChar32)2);
570 exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
571 if (c == exp) {
572 logln((UnicodeString)"c.complement(): " + c);
573 } else {
574 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
575 }
576 c.complement();
577 exp.set((UChar32)3, (UChar32)15);
578 if (c == exp) {
579 logln((UnicodeString)"c.complement(): " + c);
580 } else {
581 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
582 }
583 c = a;
584 c.complementAll(b);
585 exp.set((UChar32)3,(UChar32)6);
586 exp.add((UChar32)11,(UChar32) 15);
587 if (c == exp) {
588 logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
589 } else {
590 errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
591 }
592
593 exp = c;
594 bitsToSet(setToBits(c), c);
595 if (c == exp) {
596 logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
597 } else {
598 errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
599 }
600
601 // Additional tests for coverage JB#2118
602 //UnicodeSet::complement(class UnicodeString const &)
603 //UnicodeSet::complementAll(class UnicodeString const &)
604 //UnicodeSet::containsNone(class UnicodeSet const &)
605 //UnicodeSet::containsNone(long,long)
606 //UnicodeSet::containsSome(class UnicodeSet const &)
607 //UnicodeSet::containsSome(long,long)
608 //UnicodeSet::removeAll(class UnicodeString const &)
609 //UnicodeSet::retain(long)
610 //UnicodeSet::retainAll(class UnicodeString const &)
611 //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
612 //UnicodeSetIterator::getString(void)
613 set.clear();
614 set.complement("ab");
615 exp.applyPattern("[{ab}]", status);
616 if (U_FAILURE(status)) { errln("FAIL"); return; }
617 if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
618
619 UnicodeSetIterator iset(set);
620 if (!iset.next() || !iset.isString()) {
621 errln("FAIL: UnicodeSetIterator::next/isString");
622 } else if (iset.getString() != "ab") {
623 errln("FAIL: UnicodeSetIterator::getString");
624 }
625
626 set.add((UChar32)0x61, (UChar32)0x7A);
627 set.complementAll("alan");
628 exp.applyPattern("[{ab}b-kmo-z]", status);
629 if (U_FAILURE(status)) { errln("FAIL"); return; }
630 if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
631
632 exp.applyPattern("[a-z]", status);
633 if (U_FAILURE(status)) { errln("FAIL"); return; }
634 if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
635 if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
636 exp.applyPattern("[aln]", status);
637 if (U_FAILURE(status)) { errln("FAIL"); return; }
638 if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
639 if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
640
641 if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
642 errln("FAIL: containsNone(UChar32, UChar32)");
643 }
644 if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
645 errln("FAIL: containsSome(UChar32, UChar32)");
646 }
647 if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
648 errln("FAIL: containsNone(UChar32, UChar32)");
649 }
650 if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
651 errln("FAIL: containsSome(UChar32, UChar32)");
652 }
653
654 set.removeAll("liu");
655 exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
656 if (U_FAILURE(status)) { errln("FAIL"); return; }
657 if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
658
659 set.retainAll("star");
660 exp.applyPattern("[rst]", status);
661 if (U_FAILURE(status)) { errln("FAIL"); return; }
662 if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
663
664 set.retain((UChar32)0x73);
665 exp.applyPattern("[s]", status);
666 if (U_FAILURE(status)) { errln("FAIL"); return; }
667 if (set != exp) { errln("FAIL: retain('s')"); return; }
668
669 uint16_t buf[32];
670 int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status);
671 if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
672 if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
673 errln("FAIL: serialize");
674 return;
675 }
676 }
677
678 void UnicodeSetTest::TestIteration() {
679 UErrorCode ec = U_ZERO_ERROR;
680 int i = 0;
681 int outerLoop;
682
683 // 6 code points, 3 ranges, 2 strings, 8 total elements
684 // Iteration will access them in sorted order - a, b, c, y, z, U0001abcd, "str1", "str2"
685 UnicodeSet set("[zabyc\\U0001abcd{str1}{str2}]", ec);
686 TEST_ASSERT_SUCCESS(ec);
687 UnicodeSetIterator it(set);
688
689 for (outerLoop=0; outerLoop<3; outerLoop++) {
690 // Run the test multiple times, to check that iterator.reset() is working.
691 for (i=0; i<10; i++) {
692 UBool nextv = it.next();
693 UBool isString = it.isString();
694 int32_t codePoint = it.getCodepoint();
695 //int32_t codePointEnd = it.getCodepointEnd();
696 UnicodeString s = it.getString();
697 switch (i) {
698 case 0:
699 TEST_ASSERT(nextv == TRUE);
700 TEST_ASSERT(isString == FALSE);
701 TEST_ASSERT(codePoint==0x61);
702 TEST_ASSERT(s == "a");
703 break;
704 case 1:
705 TEST_ASSERT(nextv == TRUE);
706 TEST_ASSERT(isString == FALSE);
707 TEST_ASSERT(codePoint==0x62);
708 TEST_ASSERT(s == "b");
709 break;
710 case 2:
711 TEST_ASSERT(nextv == TRUE);
712 TEST_ASSERT(isString == FALSE);
713 TEST_ASSERT(codePoint==0x63);
714 TEST_ASSERT(s == "c");
715 break;
716 case 3:
717 TEST_ASSERT(nextv == TRUE);
718 TEST_ASSERT(isString == FALSE);
719 TEST_ASSERT(codePoint==0x79);
720 TEST_ASSERT(s == "y");
721 break;
722 case 4:
723 TEST_ASSERT(nextv == TRUE);
724 TEST_ASSERT(isString == FALSE);
725 TEST_ASSERT(codePoint==0x7a);
726 TEST_ASSERT(s == "z");
727 break;
728 case 5:
729 TEST_ASSERT(nextv == TRUE);
730 TEST_ASSERT(isString == FALSE);
731 TEST_ASSERT(codePoint==0x1abcd);
732 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
733 break;
734 case 6:
735 TEST_ASSERT(nextv == TRUE);
736 TEST_ASSERT(isString == TRUE);
737 TEST_ASSERT(s == "str1");
738 break;
739 case 7:
740 TEST_ASSERT(nextv == TRUE);
741 TEST_ASSERT(isString == TRUE);
742 TEST_ASSERT(s == "str2");
743 break;
744 case 8:
745 TEST_ASSERT(nextv == FALSE);
746 break;
747 case 9:
748 TEST_ASSERT(nextv == FALSE);
749 break;
750 }
751 }
752 it.reset(); // prepare to run the iteration again.
753 }
754 }
755
756
757
758
759 void UnicodeSetTest::TestStrings() {
760 UErrorCode ec = U_ZERO_ERROR;
761
762 UnicodeSet* testList[] = {
763 UnicodeSet::createFromAll("abc"),
764 new UnicodeSet("[a-c]", ec),
765
766 &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
767 new UnicodeSet("[{ll}{ch}a-z]", ec),
768
769 UnicodeSet::createFrom("ab}c"),
770 new UnicodeSet("[{ab\\}c}]", ec),
771
772 &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
773 new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
774
775 NULL
776 };
777
778 if (U_FAILURE(ec)) {
779 errln("FAIL: couldn't construct test sets");
780 }
781
782 for (int32_t i = 0; testList[i] != NULL; i+=2) {
783 if (U_SUCCESS(ec)) {
784 UnicodeString pat0, pat1;
785 testList[i]->toPattern(pat0, TRUE);
786 testList[i+1]->toPattern(pat1, TRUE);
787 if (*testList[i] == *testList[i+1]) {
788 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
789 } else {
790 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
791 }
792 }
793 delete testList[i];
794 delete testList[i+1];
795 }
796 }
797
798 /**
799 * Test the [:Latin:] syntax.
800 */
801 void UnicodeSetTest::TestScriptSet() {
802 expectContainment("[:Latin:]", "aA", CharsToUnicodeString("\\u0391\\u03B1"));
803
804 expectContainment("[:Greek:]", CharsToUnicodeString("\\u0391\\u03B1"), "aA");
805
806 /* Jitterbug 1423 */
807 expectContainment("[[:Common:][:Inherited:]]", CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
808
809 }
810
811 /**
812 * Test the [:Latin:] syntax.
813 */
814 void UnicodeSetTest::TestPropertySet() {
815 static const char* DATA[] = {
816 // Pattern, Chars IN, Chars NOT in
817
818 "[:Latin:]",
819 "aA",
820 "\\u0391\\u03B1",
821
822 "[\\p{Greek}]",
823 "\\u0391\\u03B1",
824 "aA",
825
826 "\\P{ GENERAL Category = upper case letter }",
827 "abc",
828 "ABC",
829
830 // Combining class: @since ICU 2.2
831 // Check both symbolic and numeric
832 "\\p{ccc=Nukta}",
833 "\\u0ABC",
834 "abc",
835
836 "\\p{Canonical Combining Class = 11}",
837 "\\u05B1",
838 "\\u05B2",
839
840 "[:c c c = iota subscript :]",
841 "\\u0345",
842 "xyz",
843
844 // Bidi class: @since ICU 2.2
845 "\\p{bidiclass=lefttoright}",
846 "abc",
847 "\\u0671\\u0672",
848
849 // Binary properties: @since ICU 2.2
850 "\\p{ideographic}",
851 "\\u4E0A",
852 "x",
853
854 "[:math=false:]",
855 "q)*(",
856 // weiv: )(and * were removed from math in Unicode 4.0.1
857 //"(*+)",
858 "+<>^",
859
860 // JB#1767 \N{}, \p{ASCII}
861 "[:Ascii:]",
862 "abc\\u0000\\u007F",
863 "\\u0080\\u4E00",
864
865 "[\\N{ latin small letter a }[:name= latin small letter z:]]",
866 "az",
867 "qrs",
868
869 // JB#2015
870 "[:any:]",
871 "a\\U0010FFFF",
872 "",
873
874 "[:nv=0.5:]",
875 "\\u00BD\\u0F2A",
876 "\\u00BC",
877
878 // JB#2653: Age
879 "[:Age=1.1:]",
880 "\\u03D6", // 1.1
881 "\\u03D8\\u03D9", // 3.2
882
883 "[:Age=3.1:]",
884 "\\u1800\\u3400\\U0002f800",
885 "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
886
887 // JB#2350: Case_Sensitive
888 "[:Case Sensitive:]",
889 "A\\u1FFC\\U00010410",
890 ";\\u00B4\\U00010500",
891
892 // JB#2832: C99-compatibility props
893 "[:blank:]",
894 " \\u0009",
895 "1-9A-Z",
896
897 "[:graph:]",
898 "19AZ",
899 " \\u0003\\u0007\\u0009\\u000A\\u000D",
900
901 "[:punct:]",
902 "!@#%&*()[]{}-_\\/;:,.?'\"",
903 "09azAZ",
904
905 "[:xdigit:]",
906 "09afAF",
907 "gG!",
908
909 // Regex compatibility test
910 "[-b]", // leading '-' is literal
911 "-b",
912 "ac",
913
914 "[^-b]", // leading '-' is literal
915 "ac",
916 "-b",
917
918 "[b-]", // trailing '-' is literal
919 "-b",
920 "ac",
921
922 "[^b-]", // trailing '-' is literal
923 "ac",
924 "-b",
925
926 "[a-b-]", // trailing '-' is literal
927 "ab-",
928 "c=",
929
930 "[[a-q]&[p-z]-]", // trailing '-' is literal
931 "pq-",
932 "or=",
933
934 "[\\s|\\)|:|$|\\>]", // from regex tests
935 "s|):$>",
936 "abc",
937
938 "[\\uDC00cd]", // JB#2906: isolated trail at start
939 "cd\\uDC00",
940 "ab\\uD800\\U00010000",
941
942 "[ab\\uD800]", // JB#2906: isolated trail at start
943 "ab\\uD800",
944 "cd\\uDC00\\U00010000",
945
946 "[ab\\uD800cd]", // JB#2906: isolated lead in middle
947 "abcd\\uD800",
948 "ef\\uDC00\\U00010000",
949
950 "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
951 "abcd\\uDC00",
952 "ef\\uD800\\U00010000",
953
954 "[:^lccc=0:]", // Lead canonical class
955 "\\u0300\\u0301",
956 "abcd\\u00c0\\u00c5",
957
958 "[:^tccc=0:]", // Trail canonical class
959 "\\u0300\\u0301\\u00c0\\u00c5",
960 "abcd",
961
962 "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
963 "\\u0300\\u0301\\u00c0\\u00c5",
964 "abcd",
965
966 "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
967 "",
968 "abcd\\u0300\\u0301\\u00c0\\u00c5",
969
970 "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
971 "\\u0F73\\u0F75\\u0F81",
972 "abcd\\u0300\\u0301\\u00c0\\u00c5",
973
974 "[:Assigned:]",
975 "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
976 "\\u0888\\uFDD3\\uFFFE\\U00050005"
977 };
978
979 static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);
980
981 for (int32_t i=0; i<DATA_LEN; i+=3) {
982 expectContainment(DATA[i], CharsToUnicodeString(DATA[i+1]),
983 CharsToUnicodeString(DATA[i+2]));
984 }
985 }
986
987 /**
988 * Test that Posix style character classes [:digit:], etc.
989 * have the Unicode definitions from TR 18.
990 */
991 void UnicodeSetTest::TestPosixClasses() {
992 {
993 UErrorCode status = U_ZERO_ERROR;
994 UnicodeSet s1("[:alpha:]", status);
995 UnicodeSet s2("\\p{Alphabetic}", status);
996 TEST_ASSERT_SUCCESS(status);
997 TEST_ASSERT(s1==s2);
998 }
999 {
1000 UErrorCode status = U_ZERO_ERROR;
1001 UnicodeSet s1("[:lower:]", status);
1002 UnicodeSet s2("\\p{lowercase}", status);
1003 TEST_ASSERT_SUCCESS(status);
1004 TEST_ASSERT(s1==s2);
1005 }
1006 {
1007 UErrorCode status = U_ZERO_ERROR;
1008 UnicodeSet s1("[:upper:]", status);
1009 UnicodeSet s2("\\p{Uppercase}", status);
1010 TEST_ASSERT_SUCCESS(status);
1011 TEST_ASSERT(s1==s2);
1012 }
1013 {
1014 UErrorCode status = U_ZERO_ERROR;
1015 UnicodeSet s1("[:punct:]", status);
1016 UnicodeSet s2("\\p{gc=Punctuation}", status);
1017 TEST_ASSERT_SUCCESS(status);
1018 TEST_ASSERT(s1==s2);
1019 }
1020 {
1021 UErrorCode status = U_ZERO_ERROR;
1022 UnicodeSet s1("[:digit:]", status);
1023 UnicodeSet s2("\\p{gc=DecimalNumber}", status);
1024 TEST_ASSERT_SUCCESS(status);
1025 TEST_ASSERT(s1==s2);
1026 }
1027 {
1028 UErrorCode status = U_ZERO_ERROR;
1029 UnicodeSet s1("[:xdigit:]", status);
1030 UnicodeSet s2("[\\p{DecimalNumber}\\p{HexDigit}]", status);
1031 TEST_ASSERT_SUCCESS(status);
1032 TEST_ASSERT(s1==s2);
1033 }
1034 {
1035 UErrorCode status = U_ZERO_ERROR;
1036 UnicodeSet s1("[:alnum:]", status);
1037 UnicodeSet s2("[\\p{Alphabetic}\\p{DecimalNumber}]", status);
1038 TEST_ASSERT_SUCCESS(status);
1039 TEST_ASSERT(s1==s2);
1040 }
1041 {
1042 UErrorCode status = U_ZERO_ERROR;
1043 UnicodeSet s1("[:space:]", status);
1044 UnicodeSet s2("\\p{Whitespace}", status);
1045 TEST_ASSERT_SUCCESS(status);
1046 TEST_ASSERT(s1==s2);
1047 }
1048 {
1049 UErrorCode status = U_ZERO_ERROR;
1050 UnicodeSet s1("[:blank:]", status);
1051 TEST_ASSERT_SUCCESS(status);
1052 UnicodeSet s2("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]",
1053 status);
1054 TEST_ASSERT_SUCCESS(status);
1055 TEST_ASSERT(s1==s2);
1056 }
1057 {
1058 UErrorCode status = U_ZERO_ERROR;
1059 UnicodeSet s1("[:cntrl:]", status);
1060 TEST_ASSERT_SUCCESS(status);
1061 UnicodeSet s2("\\p{Control}", status);
1062 TEST_ASSERT_SUCCESS(status);
1063 TEST_ASSERT(s1==s2);
1064 }
1065 {
1066 UErrorCode status = U_ZERO_ERROR;
1067 UnicodeSet s1("[:graph:]", status);
1068 TEST_ASSERT_SUCCESS(status);
1069 UnicodeSet s2("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]", status);
1070 TEST_ASSERT_SUCCESS(status);
1071 TEST_ASSERT(s1==s2);
1072 }
1073 {
1074 UErrorCode status = U_ZERO_ERROR;
1075 UnicodeSet s1("[:print:]", status);
1076 TEST_ASSERT_SUCCESS(status);
1077 UnicodeSet s2("[[:graph:][:blank:]-[\\p{Control}]]" ,status);
1078 TEST_ASSERT_SUCCESS(status);
1079 TEST_ASSERT(s1==s2);
1080 }
1081 }
1082 /**
1083 * Test cloning of UnicodeSet. For C++, we test the copy constructor.
1084 */
1085 void UnicodeSetTest::TestClone() {
1086 UErrorCode ec = U_ZERO_ERROR;
1087 UnicodeSet s("[abcxyz]", ec);
1088 UnicodeSet t(s);
1089 expectContainment(t, "abc", "def");
1090 }
1091
1092 /**
1093 * Test the indexOf() and charAt() methods.
1094 */
1095 void UnicodeSetTest::TestIndexOf() {
1096 UErrorCode ec = U_ZERO_ERROR;
1097 UnicodeSet set("[a-cx-y3578]", ec);
1098 if (U_FAILURE(ec)) {
1099 errln("FAIL: UnicodeSet constructor");
1100 return;
1101 }
1102 for (int32_t i=0; i<set.size(); ++i) {
1103 UChar32 c = set.charAt(i);
1104 if (set.indexOf(c) != i) {
1105 errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1106 i, c, set.indexOf(c));
1107 }
1108 }
1109 UChar32 c = set.charAt(set.size());
1110 if (c != -1) {
1111 errln("FAIL: charAt(<out of range>) = %X", c);
1112 }
1113 int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
1114 if (j != -1) {
1115 errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1116 }
1117 }
1118
1119 /**
1120 * Test closure API.
1121 */
1122 void UnicodeSetTest::TestCloseOver() {
1123 UErrorCode ec = U_ZERO_ERROR;
1124
1125 char CASE[] = {(char)USET_CASE_INSENSITIVE};
1126 char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1127 const char* DATA[] = {
1128 // selector, input, output
1129 CASE,
1130 "[aq\\u00DF{Bc}{bC}{Fi}]",
1131 "[aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]",
1132
1133 CASE,
1134 "[\\u01F1]", // 'DZ'
1135 "[\\u01F1\\u01F2\\u01F3]",
1136
1137 CASE,
1138 "[\\u1FB4]",
1139 "[\\u1FB4{\\u03AC\\u03B9}]",
1140
1141 CASE,
1142 "[{F\\uFB01}]",
1143 "[\\uFB03{ffi}]",
1144
1145 CASE, // make sure binary search finds limits
1146 "[a\\uFF3A]",
1147 "[aA\\uFF3A\\uFF5A]",
1148
1149 CASE,
1150 "[a-z]","[A-Za-z\\u017F\\u212A]",
1151 CASE,
1152 "[abc]","[A-Ca-c]",
1153 CASE,
1154 "[ABC]","[A-Ca-c]",
1155
1156 CASE, "[i]", "[iI]",
1157
1158 CASE, "[\\u0130]", "[\\u0130{i\\u0307}]", // dotted I
1159 CASE, "[{i\\u0307}]", "[\\u0130{i\\u0307}]", // i with dot
1160
1161 CASE, "[\\u0131]", "[\\u0131]", // dotless i
1162
1163 CASE, "[\\u0390]", "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1164
1165 CASE, "[\\u03c2]", "[\\u03a3\\u03c2\\u03c3]", // sigmas
1166
1167 CASE, "[\\u03f2]", "[\\u03f2\\u03f9]", // lunate sigmas
1168
1169 CASE, "[\\u03f7]", "[\\u03f7\\u03f8]",
1170
1171 CASE, "[\\u1fe3]", "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1172
1173 CASE, "[\\ufb05]", "[\\ufb05\\ufb06{st}]",
1174 CASE, "[{st}]", "[\\ufb05\\ufb06{st}]",
1175
1176 CASE, "[\\U0001044F]", "[\\U00010427\\U0001044F]",
1177
1178 CASE, "[{a\\u02BE}]", "[\\u1E9A{a\\u02BE}]", // first in sorted table
1179
1180 CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1181
1182 CASE_MAPPINGS,
1183 "[aq\\u00DF{Bc}{bC}{Fi}]",
1184 "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1185
1186 CASE_MAPPINGS,
1187 "[\\u01F1]", // 'DZ'
1188 "[\\u01F1\\u01F2\\u01F3]",
1189
1190 CASE_MAPPINGS,
1191 "[a-z]",
1192 "[A-Za-z]",
1193
1194 NULL
1195 };
1196
1197 UnicodeSet s;
1198 UnicodeSet t;
1199 UnicodeString buf;
1200 for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1201 int32_t selector = DATA[i][0];
1202 UnicodeString pat(DATA[i+1]);
1203 UnicodeString exp(DATA[i+2]);
1204 s.applyPattern(pat, ec);
1205 s.closeOver(selector);
1206 t.applyPattern(exp, ec);
1207 if (U_FAILURE(ec)) {
1208 errln("FAIL: applyPattern failed");
1209 continue;
1210 }
1211 if (s == t) {
1212 logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1213 } else {
1214 errln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1215 s.toPattern(buf, TRUE) + ", expected " + exp);
1216 }
1217 }
1218
1219 #if 0
1220 /*
1221 * Unused test code.
1222 * This was used to compare the old implementation (using USET_CASE)
1223 * with the new one (using 0x100 temporarily)
1224 * while transitioning from hardcoded case closure tables in uniset.cpp
1225 * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1226 * and using ucase.c functions for closure.
1227 * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1228 *
1229 * Note: The old and new implementation never fully matched because
1230 * the old implementation turned out to not map U+0130 and U+0131 correctly
1231 * (dotted I and dotless i) and because the old implementation's data tables
1232 * were outdated compared to Unicode 4.0.1 at the time of the change to the
1233 * new implementation. (So sigmas and some other characters were not handled
1234 * according to the newer Unicode version.)
1235 */
1236 UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1237 UnicodeSetIterator si(sens);
1238 UnicodeString str, buf2;
1239 const UnicodeString *pStr;
1240 UChar32 c;
1241 while(si.next()) {
1242 if(!si.isString()) {
1243 c=si.getCodepoint();
1244 s.clear();
1245 s.add(c);
1246
1247 str.setTo(c);
1248 str.foldCase();
1249 sens2.add(str);
1250
1251 t=s;
1252 s.closeOver(USET_CASE);
1253 t.closeOver(0x100);
1254 if(s!=t) {
1255 errln("FAIL: closeOver(U+%04x) differs: ", c);
1256 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1257 }
1258 }
1259 }
1260 // remove all code points
1261 // should contain all full case folding mapping strings
1262 sens2.remove(0, 0x10ffff);
1263 si.reset(sens2);
1264 while(si.next()) {
1265 if(si.isString()) {
1266 pStr=&si.getString();
1267 s.clear();
1268 s.add(*pStr);
1269 t=s2=s;
1270 s.closeOver(USET_CASE);
1271 t.closeOver(0x100);
1272 if(s!=t) {
1273 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1274 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1275 }
1276 }
1277 }
1278 #endif
1279
1280 // Test the pattern API
1281 s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1282 if (U_FAILURE(ec)) {
1283 errln("FAIL: applyPattern failed");
1284 } else {
1285 expectContainment(s, "abcABC", "defDEF");
1286 }
1287 UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1288 if (U_FAILURE(ec)) {
1289 errln("FAIL: constructor failed");
1290 } else {
1291 expectContainment(v, "defDEF", "abcABC");
1292 }
1293 UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1294 if (U_FAILURE(ec)) {
1295 errln("FAIL: construct w/case mappings failed");
1296 } else {
1297 expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1298 }
1299 }
1300
1301 void UnicodeSetTest::TestEscapePattern() {
1302 const char pattern[] =
1303 "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1304 const char exp[] =
1305 "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1306 // We test this with two passes; in the second pass we
1307 // pre-unescape the pattern. Since U+200E is rule whitespace,
1308 // this fails -- which is what we expect.
1309 for (int32_t pass=1; pass<=2; ++pass) {
1310 UErrorCode ec = U_ZERO_ERROR;
1311 UnicodeString pat(pattern);
1312 if (pass==2) {
1313 pat = pat.unescape();
1314 }
1315 // Pattern is only good for pass 1
1316 UBool isPatternValid = (pass==1);
1317
1318 UnicodeSet set(pat, ec);
1319 if (U_SUCCESS(ec) != isPatternValid){
1320 errln((UnicodeString)"FAIL: applyPattern(" +
1321 escape(pat) + ") => " +
1322 u_errorName(ec));
1323 continue;
1324 }
1325 if (U_FAILURE(ec)) {
1326 continue;
1327 }
1328 if (set.contains((UChar)0x0644)){
1329 errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1330 }
1331
1332 UnicodeString newpat;
1333 set.toPattern(newpat, TRUE);
1334 if (newpat == exp) {
1335 logln(escape(pat) + " => " + newpat);
1336 } else {
1337 errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1338 }
1339
1340 for (int32_t i=0; i<set.getRangeCount(); ++i) {
1341 UnicodeString str("Range ");
1342 str.append((UChar)(0x30 + i))
1343 .append(": ")
1344 .append((UChar32)set.getRangeStart(i))
1345 .append(" - ")
1346 .append((UChar32)set.getRangeEnd(i));
1347 str = str + " (" + set.getRangeStart(i) + " - " +
1348 set.getRangeEnd(i) + ")";
1349 if (set.getRangeStart(i) < 0) {
1350 errln((UnicodeString)"FAIL: " + escape(str));
1351 } else {
1352 logln(escape(str));
1353 }
1354 }
1355 }
1356 }
1357
1358 void UnicodeSetTest::expectRange(const UnicodeString& label,
1359 const UnicodeSet& set,
1360 UChar32 start, UChar32 end) {
1361 UnicodeSet exp(start, end);
1362 UnicodeString pat;
1363 if (set == exp) {
1364 logln(label + " => " + set.toPattern(pat, TRUE));
1365 } else {
1366 UnicodeString xpat;
1367 errln((UnicodeString)"FAIL: " + label + " => " +
1368 set.toPattern(pat, TRUE) +
1369 ", expected " + exp.toPattern(xpat, TRUE));
1370 }
1371 }
1372
1373 void UnicodeSetTest::TestInvalidCodePoint() {
1374
1375 const UChar32 DATA[] = {
1376 // Test range Expected range
1377 0, 0x10FFFF, 0, 0x10FFFF,
1378 (UChar32)-1, 8, 0, 8,
1379 8, 0x110000, 8, 0x10FFFF
1380 };
1381 const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]);
1382
1383 UnicodeString pat;
1384 int32_t i;
1385
1386 for (i=0; i<DATA_LENGTH; i+=4) {
1387 UChar32 start = DATA[i];
1388 UChar32 end = DATA[i+1];
1389 UChar32 xstart = DATA[i+2];
1390 UChar32 xend = DATA[i+3];
1391
1392 // Try various API using the test code points
1393
1394 UnicodeSet set(start, end);
1395 expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1396 set, xstart, xend);
1397
1398 set.clear();
1399 set.set(start, end);
1400 expectRange((UnicodeString)"set(" + start + "," + end + ")",
1401 set, xstart, xend);
1402
1403 UBool b = set.contains(start);
1404 b = set.contains(start, end);
1405 b = set.containsNone(start, end);
1406 b = set.containsSome(start, end);
1407
1408 /*int32_t index = set.indexOf(start);*/
1409
1410 set.clear();
1411 set.add(start);
1412 set.add(start, end);
1413 expectRange((UnicodeString)"add(" + start + "," + end + ")",
1414 set, xstart, xend);
1415
1416 set.set(0, 0x10FFFF);
1417 set.retain(start, end);
1418 expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1419 set, xstart, xend);
1420 set.retain(start);
1421
1422 set.set(0, 0x10FFFF);
1423 set.remove(start);
1424 set.remove(start, end);
1425 set.complement();
1426 expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1427 set, xstart, xend);
1428
1429 set.set(0, 0x10FFFF);
1430 set.complement(start, end);
1431 set.complement();
1432 expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1433 set, xstart, xend);
1434 set.complement(start);
1435 }
1436
1437 const UChar32 DATA2[] = {
1438 0,
1439 0x10FFFF,
1440 (UChar32)-1,
1441 0x110000
1442 };
1443 const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]);
1444
1445 for (i=0; i<DATA2_LENGTH; ++i) {
1446 UChar32 c = DATA2[i], end = 0x10FFFF;
1447 UBool valid = (c >= 0 && c <= 0x10FFFF);
1448
1449 UnicodeSet set(0, 0x10FFFF);
1450
1451 // For single-codepoint contains, invalid codepoints are NOT contained
1452 UBool b = set.contains(c);
1453 if (b == valid) {
1454 logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1455 ") = " + b);
1456 } else {
1457 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1458 ") = " + b);
1459 }
1460
1461 // For codepoint range contains, containsNone, and containsSome,
1462 // invalid or empty (start > end) ranges have UNDEFINED behavior.
1463 b = set.contains(c, end);
1464 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1465 "," + end + ") = " + b);
1466
1467 b = set.containsNone(c, end);
1468 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1469 "," + end + ") = " + b);
1470
1471 b = set.containsSome(c, end);
1472 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1473 "," + end + ") = " + b);
1474
1475 int32_t index = set.indexOf(c);
1476 if ((index >= 0) == valid) {
1477 logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1478 ") = " + index);
1479 } else {
1480 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1481 ") = " + index);
1482 }
1483 }
1484 }
1485
1486 // Used by TestSymbolTable
1487 class TokenSymbolTable : public SymbolTable {
1488 public:
1489 Hashtable contents;
1490
1491 TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1492 contents.setValueDeleter(uhash_deleteUnicodeString);
1493 }
1494
1495 ~TokenSymbolTable() {}
1496
1497 /**
1498 * (Non-SymbolTable API) Add the given variable and value to
1499 * the table. Variable should NOT contain leading '$'.
1500 */
1501 void add(const UnicodeString& var, const UnicodeString& value,
1502 UErrorCode& ec) {
1503 if (U_SUCCESS(ec)) {
1504 contents.put(var, new UnicodeString(value), ec);
1505 }
1506 }
1507
1508 /**
1509 * SymbolTable API
1510 */
1511 virtual const UnicodeString* lookup(const UnicodeString& s) const {
1512 return (const UnicodeString*) contents.get(s);
1513 }
1514
1515 /**
1516 * SymbolTable API
1517 */
1518 virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1519 return NULL;
1520 }
1521
1522 /**
1523 * SymbolTable API
1524 */
1525 virtual UnicodeString parseReference(const UnicodeString& text,
1526 ParsePosition& pos, int32_t limit) const {
1527 int32_t start = pos.getIndex();
1528 int32_t i = start;
1529 UnicodeString result;
1530 while (i < limit) {
1531 UChar c = text.charAt(i);
1532 if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1533 break;
1534 }
1535 ++i;
1536 }
1537 if (i == start) { // No valid name chars
1538 return result; // Indicate failure with empty string
1539 }
1540 pos.setIndex(i);
1541 text.extractBetween(start, i, result);
1542 return result;
1543 }
1544 };
1545
1546 void UnicodeSetTest::TestSymbolTable() {
1547 // Multiple test cases can be set up here. Each test case
1548 // is terminated by null:
1549 // var, value, var, value,..., input pat., exp. output pat., null
1550 const char* DATA[] = {
1551 "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1552 "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1553 "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1554 NULL
1555 };
1556
1557 for (int32_t i=0; DATA[i]!=NULL; ++i) {
1558 UErrorCode ec = U_ZERO_ERROR;
1559 TokenSymbolTable sym(ec);
1560 if (U_FAILURE(ec)) {
1561 errln("FAIL: couldn't construct TokenSymbolTable");
1562 continue;
1563 }
1564
1565 // Set up variables
1566 while (DATA[i+2] != NULL) {
1567 sym.add(DATA[i], DATA[i+1], ec);
1568 if (U_FAILURE(ec)) {
1569 errln("FAIL: couldn't add to TokenSymbolTable");
1570 continue;
1571 }
1572 i += 2;
1573 }
1574
1575 // Input pattern and expected output pattern
1576 UnicodeString inpat = DATA[i], exppat = DATA[i+1];
1577 i += 2;
1578
1579 ParsePosition pos(0);
1580 UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1581 if (U_FAILURE(ec)) {
1582 errln("FAIL: couldn't construct UnicodeSet");
1583 continue;
1584 }
1585
1586 // results
1587 if (pos.getIndex() != inpat.length()) {
1588 errln((UnicodeString)"Failed to read to end of string \""
1589 + inpat + "\": read to "
1590 + pos.getIndex() + ", length is "
1591 + inpat.length());
1592 }
1593
1594 UnicodeSet us2(exppat, ec);
1595 if (U_FAILURE(ec)) {
1596 errln("FAIL: couldn't construct expected UnicodeSet");
1597 continue;
1598 }
1599
1600 UnicodeString a, b;
1601 if (us != us2) {
1602 errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1603 ", expected " + us2.toPattern(b, TRUE));
1604 } else {
1605 logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1606 }
1607 }
1608 }
1609
1610 void UnicodeSetTest::TestSurrogate() {
1611 const char* DATA[] = {
1612 // These should all behave identically
1613 "[abc\\uD800\\uDC00]",
1614 // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1615 "[abc\\U00010000]",
1616 0
1617 };
1618 for (int i=0; DATA[i] != 0; ++i) {
1619 UErrorCode ec = U_ZERO_ERROR;
1620 logln((UnicodeString)"Test pattern " + i + " :" + DATA[i]);
1621 UnicodeSet set(DATA[i], ec);
1622 if (U_FAILURE(ec)) {
1623 errln("FAIL: UnicodeSet constructor");
1624 continue;
1625 }
1626 expectContainment(set,
1627 CharsToUnicodeString("abc\\U00010000"),
1628 CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1629 if (set.size() != 4) {
1630 errln((UnicodeString)"FAIL: " + DATA[i] + ".size() == " +
1631 set.size() + ", expected 4");
1632 }
1633 }
1634 }
1635
1636 void UnicodeSetTest::TestExhaustive() {
1637 // exhaustive tests. Simulate UnicodeSets with integers.
1638 // That gives us very solid tests (except for large memory tests).
1639
1640 int32_t limit = 128;
1641
1642 UnicodeSet x, y, z, aa;
1643
1644 for (int32_t i = 0; i < limit; ++i) {
1645 bitsToSet(i, x);
1646 logln((UnicodeString)"Testing " + i + ", " + x);
1647 _testComplement(i, x, y);
1648
1649 // AS LONG AS WE ARE HERE, check roundtrip
1650 checkRoundTrip(bitsToSet(i, aa));
1651
1652 for (int32_t j = 0; j < limit; ++j) {
1653 _testAdd(i,j, x,y,z);
1654 _testXor(i,j, x,y,z);
1655 _testRetain(i,j, x,y,z);
1656 _testRemove(i,j, x,y,z);
1657 }
1658 }
1659 }
1660
1661 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1662 bitsToSet(a, x);
1663 z = x;
1664 z.complement();
1665 int32_t c = setToBits(z);
1666 if (c != (~a)) {
1667 errln((UnicodeString)"FAILED: add: ~" + x + " != " + z);
1668 errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1669 }
1670 checkCanonicalRep(z, (UnicodeString)"complement " + a);
1671 }
1672
1673 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1674 bitsToSet(a, x);
1675 bitsToSet(b, y);
1676 z = x;
1677 z.addAll(y);
1678 int32_t c = setToBits(z);
1679 if (c != (a | b)) {
1680 errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1681 errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1682 }
1683 checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1684 }
1685
1686 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1687 bitsToSet(a, x);
1688 bitsToSet(b, y);
1689 z = x;
1690 z.retainAll(y);
1691 int32_t c = setToBits(z);
1692 if (c != (a & b)) {
1693 errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1694 errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1695 }
1696 checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1697 }
1698
1699 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1700 bitsToSet(a, x);
1701 bitsToSet(b, y);
1702 z = x;
1703 z.removeAll(y);
1704 int32_t c = setToBits(z);
1705 if (c != (a &~ b)) {
1706 errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1707 errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1708 }
1709 checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1710 }
1711
1712 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1713 bitsToSet(a, x);
1714 bitsToSet(b, y);
1715 z = x;
1716 z.complementAll(y);
1717 int32_t c = setToBits(z);
1718 if (c != (a ^ b)) {
1719 errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1720 errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1721 }
1722 checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1723 }
1724
1725 /**
1726 * Check that ranges are monotonically increasing and non-
1727 * overlapping.
1728 */
1729 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1730 int32_t n = set.getRangeCount();
1731 if (n < 0) {
1732 errln((UnicodeString)"FAIL result of " + msg +
1733 ": range count should be >= 0 but is " +
1734 n /*+ " for " + set.toPattern())*/);
1735 return;
1736 }
1737 UChar32 last = 0;
1738 for (int32_t i=0; i<n; ++i) {
1739 UChar32 start = set.getRangeStart(i);
1740 UChar32 end = set.getRangeEnd(i);
1741 if (start > end) {
1742 errln((UnicodeString)"FAIL result of " + msg +
1743 ": range " + (i+1) +
1744 " start > end: " + (int)start + ", " + (int)end +
1745 " for " + set);
1746 }
1747 if (i > 0 && start <= last) {
1748 errln((UnicodeString)"FAIL result of " + msg +
1749 ": range " + (i+1) +
1750 " overlaps previous range: " + (int)start + ", " + (int)end +
1751 " for " + set);
1752 }
1753 last = end;
1754 }
1755 }
1756
1757 /**
1758 * Convert a bitmask to a UnicodeSet.
1759 */
1760 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1761 result.clear();
1762 for (UChar32 i = 0; i < 32; ++i) {
1763 if ((a & (1<<i)) != 0) {
1764 result.add(i);
1765 }
1766 }
1767 return result;
1768 }
1769
1770 /**
1771 * Convert a UnicodeSet to a bitmask. Only the characters
1772 * U+0000 to U+0020 are represented in the bitmask.
1773 */
1774 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1775 int32_t result = 0;
1776 for (int32_t i = 0; i < 32; ++i) {
1777 if (x.contains((UChar32)i)) {
1778 result |= (1<<i);
1779 }
1780 }
1781 return result;
1782 }
1783
1784 /**
1785 * Return the representation of an inversion list based UnicodeSet
1786 * as a pairs list. Ranges are listed in ascending Unicode order.
1787 * For example, the set [a-zA-M3] is represented as "33AMaz".
1788 */
1789 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1790 UnicodeString pairs;
1791 for (int32_t i=0; i<set.getRangeCount(); ++i) {
1792 UChar32 start = set.getRangeStart(i);
1793 UChar32 end = set.getRangeEnd(i);
1794 if (end > 0xFFFF) {
1795 end = 0xFFFF;
1796 i = set.getRangeCount(); // Should be unnecessary
1797 }
1798 pairs.append((UChar)start).append((UChar)end);
1799 }
1800 return pairs;
1801 }
1802
1803 /**
1804 * Basic consistency check for a few items.
1805 * That the iterator works, and that we can create a pattern and
1806 * get the same thing back
1807 */
1808 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1809 UErrorCode ec = U_ZERO_ERROR;
1810
1811 UnicodeSet t(s);
1812 checkEqual(s, t, "copy ct");
1813
1814 t = s;
1815 checkEqual(s, t, "operator=");
1816
1817 copyWithIterator(t, s, FALSE);
1818 checkEqual(s, t, "iterator roundtrip");
1819
1820 copyWithIterator(t, s, TRUE); // try range
1821 checkEqual(s, t, "iterator roundtrip");
1822
1823 UnicodeString pat; s.toPattern(pat, FALSE);
1824 t.applyPattern(pat, ec);
1825 if (U_FAILURE(ec)) {
1826 errln("FAIL: applyPattern");
1827 return;
1828 } else {
1829 checkEqual(s, t, "toPattern(false)");
1830 }
1831
1832 s.toPattern(pat, TRUE);
1833 t.applyPattern(pat, ec);
1834 if (U_FAILURE(ec)) {
1835 errln("FAIL: applyPattern");
1836 return;
1837 } else {
1838 checkEqual(s, t, "toPattern(true)");
1839 }
1840 }
1841
1842 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
1843 t.clear();
1844 UnicodeSetIterator it(s);
1845 if (withRange) {
1846 while (it.nextRange()) {
1847 if (it.isString()) {
1848 t.add(it.getString());
1849 } else {
1850 t.add(it.getCodepoint(), it.getCodepointEnd());
1851 }
1852 }
1853 } else {
1854 while (it.next()) {
1855 if (it.isString()) {
1856 t.add(it.getString());
1857 } else {
1858 t.add(it.getCodepoint());
1859 }
1860 }
1861 }
1862 }
1863
1864 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
1865 UnicodeString source; s.toPattern(source, TRUE);
1866 UnicodeString result; t.toPattern(result, TRUE);
1867 if (s != t) {
1868 errln((UnicodeString)"FAIL: " + message
1869 + "; source = " + source
1870 + "; result = " + result
1871 );
1872 return FALSE;
1873 } else {
1874 logln((UnicodeString)"Ok: " + message
1875 + "; source = " + source
1876 + "; result = " + result
1877 );
1878 }
1879 return TRUE;
1880 }
1881
1882 void
1883 UnicodeSetTest::expectContainment(const UnicodeString& pat,
1884 const UnicodeString& charsIn,
1885 const UnicodeString& charsOut) {
1886 UErrorCode ec = U_ZERO_ERROR;
1887 UnicodeSet set(pat, ec);
1888 if (U_FAILURE(ec)) {
1889 errln((UnicodeString)"FAIL: pattern \"" +
1890 pat + "\" => " + u_errorName(ec));
1891 return;
1892 }
1893 expectContainment(set, pat, charsIn, charsOut);
1894 }
1895
1896 void
1897 UnicodeSetTest::expectContainment(const UnicodeSet& set,
1898 const UnicodeString& charsIn,
1899 const UnicodeString& charsOut) {
1900 UnicodeString pat;
1901 set.toPattern(pat);
1902 expectContainment(set, pat, charsIn, charsOut);
1903 }
1904
1905 void
1906 UnicodeSetTest::expectContainment(const UnicodeSet& set,
1907 const UnicodeString& setName,
1908 const UnicodeString& charsIn,
1909 const UnicodeString& charsOut) {
1910 UnicodeString bad;
1911 UChar32 c;
1912 int32_t i;
1913
1914 for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
1915 c = charsIn.char32At(i);
1916 if (!set.contains(c)) {
1917 bad.append(c);
1918 }
1919 }
1920 if (bad.length() > 0) {
1921 errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
1922 ", expected containment of " + prettify(charsIn));
1923 } else {
1924 logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
1925 }
1926
1927 bad.truncate(0);
1928 for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
1929 c = charsOut.char32At(i);
1930 if (set.contains(c)) {
1931 bad.append(c);
1932 }
1933 }
1934 if (bad.length() > 0) {
1935 errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
1936 ", expected non-containment of " + prettify(charsOut));
1937 } else {
1938 logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
1939 }
1940 }
1941
1942 void
1943 UnicodeSetTest::expectPattern(UnicodeSet& set,
1944 const UnicodeString& pattern,
1945 const UnicodeString& expectedPairs){
1946 UErrorCode status = U_ZERO_ERROR;
1947 set.applyPattern(pattern, status);
1948 if (U_FAILURE(status)) {
1949 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
1950 "\") failed");
1951 return;
1952 } else {
1953 if (getPairs(set) != expectedPairs ) {
1954 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
1955 "\") => pairs \"" +
1956 escape(getPairs(set)) + "\", expected \"" +
1957 escape(expectedPairs) + "\"");
1958 } else {
1959 logln(UnicodeString("Ok: applyPattern(\"") + pattern +
1960 "\") => pairs \"" +
1961 escape(getPairs(set)) + "\"");
1962 }
1963 }
1964 // the result of calling set.toPattern(), which is the string representation of
1965 // this set(set), is passed to a UnicodeSet constructor, and tested that it
1966 // will produce another set that is equal to this one.
1967 UnicodeString temppattern;
1968 set.toPattern(temppattern);
1969 UnicodeSet *tempset=new UnicodeSet(temppattern, status);
1970 if (U_FAILURE(status)) {
1971 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
1972 return;
1973 }
1974 if(*tempset != set || getPairs(*tempset) != getPairs(set)){
1975 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
1976 escape(getPairs(set)) + "\""));
1977 } else{
1978 logln(UnicodeString("Ok: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
1979 }
1980
1981 delete tempset;
1982
1983 }
1984
1985 void
1986 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
1987 if (getPairs(set) != expectedPairs) {
1988 errln(UnicodeString("FAIL: Expected pair list \"") +
1989 escape(expectedPairs) + "\", got \"" +
1990 escape(getPairs(set)) + "\"");
1991 }
1992 }
1993
1994 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
1995 const UnicodeString& expPat,
1996 const char** expStrings) {
1997 UnicodeString pat;
1998 set.toPattern(pat, TRUE);
1999 if (pat == expPat) {
2000 logln((UnicodeString)"Ok: toPattern() => \"" + pat + "\"");
2001 } else {
2002 errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2003 return;
2004 }
2005 if (expStrings == NULL) {
2006 return;
2007 }
2008 UBool in = TRUE;
2009 for (int32_t i=0; expStrings[i] != NULL; ++i) {
2010 if (expStrings[i] == NOT) { // sic; pointer comparison
2011 in = FALSE;
2012 continue;
2013 }
2014 UnicodeString s = CharsToUnicodeString(expStrings[i]);
2015 UBool contained = set.contains(s);
2016 if (contained == in) {
2017 logln((UnicodeString)"Ok: " + expPat +
2018 (contained ? " contains {" : " does not contain {") +
2019 escape(expStrings[i]) + "}");
2020 } else {
2021 errln((UnicodeString)"FAIL: " + expPat +
2022 (contained ? " contains {" : " does not contain {") +
2023 escape(expStrings[i]) + "}");
2024 }
2025 }
2026 }
2027
2028 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
2029
2030 void
2031 UnicodeSetTest::doAssert(UBool condition, const char *message)
2032 {
2033 if (!condition) {
2034 errln(UnicodeString("ERROR : ") + message);
2035 }
2036 }
2037
2038 UnicodeString
2039 UnicodeSetTest::escape(const UnicodeString& s) {
2040 UnicodeString buf;
2041 for (int32_t i=0; i<s.length(); )
2042 {
2043 UChar32 c = s.char32At(i);
2044 if (0x0020 <= c && c <= 0x007F) {
2045 buf += c;
2046 } else {
2047 if (c <= 0xFFFF) {
2048 buf += (UChar)0x5c; buf += (UChar)0x75;
2049 } else {
2050 buf += (UChar)0x5c; buf += (UChar)0x55;
2051 buf += toHexString((c & 0xF0000000) >> 28);
2052 buf += toHexString((c & 0x0F000000) >> 24);
2053 buf += toHexString((c & 0x00F00000) >> 20);
2054 buf += toHexString((c & 0x000F0000) >> 16);
2055 }
2056 buf += toHexString((c & 0xF000) >> 12);
2057 buf += toHexString((c & 0x0F00) >> 8);
2058 buf += toHexString((c & 0x00F0) >> 4);
2059 buf += toHexString(c & 0x000F);
2060 }
2061 i += U16_LENGTH(c);
2062 }
2063 return buf;
2064 }