]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/intltest/usettest.cpp
ICU-6.2.6.tar.gz
[apple/icu.git] / icuSources / test / intltest / usettest.cpp
CommitLineData
b75a7d8f
A
1/*
2**********************************************************************
374ca955 3* Copyright (C) 1999-2004 Alan Liu ,International Business Machines Corporation and
b75a7d8f
A
4* others. All Rights Reserved.
5**********************************************************************
6* Date Name Description
7* 10/20/99 alan Creation.
8* 03/22/2000 Madhu Added additional tests
9**********************************************************************
10*/
11
12#include "unicode/utypes.h"
13#include "usettest.h"
14#include "unicode/uniset.h"
15#include "unicode/uchar.h"
16#include "unicode/usetiter.h"
17#include "unicode/ustring.h"
374ca955
A
18#include "unicode/parsepos.h"
19#include "unicode/symtable.h"
20#include "hash.h"
b75a7d8f
A
21
22UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
23 UnicodeString pat;
24 set.toPattern(pat);
25 return left + UnicodeSetTest::escape(pat);
26}
27
28#define CASE(id,test) case id: \
29 name = #test; \
30 if (exec) { \
31 logln(#test "---"); \
32 logln((UnicodeString)""); \
33 test(); \
34 } \
35 break
36
37void
38UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
39 const char* &name, char* /*par*/) {
40 // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
41 switch (index) {
42 CASE(0,TestPatterns);
43 CASE(1,TestAddRemove);
44 CASE(2,TestCategories);
45 CASE(3,TestCloneEqualHash);
46 CASE(4,TestMinimalRep);
47 CASE(5,TestAPI);
48 CASE(6,TestScriptSet);
49 CASE(7,TestPropertySet);
50 CASE(8,TestClone);
51 CASE(9,TestExhaustive);
52 CASE(10,TestToPattern);
53 CASE(11,TestIndexOf);
54 CASE(12,TestStrings);
374ca955
A
55 CASE(13,Testj2268);
56 CASE(14,TestCloseOver);
57 CASE(15,TestEscapePattern);
58 CASE(16,TestInvalidCodePoint);
59 CASE(17,TestSymbolTable);
60 CASE(18,TestSurrogate);
b75a7d8f
A
61 default: name = ""; break;
62 }
63}
64
374ca955
A
// Sentinel used inside the expected-string arrays handed to
// expectToPattern(). Judging by its use in TestToPattern, entries listed
// after it are expected NOT to be in the set -- confirm against
// expectToPattern(), which is defined elsewhere in this file.
static const char NOT[] = "%%%%";
66
b75a7d8f
A
67/**
68 * UVector was improperly copying contents
69 * This code will crash this is still true
70 */
71void UnicodeSetTest::Testj2268() {
72 UnicodeSet t;
73 t.add(UnicodeString("abc"));
74 UnicodeSet test(t);
75 UnicodeString ustrPat;
76 test.toPattern(ustrPat, TRUE);
77}
78
79/**
374ca955 80 * Test toPattern().
b75a7d8f
A
81 */
82void UnicodeSetTest::TestToPattern() {
374ca955 83 UErrorCode ec = U_ZERO_ERROR;
b75a7d8f 84
374ca955
A
85 // Test that toPattern() round trips with syntax characters and
86 // whitespace.
87 {
88 static const char* OTHER_TOPATTERN_TESTS[] = {
89 "[[:latin:]&[:greek:]]",
90 "[[:latin:]-[:greek:]]",
91 "[:nonspacing mark:]",
92 NULL
93 };
94
95 for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
96 ec = U_ZERO_ERROR;
97 UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
98 if (U_FAILURE(ec)) {
99 errln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j]);
b75a7d8f
A
100 continue;
101 }
374ca955
A
102 checkPat(OTHER_TOPATTERN_TESTS[j], s);
103 }
104
105 for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
106 if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
107
108 // check various combinations to make sure they all work.
109 if (i != 0 && !toPatternAux(i, i)){
110 continue;
111 }
112 if (!toPatternAux(0, i)){
113 continue;
114 }
115 if (!toPatternAux(i, 0xFFFF)){
116 continue;
117 }
b75a7d8f
A
118 }
119 }
120 }
374ca955
A
121
122 // Test pattern behavior of multicharacter strings.
123 {
124 ec = U_ZERO_ERROR;
125 UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
126
127 // This loop isn't a loop. It's here to make the compiler happy.
128 // If you're curious, try removing it and changing the 'break'
129 // statements (except for the last) to goto's.
130 for (;;) {
131 if (U_FAILURE(ec)) break;
132 const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
133 expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
134
135 s->add("ac");
136 const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
137 expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
138
139 s->applyPattern("[a-z {\\{l} {r\\}}]", ec);
140 if (U_FAILURE(ec)) break;
141 const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
142 expectToPattern(*s, "[a-z{r\\}}{\\{l}]", exp3);
143
144 s->add("[]");
145 const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
146 expectToPattern(*s, "[a-z{\\[\\]}{r\\}}{\\{l}]", exp4);
147
148 s->applyPattern("[a-z {\\u4E01\\u4E02}{\\n\\r}]", ec);
149 if (U_FAILURE(ec)) break;
150 const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
151 expectToPattern(*s, "[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]", exp5);
152
153 // j2189
154 s->clear();
155 s->add(UnicodeString("abc", ""));
156 s->add(UnicodeString("abc", ""));
157 const char* exp6[] = {"abc", NOT, "ab", NULL};
158 expectToPattern(*s, "[{abc}]", exp6);
159
160 break;
161 }
162
163 if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
164 delete s;
165 }
166
167 // JB#3400: For 2 character ranges prefer [ab] to [a-b]
168 UnicodeSet s;
169 s.add((UChar)97, (UChar)98); // 'a', 'b'
170 expectToPattern(s, "[ab]", NULL);
b75a7d8f
A
171}
172
173UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
174
175 // use Integer.toString because Utility.hex doesn't handle ints
176 UnicodeString pat = "";
177 // TODO do these in hex
178 //String source = "0x" + Integer.toString(start,16).toUpperCase();
179 //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
180 UnicodeString source;
181 source = source + (uint32_t)start;
182 if (start != end)
183 source = source + ".." + (uint32_t)end;
184 UnicodeSet testSet;
185 testSet.add(start, end);
186 return checkPat(source, testSet);
187}
188
189UBool UnicodeSetTest::checkPat(const UnicodeString& source,
190 const UnicodeSet& testSet) {
191 // What we want to make sure of is that a pattern generated
192 // by toPattern(), with or without escaped unprintables, can
193 // be passed back into the UnicodeSet constructor.
194 UnicodeString pat0;
195
196 testSet.toPattern(pat0, TRUE);
197
198 if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
199
200 //String pat1 = unescapeLeniently(pat0);
201 //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
202
203 UnicodeString pat2;
204 testSet.toPattern(pat2, FALSE);
205 if (!checkPat(source, testSet, pat2)) return FALSE;
206
207 //String pat3 = unescapeLeniently(pat2);
208 // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
209
210 //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
211 logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
212 return TRUE;
213}
214
215UBool UnicodeSetTest::checkPat(const UnicodeString& source,
216 const UnicodeSet& testSet,
217 const UnicodeString& pat) {
218 UErrorCode ec = U_ZERO_ERROR;
219 UnicodeSet testSet2(pat, ec);
220 if (testSet2 != testSet) {
221 errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
222 return FALSE;
223 }
224 return TRUE;
225}
226
227void
228UnicodeSetTest::TestPatterns(void) {
229 UnicodeSet set;
230 expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km");
231 expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz");
232 expectPattern(set, UnicodeString("[a\\-z]", ""), "--aazz");
233 expectPattern(set, UnicodeString("[-az]", ""), "--aazz");
234 expectPattern(set, UnicodeString("[az-]", ""), "--aazz");
235 expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
236
237 // Throw in a test of complement
238 set.complement();
239 UnicodeString exp;
240 exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
241 expectPairs(set, exp);
242}
243
244void
245UnicodeSetTest::TestCategories(void) {
246 UErrorCode status = U_ZERO_ERROR;
247 const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
248 UnicodeSet set(pat, status);
249 if (U_FAILURE(status)) {
250 errln((UnicodeString)"Fail: Can't construct set with " + pat);
251 } else {
252 expectContainment(set, pat, "ABC", "abc");
253 }
254
255 UChar32 i;
256 int32_t failures = 0;
257 // Make sure generation of L doesn't pollute cached Lu set
258 // First generate L, then Lu
259 set.applyPattern("[:L:]", status);
260 if (U_FAILURE(status)) { errln("FAIL"); return; }
261 for (i=0; i<0x200; ++i) {
262 UBool l = u_isalpha((UChar)i);
263 if (l != set.contains(i)) {
264 errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
265 set.contains(i));
266 if (++failures == 10) break;
267 }
268 }
269
270 set.applyPattern("[:Lu:]", status);
271 if (U_FAILURE(status)) { errln("FAIL"); return; }
272 for (i=0; i<0x200; ++i) {
273 UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
274 if (lu != set.contains(i)) {
275 errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
276 set.contains(i));
277 if (++failures == 20) break;
278 }
279 }
280}
281void
282UnicodeSetTest::TestCloneEqualHash(void) {
283 UErrorCode status = U_ZERO_ERROR;
284 // set1 and set2 used to be built with the obsolete constructor taking
285 // UCharCategory values; replaced with pattern constructors
286 // markus 20030502
287 UnicodeSet *set1=new UnicodeSet("\\p{Lowercase Letter}", status); // :Ll: Letter, lowercase
288 UnicodeSet *set1a=new UnicodeSet("[:Ll:]", status); // Letter, lowercase
289 if (U_FAILURE(status)){
290 errln((UnicodeString)"FAIL: Can't construst set with category->Ll");
291 return;
292 }
293 UnicodeSet *set2=new UnicodeSet("\\p{Decimal Number}", status); //Number, Decimal digit
294 UnicodeSet *set2a=new UnicodeSet("[:Nd:]", status); //Number, Decimal digit
295 if (U_FAILURE(status)){
296 errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
297 return;
298 }
299
300 if (*set1 != *set1a) {
301 errln("FAIL: category constructor for Ll broken");
302 }
303 if (*set2 != *set2a) {
304 errln("FAIL: category constructor for Nd broken");
305 }
306 delete set1a;
307 delete set2a;
308
309 logln("Testing copy construction");
310 UnicodeSet *set1copy=new UnicodeSet(*set1);
311 if(*set1 != *set1copy || *set1 == *set2 ||
312 getPairs(*set1) != getPairs(*set1copy) ||
313 set1->hashCode() != set1copy->hashCode()){
314 errln("FAIL : Error in copy construction");
315 return;
316 }
317
318 logln("Testing =operator");
319 UnicodeSet set1equal=*set1;
320 UnicodeSet set2equal=*set2;
321 if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
322 set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
323 errln("FAIL: Error in =operator");
324 }
325
326 logln("Testing clone()");
327 UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
328 UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
329 if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
330 *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
331 *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
332 errln("FAIL: Error in clone");
333 }
334
335 logln("Testing hashcode");
336 if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
337 set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
338 set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
339 set1->hashCode() == set2->hashCode() || set1copy->hashCode() == set2->hashCode() ||
340 set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
341 errln("FAIL: Error in hashCode()");
342 }
343
344 delete set1;
345 delete set1copy;
346 delete set2;
347 delete set1clone;
348 delete set2clone;
349
350
351}
352void
353UnicodeSetTest::TestAddRemove(void) {
354 UnicodeSet set; // Construct empty set
355 doAssert(set.isEmpty() == TRUE, "set should be empty");
356 doAssert(set.size() == 0, "size should be 0");
374ca955
A
357 set.complement();
358 doAssert(set.size() == 0x110000, "size should be 0x110000");
359 set.clear();
b75a7d8f
A
360 set.add(0x0061, 0x007a);
361 expectPairs(set, "az");
362 doAssert(set.isEmpty() == FALSE, "set should not be empty");
363 doAssert(set.size() != 0, "size should not be equal to 0");
364 doAssert(set.size() == 26, "size should be equal to 26");
365 set.remove(0x006d, 0x0070);
366 expectPairs(set, "alqz");
367 doAssert(set.size() == 22, "size should be equal to 22");
368 set.remove(0x0065, 0x0067);
369 expectPairs(set, "adhlqz");
370 doAssert(set.size() == 19, "size should be equal to 19");
371 set.remove(0x0064, 0x0069);
372 expectPairs(set, "acjlqz");
373 doAssert(set.size() == 16, "size should be equal to 16");
374 set.remove(0x0063, 0x0072);
375 expectPairs(set, "absz");
376 doAssert(set.size() == 10, "size should be equal to 10");
377 set.add(0x0066, 0x0071);
378 expectPairs(set, "abfqsz");
379 doAssert(set.size() == 22, "size should be equal to 22");
380 set.remove(0x0061, 0x0067);
381 expectPairs(set, "hqsz");
382 set.remove(0x0061, 0x007a);
383 expectPairs(set, "");
384 doAssert(set.isEmpty() == TRUE, "set should be empty");
385 doAssert(set.size() == 0, "size should be 0");
386 set.add(0x0061);
387 doAssert(set.isEmpty() == FALSE, "set should not be empty");
388 doAssert(set.size() == 1, "size should not be equal to 1");
389 set.add(0x0062);
390 set.add(0x0063);
391 expectPairs(set, "ac");
392 doAssert(set.size() == 3, "size should not be equal to 3");
393 set.add(0x0070);
394 set.add(0x0071);
395 expectPairs(set, "acpq");
396 doAssert(set.size() == 5, "size should not be equal to 5");
397 set.clear();
398 expectPairs(set, "");
399 doAssert(set.isEmpty() == TRUE, "set should be empty");
400 doAssert(set.size() == 0, "size should be 0");
401
402 // Try removing an entire set from another set
403 expectPattern(set, "[c-x]", "cx");
404 UnicodeSet set2;
405 expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
406 set.removeAll(set2);
407 expectPairs(set, "deluxx");
408
409 // Try adding an entire set to another set
410 expectPattern(set, "[jackiemclean]", "aacceein");
411 expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
412 set.addAll(set2);
413 expectPairs(set, "aacehort");
414 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
415
416 // Try retaining an set of elements contained in another set (intersection)
417 UnicodeSet set3;
418 expectPattern(set3, "[a-c]", "ac");
419 doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
420 set3.remove(0x0062);
421 expectPairs(set3, "aacc");
422 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
423 set.retainAll(set3);
424 expectPairs(set, "aacc");
425 doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
426 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
427 set.clear();
428 doAssert(set.size() != set3.size(), "set.size() != set3.size()");
429
430 // Test commutativity
431 expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
432 expectPattern(set2, "[jackiemclean]", "aacceein");
433 set.addAll(set2);
434 expectPairs(set, "aacehort");
435 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
436
437
438
439
440}
441
442/**
443 * Make sure minimal representation is maintained.
444 */
445void UnicodeSetTest::TestMinimalRep() {
446 UErrorCode status = U_ZERO_ERROR;
447 // This is pretty thoroughly tested by checkCanonicalRep()
448 // run against the exhaustive operation results. Use the code
449 // here for debugging specific spot problems.
450
451 // 1 overlap against 2
452 UnicodeSet set("[h-km-q]", status);
453 if (U_FAILURE(status)) { errln("FAIL"); return; }
454 UnicodeSet set2("[i-o]", status);
455 if (U_FAILURE(status)) { errln("FAIL"); return; }
456 set.addAll(set2);
457 expectPairs(set, "hq");
458 // right
459 set.applyPattern("[a-m]", status);
460 if (U_FAILURE(status)) { errln("FAIL"); return; }
461 set2.applyPattern("[e-o]", status);
462 if (U_FAILURE(status)) { errln("FAIL"); return; }
463 set.addAll(set2);
464 expectPairs(set, "ao");
465 // left
466 set.applyPattern("[e-o]", status);
467 if (U_FAILURE(status)) { errln("FAIL"); return; }
468 set2.applyPattern("[a-m]", status);
469 if (U_FAILURE(status)) { errln("FAIL"); return; }
470 set.addAll(set2);
471 expectPairs(set, "ao");
472 // 1 overlap against 3
473 set.applyPattern("[a-eg-mo-w]", status);
474 if (U_FAILURE(status)) { errln("FAIL"); return; }
475 set2.applyPattern("[d-q]", status);
476 if (U_FAILURE(status)) { errln("FAIL"); return; }
477 set.addAll(set2);
478 expectPairs(set, "aw");
479}
480
481void UnicodeSetTest::TestAPI() {
482 UErrorCode status = U_ZERO_ERROR;
483 // default ct
484 UnicodeSet set;
485 if (!set.isEmpty() || set.getRangeCount() != 0) {
486 errln((UnicodeString)"FAIL, set should be empty but isn't: " +
487 set);
488 }
489
490 // clear(), isEmpty()
491 set.add(0x0061);
492 if (set.isEmpty()) {
493 errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
494 set);
495 }
496 set.clear();
497 if (!set.isEmpty()) {
498 errln((UnicodeString)"FAIL, set should be empty but isn't: " +
499 set);
500 }
501
502 // size()
503 set.clear();
504 if (set.size() != 0) {
505 errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
506 ": " + set);
507 }
508 set.add(0x0061);
509 if (set.size() != 1) {
510 errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
511 ": " + set);
512 }
513 set.add(0x0031, 0x0039);
514 if (set.size() != 10) {
515 errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
516 ": " + set);
517 }
518
519 // contains(first, last)
520 set.clear();
521 set.applyPattern("[A-Y 1-8 b-d l-y]", status);
522 if (U_FAILURE(status)) { errln("FAIL"); return; }
523 for (int32_t i = 0; i<set.getRangeCount(); ++i) {
524 UChar32 a = set.getRangeStart(i);
525 UChar32 b = set.getRangeEnd(i);
526 if (!set.contains(a, b)) {
527 errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
528 " but doesn't: " + set);
529 }
530 if (set.contains((UChar32)(a-1), b)) {
531 errln((UnicodeString)"FAIL, shouldn't contain " +
532 (unsigned short)(a-1) + '-' + (unsigned short)b +
533 " but does: " + set);
534 }
535 if (set.contains(a, (UChar32)(b+1))) {
536 errln((UnicodeString)"FAIL, shouldn't contain " +
537 (unsigned short)a + '-' + (unsigned short)(b+1) +
538 " but does: " + set);
539 }
540 }
541
542 // Ported InversionList test.
543 UnicodeSet a((UChar32)3,(UChar32)10);
544 UnicodeSet b((UChar32)7,(UChar32)15);
545 UnicodeSet c;
546
547 logln((UnicodeString)"a [3-10]: " + a);
548 logln((UnicodeString)"b [7-15]: " + b);
374ca955
A
549 c = a;
550 c.addAll(b);
b75a7d8f
A
551 UnicodeSet exp((UChar32)3,(UChar32)15);
552 if (c == exp) {
553 logln((UnicodeString)"c.set(a).add(b): " + c);
554 } else {
555 errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
556 }
557 c.complement();
558 exp.set((UChar32)0, (UChar32)2);
559 exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
560 if (c == exp) {
561 logln((UnicodeString)"c.complement(): " + c);
562 } else {
563 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
564 }
565 c.complement();
566 exp.set((UChar32)3, (UChar32)15);
567 if (c == exp) {
568 logln((UnicodeString)"c.complement(): " + c);
569 } else {
570 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
571 }
374ca955
A
572 c = a;
573 c.complementAll(b);
b75a7d8f
A
574 exp.set((UChar32)3,(UChar32)6);
575 exp.add((UChar32)11,(UChar32) 15);
576 if (c == exp) {
577 logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
578 } else {
579 errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
580 }
581
582 exp = c;
583 bitsToSet(setToBits(c), c);
584 if (c == exp) {
585 logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
586 } else {
587 errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
588 }
589
590 // Additional tests for coverage JB#2118
591 //UnicodeSet::complement(class UnicodeString const &)
592 //UnicodeSet::complementAll(class UnicodeString const &)
593 //UnicodeSet::containsNone(class UnicodeSet const &)
594 //UnicodeSet::containsNone(long,long)
595 //UnicodeSet::containsSome(class UnicodeSet const &)
596 //UnicodeSet::containsSome(long,long)
597 //UnicodeSet::removeAll(class UnicodeString const &)
598 //UnicodeSet::retain(long)
599 //UnicodeSet::retainAll(class UnicodeString const &)
600 //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
601 //UnicodeSetIterator::getString(void)
602 set.clear();
603 set.complement("ab");
604 exp.applyPattern("[{ab}]", status);
605 if (U_FAILURE(status)) { errln("FAIL"); return; }
606 if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
607
608 UnicodeSetIterator iset(set);
609 if (!iset.next() || !iset.isString()) {
610 errln("FAIL: UnicodeSetIterator::next/isString");
611 } else if (iset.getString() != "ab") {
612 errln("FAIL: UnicodeSetIterator::getString");
613 }
614
615 set.add((UChar32)0x61, (UChar32)0x7A);
616 set.complementAll("alan");
617 exp.applyPattern("[{ab}b-kmo-z]", status);
618 if (U_FAILURE(status)) { errln("FAIL"); return; }
619 if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
620
621 exp.applyPattern("[a-z]", status);
622 if (U_FAILURE(status)) { errln("FAIL"); return; }
623 if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
624 if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
625 exp.applyPattern("[aln]", status);
626 if (U_FAILURE(status)) { errln("FAIL"); return; }
627 if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
628 if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
629
630 if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
631 errln("FAIL: containsNone(UChar32, UChar32)");
632 }
633 if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
634 errln("FAIL: containsSome(UChar32, UChar32)");
635 }
636 if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
637 errln("FAIL: containsNone(UChar32, UChar32)");
638 }
639 if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
640 errln("FAIL: containsSome(UChar32, UChar32)");
641 }
642
643 set.removeAll("liu");
644 exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
645 if (U_FAILURE(status)) { errln("FAIL"); return; }
646 if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
647
648 set.retainAll("star");
649 exp.applyPattern("[rst]", status);
650 if (U_FAILURE(status)) { errln("FAIL"); return; }
651 if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
652
653 set.retain((UChar32)0x73);
654 exp.applyPattern("[s]", status);
655 if (U_FAILURE(status)) { errln("FAIL"); return; }
656 if (set != exp) { errln("FAIL: retain('s')"); return; }
657
658 uint16_t buf[32];
659 int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status);
660 if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
661 if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
662 errln("FAIL: serialize");
663 return;
664 }
665}
666
667void UnicodeSetTest::TestStrings() {
668 UErrorCode ec = U_ZERO_ERROR;
669
670 UnicodeSet* testList[] = {
671 UnicodeSet::createFromAll("abc"),
672 new UnicodeSet("[a-c]", ec),
673
674 &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
675 new UnicodeSet("[{ll}{ch}a-z]", ec),
676
677 UnicodeSet::createFrom("ab}c"),
678 new UnicodeSet("[{ab\\}c}]", ec),
679
680 &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
681 new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
682
683 NULL
684 };
685
686 if (U_FAILURE(ec)) {
687 errln("FAIL: couldn't construct test sets");
688 }
689
690 for (int32_t i = 0; testList[i] != NULL; i+=2) {
691 if (U_SUCCESS(ec)) {
692 UnicodeString pat0, pat1;
693 testList[i]->toPattern(pat0, TRUE);
694 testList[i+1]->toPattern(pat1, TRUE);
695 if (*testList[i] == *testList[i+1]) {
696 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
697 } else {
698 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
699 }
700 }
701 delete testList[i];
702 delete testList[i+1];
703 }
704}
705
b75a7d8f
A
706/**
707 * Test the [:Latin:] syntax.
708 */
709void UnicodeSetTest::TestScriptSet() {
710 expectContainment("[:Latin:]", "aA", CharsToUnicodeString("\\u0391\\u03B1"));
711
712 expectContainment("[:Greek:]", CharsToUnicodeString("\\u0391\\u03B1"), "aA");
713
714 /* Jitterbug 1423 */
715 expectContainment("[[:Common:][:Inherited:]]", CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
716
717}
718
719/**
720 * Test the [:Latin:] syntax.
721 */
722void UnicodeSetTest::TestPropertySet() {
723 static const char* DATA[] = {
724 // Pattern, Chars IN, Chars NOT in
725
726 "[:Latin:]",
727 "aA",
728 "\\u0391\\u03B1",
729
730 "[\\p{Greek}]",
731 "\\u0391\\u03B1",
732 "aA",
733
734 "\\P{ GENERAL Category = upper case letter }",
735 "abc",
736 "ABC",
737
738 // Combining class: @since ICU 2.2
739 // Check both symbolic and numeric
740 "\\p{ccc=Nukta}",
741 "\\u0ABC",
742 "abc",
743
744 "\\p{Canonical Combining Class = 11}",
745 "\\u05B1",
746 "\\u05B2",
747
748 "[:c c c = iota subscript :]",
749 "\\u0345",
750 "xyz",
751
752 // Bidi class: @since ICU 2.2
753 "\\p{bidiclass=lefttoright}",
754 "abc",
755 "\\u0671\\u0672",
756
757 // Binary properties: @since ICU 2.2
758 "\\p{ideographic}",
759 "\\u4E0A",
760 "x",
761
762 "[:math=false:]",
374ca955
A
763 "q)*(",
764 // weiv: )(and * were removed from math in Unicode 4.0.1
765 //"(*+)",
766 "+<>^",
b75a7d8f
A
767
768 // JB#1767 \N{}, \p{ASCII}
769 "[:Ascii:]",
770 "abc\\u0000\\u007F",
771 "\\u0080\\u4E00",
772
773 "[\\N{ latin small letter a }[:name= latin small letter z:]]",
774 "az",
775 "qrs",
776
777 // JB#2015
778 "[:any:]",
779 "a\\U0010FFFF",
780 "",
781
782 "[:nv=0.5:]",
783 "\\u00BD\\u0F2A",
784 "\\u00BC",
785
786 // JB#2653: Age
787 "[:Age=1.1:]",
788 "\\u03D6", // 1.1
789 "\\u03D8\\u03D9", // 3.2
790
791 "[:Age=3.1:]",
792 "\\u1800\\u3400\\U0002f800",
793 "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
794
795 // JB#2350: Case_Sensitive
796 "[:Case Sensitive:]",
797 "A\\u1FFC\\U00010410",
798 ";\\u00B4\\U00010500",
799
800 // JB#2832: C99-compatibility props
801 "[:blank:]",
802 " \\u0009",
803 "1-9A-Z",
804
805 "[:graph:]",
806 "19AZ",
807 " \\u0003\\u0007\\u0009\\u000A\\u000D",
808
809 "[:punct:]",
810 "!@#%&*()[]{}-_\\/;:,.?'\"",
811 "09azAZ",
812
813 "[:xdigit:]",
814 "09afAF",
815 "gG!",
816
817 // Regex compatibility test
818 "[-b]", // leading '-' is literal
819 "-b",
820 "ac",
821
822 "[^-b]", // leading '-' is literal
823 "ac",
824 "-b",
825
826 "[b-]", // trailing '-' is literal
827 "-b",
828 "ac",
829
830 "[^b-]", // trailing '-' is literal
831 "ac",
374ca955
A
832 "-b",
833
834 "[a-b-]", // trailing '-' is literal
835 "ab-",
836 "c=",
837
838 "[[a-q]&[p-z]-]", // trailing '-' is literal
839 "pq-",
840 "or=",
841
842 "[\\s|\\)|:|$|\\>]", // from regex tests
843 "s|):$>",
844 "abc",
845
846 "[\\uDC00cd]", // JB#2906: isolated trail at start
847 "cd\\uDC00",
848 "ab\\uD800\\U00010000",
849
850 "[ab\\uD800]", // JB#2906: isolated trail at start
851 "ab\\uD800",
852 "cd\\uDC00\\U00010000",
853
854 "[ab\\uD800cd]", // JB#2906: isolated lead in middle
855 "abcd\\uD800",
856 "ef\\uDC00\\U00010000",
857
858 "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
859 "abcd\\uDC00",
860 "ef\\uD800\\U00010000",
861
862 "[:^lccc=0:]", // Lead canonical class
863 "\\u0300\\u0301",
864 "abcd\\u00c0\\u00c5",
865
866 "[:^tccc=0:]", // Trail canonical class
867 "\\u0300\\u0301\\u00c0\\u00c5",
868 "abcd",
869
870 "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
871 "\\u0300\\u0301\\u00c0\\u00c5",
872 "abcd",
873
874 "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
875 "",
876 "abcd\\u0300\\u0301\\u00c0\\u00c5",
877
878 "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
879 "\\u0F73\\u0F75\\u0F81",
880 "abcd\\u0300\\u0301\\u00c0\\u00c5",
881
b75a7d8f
A
882 };
883
884 static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);
885
886 for (int32_t i=0; i<DATA_LEN; i+=3) {
887 expectContainment(DATA[i], CharsToUnicodeString(DATA[i+1]),
888 CharsToUnicodeString(DATA[i+2]));
889 }
890}
891
892/**
893 * Test cloning of UnicodeSet. For C++, we test the copy constructor.
894 */
895void UnicodeSetTest::TestClone() {
896 UErrorCode ec = U_ZERO_ERROR;
897 UnicodeSet s("[abcxyz]", ec);
898 UnicodeSet t(s);
899 expectContainment(t, "abc", "def");
900}
901
902/**
903 * Test the indexOf() and charAt() methods.
904 */
905void UnicodeSetTest::TestIndexOf() {
906 UErrorCode ec = U_ZERO_ERROR;
907 UnicodeSet set("[a-cx-y3578]", ec);
908 if (U_FAILURE(ec)) {
909 errln("FAIL: UnicodeSet constructor");
910 return;
911 }
912 for (int32_t i=0; i<set.size(); ++i) {
913 UChar32 c = set.charAt(i);
914 if (set.indexOf(c) != i) {
915 errln("FAIL: charAt(%d) = %X => indexOf() => %d",
916 i, c, set.indexOf(c));
917 }
918 }
919 UChar32 c = set.charAt(set.size());
920 if (c != -1) {
921 errln("FAIL: charAt(<out of range>) = %X", c);
922 }
923 int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
924 if (j != -1) {
925 errln((UnicodeString)"FAIL: indexOf('q') = " + j);
926 }
927}
928
929/**
930 * Test closure API.
931 */
932void UnicodeSetTest::TestCloseOver() {
933 UErrorCode ec = U_ZERO_ERROR;
934
935 char CASE[] = {(char)USET_CASE};
374ca955 936 char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
b75a7d8f
A
937 const char* DATA[] = {
938 // selector, input, output
939 CASE,
940 "[aq\\u00DF{Bc}{bC}{Fi}]",
941 "[aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]",
942
943 CASE,
944 "[\\u01F1]", // 'DZ'
945 "[\\u01F1\\u01F2\\u01F3]",
946
947 CASE,
948 "[\\u1FB4]",
949 "[\\u1FB4{\\u03AC\\u03B9}]",
950
951 CASE,
952 "[{F\\uFB01}]",
953 "[\\uFB03{ffi}]",
954
955 CASE, // make sure binary search finds limits
956 "[a\\uFF3A]",
957 "[aA\\uFF3A\\uFF5A]",
958
959 CASE,
960 "[a-z]","[A-Za-z\\u017F\\u212A]",
961 CASE,
962 "[abc]","[A-Ca-c]",
963 CASE,
964 "[ABC]","[A-Ca-c]",
965
374ca955
A
966 CASE_MAPPINGS,
967 "[aq\\u00DF{Bc}{bC}{Fi}]",
968 "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
969
970 CASE_MAPPINGS,
971 "[\\u01F1]", // 'DZ'
972 "[\\u01F1\\u01F2\\u01F3]",
973
974 CASE_MAPPINGS,
975 "[a-z]",
976 "[A-Za-z]",
977
b75a7d8f
A
978 NULL
979 };
980
981 UnicodeSet s;
982 UnicodeSet t;
983 for (int32_t i=0; DATA[i]!=NULL; i+=3) {
984 int32_t selector = DATA[i][0];
985 UnicodeString pat(DATA[i+1]);
986 UnicodeString exp(DATA[i+2]);
987 s.applyPattern(pat, ec);
988 s.closeOver(selector);
989 t.applyPattern(exp, ec);
990 if (U_FAILURE(ec)) {
991 errln("FAIL: applyPattern failed");
992 continue;
993 }
994 if (s == t) {
995 logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
996 } else {
997 UnicodeString buf;
998 errln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
999 s.toPattern(buf, TRUE) + ", expected " + exp);
1000 }
1001 }
1002
1003 // Test the pattern API
374ca955 1004 s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
b75a7d8f
A
1005 if (U_FAILURE(ec)) {
1006 errln("FAIL: applyPattern failed");
1007 } else {
1008 expectContainment(s, "abcABC", "defDEF");
1009 }
374ca955 1010 UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
b75a7d8f
A
1011 if (U_FAILURE(ec)) {
1012 errln("FAIL: constructor failed");
1013 } else {
1014 expectContainment(v, "defDEF", "abcABC");
1015 }
374ca955
A
1016 UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1017 if (U_FAILURE(ec)) {
1018 errln("FAIL: construct w/case mappings failed");
1019 } else {
1020 expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1021 }
b75a7d8f
A
1022}
1023
1024void UnicodeSetTest::TestEscapePattern() {
1025 const char pattern[] =
374ca955 1026 "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
b75a7d8f 1027 const char exp[] =
374ca955 1028 "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
b75a7d8f 1029 // We test this with two passes; in the second pass we
374ca955
A
1030 // pre-unescape the pattern. Since U+200E is rule whitespace,
1031 // this fails -- which is what we expect.
b75a7d8f
A
1032 for (int32_t pass=1; pass<=2; ++pass) {
1033 UErrorCode ec = U_ZERO_ERROR;
1034 UnicodeString pat(pattern);
1035 if (pass==2) {
1036 pat = pat.unescape();
1037 }
1038 // Pattern is only good for pass 1
1039 UBool isPatternValid = (pass==1);
1040
1041 UnicodeSet set(pat, ec);
1042 if (U_SUCCESS(ec) != isPatternValid){
1043 errln((UnicodeString)"FAIL: applyPattern(" +
1044 escape(pat) + ") => " +
1045 u_errorName(ec));
1046 continue;
1047 }
1048 if (U_FAILURE(ec)) {
1049 continue;
1050 }
1051 if (set.contains((UChar)0x0644)){
1052 errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1053 }
1054
1055 UnicodeString newpat;
1056 set.toPattern(newpat, TRUE);
1057 if (newpat == exp) {
1058 logln(escape(pat) + " => " + newpat);
1059 } else {
1060 errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1061 }
1062
1063 for (int32_t i=0; i<set.getRangeCount(); ++i) {
1064 UnicodeString str("Range ");
1065 str.append((UChar)(0x30 + i))
1066 .append(": ")
1067 .append((UChar32)set.getRangeStart(i))
1068 .append(" - ")
1069 .append((UChar32)set.getRangeEnd(i));
1070 str = str + " (" + set.getRangeStart(i) + " - " +
1071 set.getRangeEnd(i) + ")";
1072 if (set.getRangeStart(i) < 0) {
1073 errln((UnicodeString)"FAIL: " + escape(str));
1074 } else {
1075 logln(escape(str));
1076 }
1077 }
1078 }
1079}
1080
1081void UnicodeSetTest::expectRange(const UnicodeString& label,
1082 const UnicodeSet& set,
1083 UChar32 start, UChar32 end) {
1084 UnicodeSet exp(start, end);
1085 UnicodeString pat;
1086 if (set == exp) {
1087 logln(label + " => " + set.toPattern(pat, TRUE));
1088 } else {
1089 UnicodeString xpat;
1090 errln((UnicodeString)"FAIL: " + label + " => " +
1091 set.toPattern(pat, TRUE) +
1092 ", expected " + exp.toPattern(xpat, TRUE));
1093 }
1094}
1095
1096void UnicodeSetTest::TestInvalidCodePoint() {
1097
1098 const UChar32 DATA[] = {
1099 // Test range Expected range
1100 0, 0x10FFFF, 0, 0x10FFFF,
1101 (UChar32)-1, 8, 0, 8,
1102 8, 0x110000, 8, 0x10FFFF
1103 };
1104 const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]);
1105
1106 UnicodeString pat;
1107 int32_t i;
1108
1109 for (i=0; i<DATA_LENGTH; i+=4) {
1110 UChar32 start = DATA[i];
1111 UChar32 end = DATA[i+1];
1112 UChar32 xstart = DATA[i+2];
1113 UChar32 xend = DATA[i+3];
1114
1115 // Try various API using the test code points
1116
1117 UnicodeSet set(start, end);
1118 expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1119 set, xstart, xend);
1120
1121 set.clear();
1122 set.set(start, end);
1123 expectRange((UnicodeString)"set(" + start + "," + end + ")",
1124 set, xstart, xend);
1125
1126 UBool b = set.contains(start);
1127 b = set.contains(start, end);
1128 b = set.containsNone(start, end);
1129 b = set.containsSome(start, end);
1130
374ca955 1131 /*int32_t index = set.indexOf(start);*/
b75a7d8f
A
1132
1133 set.clear();
1134 set.add(start);
1135 set.add(start, end);
1136 expectRange((UnicodeString)"add(" + start + "," + end + ")",
1137 set, xstart, xend);
1138
1139 set.set(0, 0x10FFFF);
1140 set.retain(start, end);
1141 expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1142 set, xstart, xend);
1143 set.retain(start);
1144
1145 set.set(0, 0x10FFFF);
1146 set.remove(start);
1147 set.remove(start, end);
1148 set.complement();
1149 expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1150 set, xstart, xend);
1151
1152 set.set(0, 0x10FFFF);
1153 set.complement(start, end);
1154 set.complement();
1155 expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1156 set, xstart, xend);
1157 set.complement(start);
1158 }
1159
1160 const UChar32 DATA2[] = {
1161 0,
1162 0x10FFFF,
1163 (UChar32)-1,
1164 0x110000
1165 };
1166 const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]);
1167
1168 for (i=0; i<DATA2_LENGTH; ++i) {
1169 UChar32 c = DATA2[i], end = 0x10FFFF;
1170 UBool valid = (c >= 0 && c <= 0x10FFFF);
1171
1172 UnicodeSet set(0, 0x10FFFF);
1173
1174 // For single-codepoint contains, invalid codepoints are NOT contained
1175 UBool b = set.contains(c);
1176 if (b == valid) {
1177 logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1178 ") = " + b);
1179 } else {
1180 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1181 ") = " + b);
1182 }
1183
1184 // For codepoint range contains, containsNone, and containsSome,
1185 // invalid or empty (start > end) ranges have UNDEFINED behavior.
1186 b = set.contains(c, end);
1187 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1188 "," + end + ") = " + b);
1189
1190 b = set.containsNone(c, end);
1191 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1192 "," + end + ") = " + b);
1193
1194 b = set.containsSome(c, end);
1195 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1196 "," + end + ") = " + b);
1197
1198 int32_t index = set.indexOf(c);
1199 if ((index >= 0) == valid) {
1200 logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1201 ") = " + index);
1202 } else {
1203 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1204 ") = " + index);
1205 }
1206 }
1207}
1208
374ca955
A
1209// Used by TestSymbolTable
1210class TokenSymbolTable : public SymbolTable {
1211public:
1212 Hashtable contents;
1213
1214 TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1215 contents.setValueDeleter(uhash_deleteUnicodeString);
1216 }
1217
1218 ~TokenSymbolTable() {}
1219
1220 /**
1221 * (Non-SymbolTable API) Add the given variable and value to
1222 * the table. Variable should NOT contain leading '$'.
1223 */
1224 void add(const UnicodeString& var, const UnicodeString& value,
1225 UErrorCode& ec) {
1226 if (U_SUCCESS(ec)) {
1227 contents.put(var, new UnicodeString(value), ec);
1228 }
1229 }
1230
1231 /**
1232 * SymbolTable API
1233 */
1234 virtual const UnicodeString* lookup(const UnicodeString& s) const {
1235 return (const UnicodeString*) contents.get(s);
1236 }
1237
1238 /**
1239 * SymbolTable API
1240 */
1241 virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1242 return NULL;
1243 }
1244
1245 /**
1246 * SymbolTable API
1247 */
1248 virtual UnicodeString parseReference(const UnicodeString& text,
1249 ParsePosition& pos, int32_t limit) const {
1250 int32_t start = pos.getIndex();
1251 int32_t i = start;
1252 UnicodeString result;
1253 while (i < limit) {
1254 UChar c = text.charAt(i);
1255 if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1256 break;
1257 }
1258 ++i;
1259 }
1260 if (i == start) { // No valid name chars
1261 return result; // Indicate failure with empty string
1262 }
1263 pos.setIndex(i);
1264 text.extractBetween(start, i, result);
1265 return result;
1266 }
1267};
1268
1269void UnicodeSetTest::TestSymbolTable() {
1270 // Multiple test cases can be set up here. Each test case
1271 // is terminated by null:
1272 // var, value, var, value,..., input pat., exp. output pat., null
1273 const char* DATA[] = {
1274 "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1275 "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1276 "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1277 NULL
1278 };
1279
1280 for (int32_t i=0; DATA[i]!=NULL; ++i) {
1281 UErrorCode ec = U_ZERO_ERROR;
1282 TokenSymbolTable sym(ec);
1283 if (U_FAILURE(ec)) {
1284 errln("FAIL: couldn't construct TokenSymbolTable");
1285 continue;
1286 }
1287
1288 // Set up variables
1289 while (DATA[i+2] != NULL) {
1290 sym.add(DATA[i], DATA[i+1], ec);
1291 if (U_FAILURE(ec)) {
1292 errln("FAIL: couldn't add to TokenSymbolTable");
1293 continue;
1294 }
1295 i += 2;
1296 }
1297
1298 // Input pattern and expected output pattern
1299 UnicodeString inpat = DATA[i], exppat = DATA[i+1];
1300 i += 2;
1301
1302 ParsePosition pos(0);
1303 UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1304 if (U_FAILURE(ec)) {
1305 errln("FAIL: couldn't construct UnicodeSet");
1306 continue;
1307 }
1308
1309 // results
1310 if (pos.getIndex() != inpat.length()) {
1311 errln((UnicodeString)"Failed to read to end of string \""
1312 + inpat + "\": read to "
1313 + pos.getIndex() + ", length is "
1314 + inpat.length());
1315 }
1316
1317 UnicodeSet us2(exppat, ec);
1318 if (U_FAILURE(ec)) {
1319 errln("FAIL: couldn't construct expected UnicodeSet");
1320 continue;
1321 }
1322
1323 UnicodeString a, b;
1324 if (us != us2) {
1325 errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1326 ", expected " + us2.toPattern(b, TRUE));
1327 } else {
1328 logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1329 }
1330 }
1331}
1332
1333void UnicodeSetTest::TestSurrogate() {
1334 const char* DATA[] = {
1335 // These should all behave identically
1336 "[abc\\uD800\\uDC00]",
1337 // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1338 "[abc\\U00010000]",
1339 0
1340 };
1341 for (int i=0; DATA[i] != 0; ++i) {
1342 UErrorCode ec = U_ZERO_ERROR;
1343 logln((UnicodeString)"Test pattern " + i + " :" + DATA[i]);
1344 UnicodeSet set(DATA[i], ec);
1345 if (U_FAILURE(ec)) {
1346 errln("FAIL: UnicodeSet constructor");
1347 continue;
1348 }
1349 expectContainment(set,
1350 CharsToUnicodeString("abc\\U00010000"),
1351 CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1352 if (set.size() != 4) {
1353 errln((UnicodeString)"FAIL: " + DATA[i] + ".size() == " +
1354 set.size() + ", expected 4");
1355 }
1356 }
1357}
1358
b75a7d8f
A
1359void UnicodeSetTest::TestExhaustive() {
1360 // exhaustive tests. Simulate UnicodeSets with integers.
1361 // That gives us very solid tests (except for large memory tests).
1362
1363 int32_t limit = 128;
1364
1365 UnicodeSet x, y, z, aa;
1366
1367 for (int32_t i = 0; i < limit; ++i) {
1368 bitsToSet(i, x);
1369 logln((UnicodeString)"Testing " + i + ", " + x);
1370 _testComplement(i, x, y);
1371
1372 // AS LONG AS WE ARE HERE, check roundtrip
1373 checkRoundTrip(bitsToSet(i, aa));
1374
1375 for (int32_t j = 0; j < limit; ++j) {
1376 _testAdd(i,j, x,y,z);
1377 _testXor(i,j, x,y,z);
1378 _testRetain(i,j, x,y,z);
1379 _testRemove(i,j, x,y,z);
1380 }
1381 }
1382}
1383
1384void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1385 bitsToSet(a, x);
1386 z = x;
1387 z.complement();
1388 int32_t c = setToBits(z);
1389 if (c != (~a)) {
1390 errln((UnicodeString)"FAILED: add: ~" + x + " != " + z);
1391 errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1392 }
1393 checkCanonicalRep(z, (UnicodeString)"complement " + a);
1394}
1395
1396void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1397 bitsToSet(a, x);
1398 bitsToSet(b, y);
1399 z = x;
1400 z.addAll(y);
1401 int32_t c = setToBits(z);
1402 if (c != (a | b)) {
1403 errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1404 errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1405 }
1406 checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1407}
1408
1409void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1410 bitsToSet(a, x);
1411 bitsToSet(b, y);
1412 z = x;
1413 z.retainAll(y);
1414 int32_t c = setToBits(z);
1415 if (c != (a & b)) {
1416 errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1417 errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1418 }
1419 checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1420}
1421
1422void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1423 bitsToSet(a, x);
1424 bitsToSet(b, y);
1425 z = x;
1426 z.removeAll(y);
1427 int32_t c = setToBits(z);
1428 if (c != (a &~ b)) {
1429 errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1430 errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1431 }
1432 checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1433}
1434
1435void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1436 bitsToSet(a, x);
1437 bitsToSet(b, y);
1438 z = x;
1439 z.complementAll(y);
1440 int32_t c = setToBits(z);
1441 if (c != (a ^ b)) {
1442 errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1443 errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1444 }
1445 checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1446}
1447
1448/**
1449 * Check that ranges are monotonically increasing and non-
1450 * overlapping.
1451 */
1452void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1453 int32_t n = set.getRangeCount();
1454 if (n < 0) {
1455 errln((UnicodeString)"FAIL result of " + msg +
1456 ": range count should be >= 0 but is " +
1457 n /*+ " for " + set.toPattern())*/);
1458 return;
1459 }
1460 UChar32 last = 0;
1461 for (int32_t i=0; i<n; ++i) {
1462 UChar32 start = set.getRangeStart(i);
1463 UChar32 end = set.getRangeEnd(i);
1464 if (start > end) {
1465 errln((UnicodeString)"FAIL result of " + msg +
1466 ": range " + (i+1) +
1467 " start > end: " + (int)start + ", " + (int)end +
1468 " for " + set);
1469 }
1470 if (i > 0 && start <= last) {
1471 errln((UnicodeString)"FAIL result of " + msg +
1472 ": range " + (i+1) +
1473 " overlaps previous range: " + (int)start + ", " + (int)end +
1474 " for " + set);
1475 }
1476 last = end;
1477 }
1478}
1479
1480/**
1481 * Convert a bitmask to a UnicodeSet.
1482 */
1483UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1484 result.clear();
1485 for (UChar32 i = 0; i < 32; ++i) {
1486 if ((a & (1<<i)) != 0) {
1487 result.add(i);
1488 }
1489 }
1490 return result;
1491}
1492
1493/**
1494 * Convert a UnicodeSet to a bitmask. Only the characters
1495 * U+0000 to U+0020 are represented in the bitmask.
1496 */
1497int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1498 int32_t result = 0;
1499 for (int32_t i = 0; i < 32; ++i) {
1500 if (x.contains((UChar32)i)) {
1501 result |= (1<<i);
1502 }
1503 }
1504 return result;
1505}
1506
1507/**
1508 * Return the representation of an inversion list based UnicodeSet
1509 * as a pairs list. Ranges are listed in ascending Unicode order.
1510 * For example, the set [a-zA-M3] is represented as "33AMaz".
1511 */
1512UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1513 UnicodeString pairs;
1514 for (int32_t i=0; i<set.getRangeCount(); ++i) {
1515 UChar32 start = set.getRangeStart(i);
1516 UChar32 end = set.getRangeEnd(i);
1517 if (end > 0xFFFF) {
1518 end = 0xFFFF;
1519 i = set.getRangeCount(); // Should be unnecessary
1520 }
1521 pairs.append((UChar)start).append((UChar)end);
1522 }
1523 return pairs;
1524}
1525
1526/**
1527 * Basic consistency check for a few items.
1528 * That the iterator works, and that we can create a pattern and
1529 * get the same thing back
1530 */
1531void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1532 UErrorCode ec = U_ZERO_ERROR;
1533
1534 UnicodeSet t(s);
1535 checkEqual(s, t, "copy ct");
1536
1537 t = s;
1538 checkEqual(s, t, "operator=");
1539
1540 copyWithIterator(t, s, FALSE);
1541 checkEqual(s, t, "iterator roundtrip");
1542
1543 copyWithIterator(t, s, TRUE); // try range
1544 checkEqual(s, t, "iterator roundtrip");
1545
1546 UnicodeString pat; s.toPattern(pat, FALSE);
1547 t.applyPattern(pat, ec);
1548 if (U_FAILURE(ec)) {
1549 errln("FAIL: applyPattern");
1550 return;
1551 } else {
1552 checkEqual(s, t, "toPattern(false)");
1553 }
1554
1555 s.toPattern(pat, TRUE);
1556 t.applyPattern(pat, ec);
1557 if (U_FAILURE(ec)) {
1558 errln("FAIL: applyPattern");
1559 return;
1560 } else {
1561 checkEqual(s, t, "toPattern(true)");
1562 }
1563}
1564
1565void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
1566 t.clear();
1567 UnicodeSetIterator it(s);
1568 if (withRange) {
1569 while (it.nextRange()) {
1570 if (it.isString()) {
1571 t.add(it.getString());
1572 } else {
1573 t.add(it.getCodepoint(), it.getCodepointEnd());
1574 }
1575 }
1576 } else {
1577 while (it.next()) {
1578 if (it.isString()) {
1579 t.add(it.getString());
1580 } else {
1581 t.add(it.getCodepoint());
1582 }
1583 }
1584 }
1585}
1586
1587UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
1588 UnicodeString source; s.toPattern(source, TRUE);
1589 UnicodeString result; t.toPattern(result, TRUE);
1590 if (s != t) {
1591 errln((UnicodeString)"FAIL: " + message
1592 + "; source = " + source
1593 + "; result = " + result
1594 );
1595 return FALSE;
1596 } else {
1597 logln((UnicodeString)"Ok: " + message
1598 + "; source = " + source
1599 + "; result = " + result
1600 );
1601 }
1602 return TRUE;
1603}
1604
1605void
1606UnicodeSetTest::expectContainment(const UnicodeString& pat,
1607 const UnicodeString& charsIn,
1608 const UnicodeString& charsOut) {
1609 UErrorCode ec = U_ZERO_ERROR;
1610 UnicodeSet set(pat, ec);
1611 if (U_FAILURE(ec)) {
1612 errln((UnicodeString)"FAIL: pattern \"" +
1613 pat + "\" => " + u_errorName(ec));
1614 return;
1615 }
1616 expectContainment(set, pat, charsIn, charsOut);
1617}
1618
1619void
1620UnicodeSetTest::expectContainment(const UnicodeSet& set,
1621 const UnicodeString& charsIn,
1622 const UnicodeString& charsOut) {
1623 UnicodeString pat;
1624 set.toPattern(pat);
1625 expectContainment(set, pat, charsIn, charsOut);
1626}
1627
1628void
1629UnicodeSetTest::expectContainment(const UnicodeSet& set,
1630 const UnicodeString& setName,
1631 const UnicodeString& charsIn,
1632 const UnicodeString& charsOut) {
1633 UnicodeString bad;
1634 UChar32 c;
1635 int32_t i;
1636
1637 for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
1638 c = charsIn.char32At(i);
1639 if (!set.contains(c)) {
1640 bad.append(c);
1641 }
1642 }
1643 if (bad.length() > 0) {
1644 errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
1645 ", expected containment of " + prettify(charsIn));
1646 } else {
1647 logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
1648 }
1649
1650 bad.truncate(0);
1651 for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
1652 c = charsOut.char32At(i);
1653 if (set.contains(c)) {
1654 bad.append(c);
1655 }
1656 }
1657 if (bad.length() > 0) {
1658 errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
1659 ", expected non-containment of " + prettify(charsOut));
1660 } else {
1661 logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
1662 }
1663}
1664
1665void
1666UnicodeSetTest::expectPattern(UnicodeSet& set,
1667 const UnicodeString& pattern,
1668 const UnicodeString& expectedPairs){
1669 UErrorCode status = U_ZERO_ERROR;
1670 set.applyPattern(pattern, status);
1671 if (U_FAILURE(status)) {
1672 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
1673 "\") failed");
1674 return;
1675 } else {
1676 if (getPairs(set) != expectedPairs ) {
1677 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
1678 "\") => pairs \"" +
1679 escape(getPairs(set)) + "\", expected \"" +
1680 escape(expectedPairs) + "\"");
1681 } else {
1682 logln(UnicodeString("Ok: applyPattern(\"") + pattern +
1683 "\") => pairs \"" +
1684 escape(getPairs(set)) + "\"");
1685 }
1686 }
1687 // the result of calling set.toPattern(), which is the string representation of
1688 // this set(set), is passed to a UnicodeSet constructor, and tested that it
1689 // will produce another set that is equal to this one.
1690 UnicodeString temppattern;
1691 set.toPattern(temppattern);
1692 UnicodeSet *tempset=new UnicodeSet(temppattern, status);
1693 if (U_FAILURE(status)) {
1694 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
1695 return;
1696 }
1697 if(*tempset != set || getPairs(*tempset) != getPairs(set)){
1698 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
1699 escape(getPairs(set)) + "\""));
1700 } else{
1701 logln(UnicodeString("Ok: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
1702 }
1703
1704 delete tempset;
1705
1706}
1707
1708void
1709UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
1710 if (getPairs(set) != expectedPairs) {
1711 errln(UnicodeString("FAIL: Expected pair list \"") +
1712 escape(expectedPairs) + "\", got \"" +
1713 escape(getPairs(set)) + "\"");
1714 }
1715}
1716
1717void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
1718 const UnicodeString& expPat,
1719 const char** expStrings) {
1720 UnicodeString pat;
1721 set.toPattern(pat, TRUE);
1722 if (pat == expPat) {
1723 logln((UnicodeString)"Ok: toPattern() => \"" + pat + "\"");
1724 } else {
1725 errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
1726 return;
1727 }
374ca955
A
1728 if (expStrings == NULL) {
1729 return;
1730 }
b75a7d8f
A
1731 UBool in = TRUE;
1732 for (int32_t i=0; expStrings[i] != NULL; ++i) {
1733 if (expStrings[i] == NOT) { // sic; pointer comparison
1734 in = FALSE;
1735 continue;
1736 }
1737 UnicodeString s = CharsToUnicodeString(expStrings[i]);
1738 UBool contained = set.contains(s);
1739 if (contained == in) {
1740 logln((UnicodeString)"Ok: " + expPat +
1741 (contained ? " contains {" : " does not contain {") +
1742 escape(expStrings[i]) + "}");
1743 } else {
1744 errln((UnicodeString)"FAIL: " + expPat +
1745 (contained ? " contains {" : " does not contain {") +
1746 escape(expStrings[i]) + "}");
1747 }
1748 }
1749}
1750
1751static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
1752
1753void
1754UnicodeSetTest::doAssert(UBool condition, const char *message)
1755{
1756 if (!condition) {
1757 errln(UnicodeString("ERROR : ") + message);
1758 }
1759}
1760
1761UnicodeString
1762UnicodeSetTest::escape(const UnicodeString& s) {
1763 UnicodeString buf;
1764 for (int32_t i=0; i<s.length(); )
1765 {
1766 UChar32 c = s.char32At(i);
1767 if (0x0020 <= c && c <= 0x007F) {
1768 buf += c;
1769 } else {
1770 if (c <= 0xFFFF) {
1771 buf += (UChar)0x5c; buf += (UChar)0x75;
1772 } else {
1773 buf += (UChar)0x5c; buf += (UChar)0x55;
1774 buf += toHexString((c & 0xF0000000) >> 28);
1775 buf += toHexString((c & 0x0F000000) >> 24);
1776 buf += toHexString((c & 0x00F00000) >> 20);
1777 buf += toHexString((c & 0x000F0000) >> 16);
1778 }
1779 buf += toHexString((c & 0xF000) >> 12);
1780 buf += toHexString((c & 0x0F00) >> 8);
1781 buf += toHexString((c & 0x00F0) >> 4);
1782 buf += toHexString(c & 0x000F);
1783 }
1784 i += U16_LENGTH(c);
1785 }
1786 return buf;
1787}