]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/usettest.cpp
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / test / intltest / usettest.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 1999-2003 Alan Liu ,International Business Machines Corporation and
4 * others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 10/20/99 alan Creation.
8 * 03/22/2000 Madhu Added additional tests
9 **********************************************************************
10 */
11
12 #include "unicode/utypes.h"
13 #include "usettest.h"
14 #include "unicode/uniset.h"
15 #include "unicode/uchar.h"
16 #include "unicode/usetiter.h"
17 #include "unicode/ustring.h"
18
19 UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
20 UnicodeString pat;
21 set.toPattern(pat);
22 return left + UnicodeSetTest::escape(pat);
23 }
24
// CASE(id, test) expands to one arm of a switch statement: it stores the
// stringized test name in `name` and, when `exec` is set, logs a banner
// and calls the test. The expansion ends in `break` with no semicolon,
// so each use site supplies its own trailing ';'.
#define CASE(id,test)                 \
    case id:                          \
        name = #test;                 \
        if (exec) {                   \
            logln(#test "---");       \
            logln((UnicodeString)""); \
            test();                   \
        }                             \
        break
33
34 void
35 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
36 const char* &name, char* /*par*/) {
37 // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
38 switch (index) {
39 CASE(0,TestPatterns);
40 CASE(1,TestAddRemove);
41 CASE(2,TestCategories);
42 CASE(3,TestCloneEqualHash);
43 CASE(4,TestMinimalRep);
44 CASE(5,TestAPI);
45 CASE(6,TestScriptSet);
46 CASE(7,TestPropertySet);
47 CASE(8,TestClone);
48 CASE(9,TestExhaustive);
49 CASE(10,TestToPattern);
50 CASE(11,TestIndexOf);
51 CASE(12,TestStrings);
52 CASE(13,TestStringPatterns);
53 CASE(14,Testj2268);
54 CASE(15,TestCloseOver);
55 CASE(16,TestEscapePattern);
56 CASE(17,TestInvalidCodePoint);
57 default: name = ""; break;
58 }
59 }
60
61 /**
62 * UVector was improperly copying contents
63 * This code will crash this is still true
64 */
65 void UnicodeSetTest::Testj2268() {
66 UnicodeSet t;
67 t.add(UnicodeString("abc"));
68 UnicodeSet test(t);
69 UnicodeString ustrPat;
70 test.toPattern(ustrPat, TRUE);
71 }
72
73 /**
74 * Test that toPattern() round trips with syntax characters and
75 * whitespace.
76 */
77 void UnicodeSetTest::TestToPattern() {
78 static const char* OTHER_TOPATTERN_TESTS[] = {
79 "[[:latin:]&[:greek:]]",
80 "[[:latin:]-[:greek:]]",
81 "[:nonspacing mark:]",
82 NULL
83 };
84
85 for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
86 UErrorCode ec = U_ZERO_ERROR;
87 UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
88 if (U_FAILURE(ec)) {
89 errln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j]);
90 continue;
91 }
92 checkPat(OTHER_TOPATTERN_TESTS[j], s);
93 }
94
95 for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
96 if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
97
98 // check various combinations to make sure they all work.
99 if (i != 0 && !toPatternAux(i, i)){
100 continue;
101 }
102 if (!toPatternAux(0, i)){
103 continue;
104 }
105 if (!toPatternAux(i, 0xFFFF)){
106 continue;
107 }
108 }
109 }
110 }
111
112 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
113
114 // use Integer.toString because Utility.hex doesn't handle ints
115 UnicodeString pat = "";
116 // TODO do these in hex
117 //String source = "0x" + Integer.toString(start,16).toUpperCase();
118 //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
119 UnicodeString source;
120 source = source + (uint32_t)start;
121 if (start != end)
122 source = source + ".." + (uint32_t)end;
123 UnicodeSet testSet;
124 testSet.add(start, end);
125 return checkPat(source, testSet);
126 }
127
128 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
129 const UnicodeSet& testSet) {
130 // What we want to make sure of is that a pattern generated
131 // by toPattern(), with or without escaped unprintables, can
132 // be passed back into the UnicodeSet constructor.
133 UnicodeString pat0;
134
135 testSet.toPattern(pat0, TRUE);
136
137 if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
138
139 //String pat1 = unescapeLeniently(pat0);
140 //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
141
142 UnicodeString pat2;
143 testSet.toPattern(pat2, FALSE);
144 if (!checkPat(source, testSet, pat2)) return FALSE;
145
146 //String pat3 = unescapeLeniently(pat2);
147 // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
148
149 //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
150 logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
151 return TRUE;
152 }
153
154 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
155 const UnicodeSet& testSet,
156 const UnicodeString& pat) {
157 UErrorCode ec = U_ZERO_ERROR;
158 UnicodeSet testSet2(pat, ec);
159 if (testSet2 != testSet) {
160 errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
161 return FALSE;
162 }
163 return TRUE;
164 }
165
166 void
167 UnicodeSetTest::TestPatterns(void) {
168 UnicodeSet set;
169 expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km");
170 expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz");
171 expectPattern(set, UnicodeString("[a\\-z]", ""), "--aazz");
172 expectPattern(set, UnicodeString("[-az]", ""), "--aazz");
173 expectPattern(set, UnicodeString("[az-]", ""), "--aazz");
174 expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
175
176 // Throw in a test of complement
177 set.complement();
178 UnicodeString exp;
179 exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
180 expectPairs(set, exp);
181 }
182
183 void
184 UnicodeSetTest::TestCategories(void) {
185 UErrorCode status = U_ZERO_ERROR;
186 const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
187 UnicodeSet set(pat, status);
188 if (U_FAILURE(status)) {
189 errln((UnicodeString)"Fail: Can't construct set with " + pat);
190 } else {
191 expectContainment(set, pat, "ABC", "abc");
192 }
193
194 UChar32 i;
195 int32_t failures = 0;
196 // Make sure generation of L doesn't pollute cached Lu set
197 // First generate L, then Lu
198 set.applyPattern("[:L:]", status);
199 if (U_FAILURE(status)) { errln("FAIL"); return; }
200 for (i=0; i<0x200; ++i) {
201 UBool l = u_isalpha((UChar)i);
202 if (l != set.contains(i)) {
203 errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
204 set.contains(i));
205 if (++failures == 10) break;
206 }
207 }
208
209 set.applyPattern("[:Lu:]", status);
210 if (U_FAILURE(status)) { errln("FAIL"); return; }
211 for (i=0; i<0x200; ++i) {
212 UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
213 if (lu != set.contains(i)) {
214 errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
215 set.contains(i));
216 if (++failures == 20) break;
217 }
218 }
219 }
220 void
221 UnicodeSetTest::TestCloneEqualHash(void) {
222 UErrorCode status = U_ZERO_ERROR;
223 // set1 and set2 used to be built with the obsolete constructor taking
224 // UCharCategory values; replaced with pattern constructors
225 // markus 20030502
226 UnicodeSet *set1=new UnicodeSet("\\p{Lowercase Letter}", status); // :Ll: Letter, lowercase
227 UnicodeSet *set1a=new UnicodeSet("[:Ll:]", status); // Letter, lowercase
228 if (U_FAILURE(status)){
229 errln((UnicodeString)"FAIL: Can't construst set with category->Ll");
230 return;
231 }
232 UnicodeSet *set2=new UnicodeSet("\\p{Decimal Number}", status); //Number, Decimal digit
233 UnicodeSet *set2a=new UnicodeSet("[:Nd:]", status); //Number, Decimal digit
234 if (U_FAILURE(status)){
235 errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
236 return;
237 }
238
239 if (*set1 != *set1a) {
240 errln("FAIL: category constructor for Ll broken");
241 }
242 if (*set2 != *set2a) {
243 errln("FAIL: category constructor for Nd broken");
244 }
245 delete set1a;
246 delete set2a;
247
248 logln("Testing copy construction");
249 UnicodeSet *set1copy=new UnicodeSet(*set1);
250 if(*set1 != *set1copy || *set1 == *set2 ||
251 getPairs(*set1) != getPairs(*set1copy) ||
252 set1->hashCode() != set1copy->hashCode()){
253 errln("FAIL : Error in copy construction");
254 return;
255 }
256
257 logln("Testing =operator");
258 UnicodeSet set1equal=*set1;
259 UnicodeSet set2equal=*set2;
260 if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
261 set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
262 errln("FAIL: Error in =operator");
263 }
264
265 logln("Testing clone()");
266 UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
267 UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
268 if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
269 *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
270 *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
271 errln("FAIL: Error in clone");
272 }
273
274 logln("Testing hashcode");
275 if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
276 set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
277 set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
278 set1->hashCode() == set2->hashCode() || set1copy->hashCode() == set2->hashCode() ||
279 set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
280 errln("FAIL: Error in hashCode()");
281 }
282
283 delete set1;
284 delete set1copy;
285 delete set2;
286 delete set1clone;
287 delete set2clone;
288
289
290 }
291 void
292 UnicodeSetTest::TestAddRemove(void) {
293 UnicodeSet set; // Construct empty set
294 doAssert(set.isEmpty() == TRUE, "set should be empty");
295 doAssert(set.size() == 0, "size should be 0");
296 set.add(0x0061, 0x007a);
297 expectPairs(set, "az");
298 doAssert(set.isEmpty() == FALSE, "set should not be empty");
299 doAssert(set.size() != 0, "size should not be equal to 0");
300 doAssert(set.size() == 26, "size should be equal to 26");
301 set.remove(0x006d, 0x0070);
302 expectPairs(set, "alqz");
303 doAssert(set.size() == 22, "size should be equal to 22");
304 set.remove(0x0065, 0x0067);
305 expectPairs(set, "adhlqz");
306 doAssert(set.size() == 19, "size should be equal to 19");
307 set.remove(0x0064, 0x0069);
308 expectPairs(set, "acjlqz");
309 doAssert(set.size() == 16, "size should be equal to 16");
310 set.remove(0x0063, 0x0072);
311 expectPairs(set, "absz");
312 doAssert(set.size() == 10, "size should be equal to 10");
313 set.add(0x0066, 0x0071);
314 expectPairs(set, "abfqsz");
315 doAssert(set.size() == 22, "size should be equal to 22");
316 set.remove(0x0061, 0x0067);
317 expectPairs(set, "hqsz");
318 set.remove(0x0061, 0x007a);
319 expectPairs(set, "");
320 doAssert(set.isEmpty() == TRUE, "set should be empty");
321 doAssert(set.size() == 0, "size should be 0");
322 set.add(0x0061);
323 doAssert(set.isEmpty() == FALSE, "set should not be empty");
324 doAssert(set.size() == 1, "size should not be equal to 1");
325 set.add(0x0062);
326 set.add(0x0063);
327 expectPairs(set, "ac");
328 doAssert(set.size() == 3, "size should not be equal to 3");
329 set.add(0x0070);
330 set.add(0x0071);
331 expectPairs(set, "acpq");
332 doAssert(set.size() == 5, "size should not be equal to 5");
333 set.clear();
334 expectPairs(set, "");
335 doAssert(set.isEmpty() == TRUE, "set should be empty");
336 doAssert(set.size() == 0, "size should be 0");
337
338 // Try removing an entire set from another set
339 expectPattern(set, "[c-x]", "cx");
340 UnicodeSet set2;
341 expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
342 set.removeAll(set2);
343 expectPairs(set, "deluxx");
344
345 // Try adding an entire set to another set
346 expectPattern(set, "[jackiemclean]", "aacceein");
347 expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
348 set.addAll(set2);
349 expectPairs(set, "aacehort");
350 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
351
352 // Try retaining an set of elements contained in another set (intersection)
353 UnicodeSet set3;
354 expectPattern(set3, "[a-c]", "ac");
355 doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
356 set3.remove(0x0062);
357 expectPairs(set3, "aacc");
358 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
359 set.retainAll(set3);
360 expectPairs(set, "aacc");
361 doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
362 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
363 set.clear();
364 doAssert(set.size() != set3.size(), "set.size() != set3.size()");
365
366 // Test commutativity
367 expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
368 expectPattern(set2, "[jackiemclean]", "aacceein");
369 set.addAll(set2);
370 expectPairs(set, "aacehort");
371 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
372
373
374
375
376 }
377
378 /**
379 * Make sure minimal representation is maintained.
380 */
381 void UnicodeSetTest::TestMinimalRep() {
382 UErrorCode status = U_ZERO_ERROR;
383 // This is pretty thoroughly tested by checkCanonicalRep()
384 // run against the exhaustive operation results. Use the code
385 // here for debugging specific spot problems.
386
387 // 1 overlap against 2
388 UnicodeSet set("[h-km-q]", status);
389 if (U_FAILURE(status)) { errln("FAIL"); return; }
390 UnicodeSet set2("[i-o]", status);
391 if (U_FAILURE(status)) { errln("FAIL"); return; }
392 set.addAll(set2);
393 expectPairs(set, "hq");
394 // right
395 set.applyPattern("[a-m]", status);
396 if (U_FAILURE(status)) { errln("FAIL"); return; }
397 set2.applyPattern("[e-o]", status);
398 if (U_FAILURE(status)) { errln("FAIL"); return; }
399 set.addAll(set2);
400 expectPairs(set, "ao");
401 // left
402 set.applyPattern("[e-o]", status);
403 if (U_FAILURE(status)) { errln("FAIL"); return; }
404 set2.applyPattern("[a-m]", status);
405 if (U_FAILURE(status)) { errln("FAIL"); return; }
406 set.addAll(set2);
407 expectPairs(set, "ao");
408 // 1 overlap against 3
409 set.applyPattern("[a-eg-mo-w]", status);
410 if (U_FAILURE(status)) { errln("FAIL"); return; }
411 set2.applyPattern("[d-q]", status);
412 if (U_FAILURE(status)) { errln("FAIL"); return; }
413 set.addAll(set2);
414 expectPairs(set, "aw");
415 }
416
417 void UnicodeSetTest::TestAPI() {
418 UErrorCode status = U_ZERO_ERROR;
419 // default ct
420 UnicodeSet set;
421 if (!set.isEmpty() || set.getRangeCount() != 0) {
422 errln((UnicodeString)"FAIL, set should be empty but isn't: " +
423 set);
424 }
425
426 // clear(), isEmpty()
427 set.add(0x0061);
428 if (set.isEmpty()) {
429 errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
430 set);
431 }
432 set.clear();
433 if (!set.isEmpty()) {
434 errln((UnicodeString)"FAIL, set should be empty but isn't: " +
435 set);
436 }
437
438 // size()
439 set.clear();
440 if (set.size() != 0) {
441 errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
442 ": " + set);
443 }
444 set.add(0x0061);
445 if (set.size() != 1) {
446 errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
447 ": " + set);
448 }
449 set.add(0x0031, 0x0039);
450 if (set.size() != 10) {
451 errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
452 ": " + set);
453 }
454
455 // contains(first, last)
456 set.clear();
457 set.applyPattern("[A-Y 1-8 b-d l-y]", status);
458 if (U_FAILURE(status)) { errln("FAIL"); return; }
459 for (int32_t i = 0; i<set.getRangeCount(); ++i) {
460 UChar32 a = set.getRangeStart(i);
461 UChar32 b = set.getRangeEnd(i);
462 if (!set.contains(a, b)) {
463 errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
464 " but doesn't: " + set);
465 }
466 if (set.contains((UChar32)(a-1), b)) {
467 errln((UnicodeString)"FAIL, shouldn't contain " +
468 (unsigned short)(a-1) + '-' + (unsigned short)b +
469 " but does: " + set);
470 }
471 if (set.contains(a, (UChar32)(b+1))) {
472 errln((UnicodeString)"FAIL, shouldn't contain " +
473 (unsigned short)a + '-' + (unsigned short)(b+1) +
474 " but does: " + set);
475 }
476 }
477
478 // Ported InversionList test.
479 UnicodeSet a((UChar32)3,(UChar32)10);
480 UnicodeSet b((UChar32)7,(UChar32)15);
481 UnicodeSet c;
482
483 logln((UnicodeString)"a [3-10]: " + a);
484 logln((UnicodeString)"b [7-15]: " + b);
485 c = a; c.addAll(b);
486 UnicodeSet exp((UChar32)3,(UChar32)15);
487 if (c == exp) {
488 logln((UnicodeString)"c.set(a).add(b): " + c);
489 } else {
490 errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
491 }
492 c.complement();
493 exp.set((UChar32)0, (UChar32)2);
494 exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
495 if (c == exp) {
496 logln((UnicodeString)"c.complement(): " + c);
497 } else {
498 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
499 }
500 c.complement();
501 exp.set((UChar32)3, (UChar32)15);
502 if (c == exp) {
503 logln((UnicodeString)"c.complement(): " + c);
504 } else {
505 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
506 }
507 c = a; c.complementAll(b);
508 exp.set((UChar32)3,(UChar32)6);
509 exp.add((UChar32)11,(UChar32) 15);
510 if (c == exp) {
511 logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
512 } else {
513 errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
514 }
515
516 exp = c;
517 bitsToSet(setToBits(c), c);
518 if (c == exp) {
519 logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
520 } else {
521 errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
522 }
523
524 // Additional tests for coverage JB#2118
525 //UnicodeSet::complement(class UnicodeString const &)
526 //UnicodeSet::complementAll(class UnicodeString const &)
527 //UnicodeSet::containsNone(class UnicodeSet const &)
528 //UnicodeSet::containsNone(long,long)
529 //UnicodeSet::containsSome(class UnicodeSet const &)
530 //UnicodeSet::containsSome(long,long)
531 //UnicodeSet::removeAll(class UnicodeString const &)
532 //UnicodeSet::retain(long)
533 //UnicodeSet::retainAll(class UnicodeString const &)
534 //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
535 //UnicodeSetIterator::getString(void)
536 set.clear();
537 set.complement("ab");
538 exp.applyPattern("[{ab}]", status);
539 if (U_FAILURE(status)) { errln("FAIL"); return; }
540 if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
541
542 UnicodeSetIterator iset(set);
543 if (!iset.next() || !iset.isString()) {
544 errln("FAIL: UnicodeSetIterator::next/isString");
545 } else if (iset.getString() != "ab") {
546 errln("FAIL: UnicodeSetIterator::getString");
547 }
548
549 set.add((UChar32)0x61, (UChar32)0x7A);
550 set.complementAll("alan");
551 exp.applyPattern("[{ab}b-kmo-z]", status);
552 if (U_FAILURE(status)) { errln("FAIL"); return; }
553 if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
554
555 exp.applyPattern("[a-z]", status);
556 if (U_FAILURE(status)) { errln("FAIL"); return; }
557 if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
558 if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
559 exp.applyPattern("[aln]", status);
560 if (U_FAILURE(status)) { errln("FAIL"); return; }
561 if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
562 if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
563
564 if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
565 errln("FAIL: containsNone(UChar32, UChar32)");
566 }
567 if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
568 errln("FAIL: containsSome(UChar32, UChar32)");
569 }
570 if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
571 errln("FAIL: containsNone(UChar32, UChar32)");
572 }
573 if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
574 errln("FAIL: containsSome(UChar32, UChar32)");
575 }
576
577 set.removeAll("liu");
578 exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
579 if (U_FAILURE(status)) { errln("FAIL"); return; }
580 if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
581
582 set.retainAll("star");
583 exp.applyPattern("[rst]", status);
584 if (U_FAILURE(status)) { errln("FAIL"); return; }
585 if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
586
587 set.retain((UChar32)0x73);
588 exp.applyPattern("[s]", status);
589 if (U_FAILURE(status)) { errln("FAIL"); return; }
590 if (set != exp) { errln("FAIL: retain('s')"); return; }
591
592 uint16_t buf[32];
593 int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status);
594 if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
595 if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
596 errln("FAIL: serialize");
597 return;
598 }
599 }
600
601 void UnicodeSetTest::TestStrings() {
602 UErrorCode ec = U_ZERO_ERROR;
603
604 UnicodeSet* testList[] = {
605 UnicodeSet::createFromAll("abc"),
606 new UnicodeSet("[a-c]", ec),
607
608 &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
609 new UnicodeSet("[{ll}{ch}a-z]", ec),
610
611 UnicodeSet::createFrom("ab}c"),
612 new UnicodeSet("[{ab\\}c}]", ec),
613
614 &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
615 new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
616
617 NULL
618 };
619
620 if (U_FAILURE(ec)) {
621 errln("FAIL: couldn't construct test sets");
622 }
623
624 for (int32_t i = 0; testList[i] != NULL; i+=2) {
625 if (U_SUCCESS(ec)) {
626 UnicodeString pat0, pat1;
627 testList[i]->toPattern(pat0, TRUE);
628 testList[i+1]->toPattern(pat1, TRUE);
629 if (*testList[i] == *testList[i+1]) {
630 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
631 } else {
632 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
633 }
634 }
635 delete testList[i];
636 delete testList[i+1];
637 }
638 }
639
640 static const char NOT[] = "%%%%";
641
642 /**
643 * Test pattern behavior of multicharacter strings.
644 */
645 void UnicodeSetTest::TestStringPatterns() {
646 UErrorCode ec = U_ZERO_ERROR;
647 UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
648
649 // This loop isn't a loop. It's here to make the compiler happy.
650 // If you're curious, try removing it and changing the 'break'
651 // statements (except for the last) to goto's.
652 for (;;) {
653 if (U_FAILURE(ec)) break;
654 const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
655 expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
656
657 s->add("ac");
658 const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
659 expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
660
661 s->applyPattern("[a-z {\\{l} {r\\}}]", ec);
662 if (U_FAILURE(ec)) break;
663 const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
664 expectToPattern(*s, "[a-z{\\{l}{r\\}}]", exp3);
665
666 s->add("[]");
667 const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
668 expectToPattern(*s, "[a-z{\\[\\]}{r\\}}{\\{l}]", exp4);
669
670 s->applyPattern("[a-z {\\u4E01\\u4E02}{\\n\\r}]", ec);
671 if (U_FAILURE(ec)) break;
672 const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
673 expectToPattern(*s, "[a-z{\\u4E01\\u4E02}{\\n\\r}]", exp5);
674
675 // j2189
676 s->clear();
677 s->add(UnicodeString("abc", ""));
678 s->add(UnicodeString("abc", ""));
679 const char* exp6[] = {"abc", NOT, "ab", NULL};
680 expectToPattern(*s, "[{abc}]", exp6);
681
682 break;
683 }
684
685 if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
686 delete s;
687 }
688
689 /**
690 * Test the [:Latin:] syntax.
691 */
692 void UnicodeSetTest::TestScriptSet() {
693 expectContainment("[:Latin:]", "aA", CharsToUnicodeString("\\u0391\\u03B1"));
694
695 expectContainment("[:Greek:]", CharsToUnicodeString("\\u0391\\u03B1"), "aA");
696
697 /* Jitterbug 1423 */
698 expectContainment("[[:Common:][:Inherited:]]", CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
699
700 }
701
702 /**
703 * Test the [:Latin:] syntax.
704 */
705 void UnicodeSetTest::TestPropertySet() {
706 static const char* DATA[] = {
707 // Pattern, Chars IN, Chars NOT in
708
709 "[:Latin:]",
710 "aA",
711 "\\u0391\\u03B1",
712
713 "[\\p{Greek}]",
714 "\\u0391\\u03B1",
715 "aA",
716
717 "\\P{ GENERAL Category = upper case letter }",
718 "abc",
719 "ABC",
720
721 // Combining class: @since ICU 2.2
722 // Check both symbolic and numeric
723 "\\p{ccc=Nukta}",
724 "\\u0ABC",
725 "abc",
726
727 "\\p{Canonical Combining Class = 11}",
728 "\\u05B1",
729 "\\u05B2",
730
731 "[:c c c = iota subscript :]",
732 "\\u0345",
733 "xyz",
734
735 // Bidi class: @since ICU 2.2
736 "\\p{bidiclass=lefttoright}",
737 "abc",
738 "\\u0671\\u0672",
739
740 // Binary properties: @since ICU 2.2
741 "\\p{ideographic}",
742 "\\u4E0A",
743 "x",
744
745 "[:math=false:]",
746 "q",
747 "(*+)",
748
749 // JB#1767 \N{}, \p{ASCII}
750 "[:Ascii:]",
751 "abc\\u0000\\u007F",
752 "\\u0080\\u4E00",
753
754 "[\\N{ latin small letter a }[:name= latin small letter z:]]",
755 "az",
756 "qrs",
757
758 // JB#2015
759 "[:any:]",
760 "a\\U0010FFFF",
761 "",
762
763 "[:nv=0.5:]",
764 "\\u00BD\\u0F2A",
765 "\\u00BC",
766
767 // JB#2653: Age
768 "[:Age=1.1:]",
769 "\\u03D6", // 1.1
770 "\\u03D8\\u03D9", // 3.2
771
772 "[:Age=3.1:]",
773 "\\u1800\\u3400\\U0002f800",
774 "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
775
776 // JB#2350: Case_Sensitive
777 "[:Case Sensitive:]",
778 "A\\u1FFC\\U00010410",
779 ";\\u00B4\\U00010500",
780
781 // JB#2832: C99-compatibility props
782 "[:blank:]",
783 " \\u0009",
784 "1-9A-Z",
785
786 "[:graph:]",
787 "19AZ",
788 " \\u0003\\u0007\\u0009\\u000A\\u000D",
789
790 "[:punct:]",
791 "!@#%&*()[]{}-_\\/;:,.?'\"",
792 "09azAZ",
793
794 "[:xdigit:]",
795 "09afAF",
796 "gG!",
797
798 // Regex compatibility test
799 "[-b]", // leading '-' is literal
800 "-b",
801 "ac",
802
803 "[^-b]", // leading '-' is literal
804 "ac",
805 "-b",
806
807 "[b-]", // trailing '-' is literal
808 "-b",
809 "ac",
810
811 "[^b-]", // trailing '-' is literal
812 "ac",
813 "-b"
814 };
815
816 static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);
817
818 for (int32_t i=0; i<DATA_LEN; i+=3) {
819 expectContainment(DATA[i], CharsToUnicodeString(DATA[i+1]),
820 CharsToUnicodeString(DATA[i+2]));
821 }
822 }
823
824 /**
825 * Test cloning of UnicodeSet. For C++, we test the copy constructor.
826 */
827 void UnicodeSetTest::TestClone() {
828 UErrorCode ec = U_ZERO_ERROR;
829 UnicodeSet s("[abcxyz]", ec);
830 UnicodeSet t(s);
831 expectContainment(t, "abc", "def");
832 }
833
834 /**
835 * Test the indexOf() and charAt() methods.
836 */
837 void UnicodeSetTest::TestIndexOf() {
838 UErrorCode ec = U_ZERO_ERROR;
839 UnicodeSet set("[a-cx-y3578]", ec);
840 if (U_FAILURE(ec)) {
841 errln("FAIL: UnicodeSet constructor");
842 return;
843 }
844 for (int32_t i=0; i<set.size(); ++i) {
845 UChar32 c = set.charAt(i);
846 if (set.indexOf(c) != i) {
847 errln("FAIL: charAt(%d) = %X => indexOf() => %d",
848 i, c, set.indexOf(c));
849 }
850 }
851 UChar32 c = set.charAt(set.size());
852 if (c != -1) {
853 errln("FAIL: charAt(<out of range>) = %X", c);
854 }
855 int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
856 if (j != -1) {
857 errln((UnicodeString)"FAIL: indexOf('q') = " + j);
858 }
859 }
860
861 /**
862 * Test closure API.
863 */
864 void UnicodeSetTest::TestCloseOver() {
865 UErrorCode ec = U_ZERO_ERROR;
866
867 char CASE[] = {(char)USET_CASE};
868 const char* DATA[] = {
869 // selector, input, output
870 CASE,
871 "[aq\\u00DF{Bc}{bC}{Fi}]",
872 "[aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]",
873
874 CASE,
875 "[\\u01F1]", // 'DZ'
876 "[\\u01F1\\u01F2\\u01F3]",
877
878 CASE,
879 "[\\u1FB4]",
880 "[\\u1FB4{\\u03AC\\u03B9}]",
881
882 CASE,
883 "[{F\\uFB01}]",
884 "[\\uFB03{ffi}]",
885
886 CASE, // make sure binary search finds limits
887 "[a\\uFF3A]",
888 "[aA\\uFF3A\\uFF5A]",
889
890 CASE,
891 "[a-z]","[A-Za-z\\u017F\\u212A]",
892 CASE,
893 "[abc]","[A-Ca-c]",
894 CASE,
895 "[ABC]","[A-Ca-c]",
896
897 NULL
898 };
899
900 UnicodeSet s;
901 UnicodeSet t;
902 for (int32_t i=0; DATA[i]!=NULL; i+=3) {
903 int32_t selector = DATA[i][0];
904 UnicodeString pat(DATA[i+1]);
905 UnicodeString exp(DATA[i+2]);
906 s.applyPattern(pat, ec);
907 s.closeOver(selector);
908 t.applyPattern(exp, ec);
909 if (U_FAILURE(ec)) {
910 errln("FAIL: applyPattern failed");
911 continue;
912 }
913 if (s == t) {
914 logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
915 } else {
916 UnicodeString buf;
917 errln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
918 s.toPattern(buf, TRUE) + ", expected " + exp);
919 }
920 }
921
922 // Test the pattern API
923 s.applyPattern("[abc]", USET_CASE_INSENSITIVE, ec);
924 if (U_FAILURE(ec)) {
925 errln("FAIL: applyPattern failed");
926 } else {
927 expectContainment(s, "abcABC", "defDEF");
928 }
929 UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, ec);
930 if (U_FAILURE(ec)) {
931 errln("FAIL: constructor failed");
932 } else {
933 expectContainment(v, "defDEF", "abcABC");
934 }
935 }
936
937 void UnicodeSetTest::TestEscapePattern() {
938 const char pattern[] =
939 "[\\uFEFF \\uFFF9-\\uFFFC \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
940 const char exp[] =
941 "[\\uFEFF\\uFFF9-\\uFFFC\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
942 // We test this with two passes; in the second pass we
943 // pre-unescape the pattern. Since U+FEFF and several other code
944 // points are rule whitespace, this fails -- which is what we
945 // expect.
946 for (int32_t pass=1; pass<=2; ++pass) {
947 UErrorCode ec = U_ZERO_ERROR;
948 UnicodeString pat(pattern);
949 if (pass==2) {
950 pat = pat.unescape();
951 }
952 // Pattern is only good for pass 1
953 UBool isPatternValid = (pass==1);
954
955 UnicodeSet set(pat, ec);
956 if (U_SUCCESS(ec) != isPatternValid){
957 errln((UnicodeString)"FAIL: applyPattern(" +
958 escape(pat) + ") => " +
959 u_errorName(ec));
960 continue;
961 }
962 if (U_FAILURE(ec)) {
963 continue;
964 }
965 if (set.contains((UChar)0x0644)){
966 errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
967 }
968
969 UnicodeString newpat;
970 set.toPattern(newpat, TRUE);
971 if (newpat == exp) {
972 logln(escape(pat) + " => " + newpat);
973 } else {
974 errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
975 }
976
977 for (int32_t i=0; i<set.getRangeCount(); ++i) {
978 UnicodeString str("Range ");
979 str.append((UChar)(0x30 + i))
980 .append(": ")
981 .append((UChar32)set.getRangeStart(i))
982 .append(" - ")
983 .append((UChar32)set.getRangeEnd(i));
984 str = str + " (" + set.getRangeStart(i) + " - " +
985 set.getRangeEnd(i) + ")";
986 if (set.getRangeStart(i) < 0) {
987 errln((UnicodeString)"FAIL: " + escape(str));
988 } else {
989 logln(escape(str));
990 }
991 }
992 }
993 }
994
995 void UnicodeSetTest::expectRange(const UnicodeString& label,
996 const UnicodeSet& set,
997 UChar32 start, UChar32 end) {
998 UnicodeSet exp(start, end);
999 UnicodeString pat;
1000 if (set == exp) {
1001 logln(label + " => " + set.toPattern(pat, TRUE));
1002 } else {
1003 UnicodeString xpat;
1004 errln((UnicodeString)"FAIL: " + label + " => " +
1005 set.toPattern(pat, TRUE) +
1006 ", expected " + exp.toPattern(xpat, TRUE));
1007 }
1008 }
1009
1010 void UnicodeSetTest::TestInvalidCodePoint() {
1011
1012 const UChar32 DATA[] = {
1013 // Test range Expected range
1014 0, 0x10FFFF, 0, 0x10FFFF,
1015 (UChar32)-1, 8, 0, 8,
1016 8, 0x110000, 8, 0x10FFFF
1017 };
1018 const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]);
1019
1020 UnicodeString pat;
1021 int32_t i;
1022
1023 for (i=0; i<DATA_LENGTH; i+=4) {
1024 UChar32 start = DATA[i];
1025 UChar32 end = DATA[i+1];
1026 UChar32 xstart = DATA[i+2];
1027 UChar32 xend = DATA[i+3];
1028
1029 // Try various API using the test code points
1030
1031 UnicodeSet set(start, end);
1032 expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1033 set, xstart, xend);
1034
1035 set.clear();
1036 set.set(start, end);
1037 expectRange((UnicodeString)"set(" + start + "," + end + ")",
1038 set, xstart, xend);
1039
1040 UBool b = set.contains(start);
1041 b = set.contains(start, end);
1042 b = set.containsNone(start, end);
1043 b = set.containsSome(start, end);
1044
1045 int32_t index = set.indexOf(start);
1046
1047 set.clear();
1048 set.add(start);
1049 set.add(start, end);
1050 expectRange((UnicodeString)"add(" + start + "," + end + ")",
1051 set, xstart, xend);
1052
1053 set.set(0, 0x10FFFF);
1054 set.retain(start, end);
1055 expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1056 set, xstart, xend);
1057 set.retain(start);
1058
1059 set.set(0, 0x10FFFF);
1060 set.remove(start);
1061 set.remove(start, end);
1062 set.complement();
1063 expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1064 set, xstart, xend);
1065
1066 set.set(0, 0x10FFFF);
1067 set.complement(start, end);
1068 set.complement();
1069 expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1070 set, xstart, xend);
1071 set.complement(start);
1072 }
1073
1074 const UChar32 DATA2[] = {
1075 0,
1076 0x10FFFF,
1077 (UChar32)-1,
1078 0x110000
1079 };
1080 const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]);
1081
1082 for (i=0; i<DATA2_LENGTH; ++i) {
1083 UChar32 c = DATA2[i], end = 0x10FFFF;
1084 UBool valid = (c >= 0 && c <= 0x10FFFF);
1085
1086 UnicodeSet set(0, 0x10FFFF);
1087
1088 // For single-codepoint contains, invalid codepoints are NOT contained
1089 UBool b = set.contains(c);
1090 if (b == valid) {
1091 logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1092 ") = " + b);
1093 } else {
1094 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1095 ") = " + b);
1096 }
1097
1098 // For codepoint range contains, containsNone, and containsSome,
1099 // invalid or empty (start > end) ranges have UNDEFINED behavior.
1100 b = set.contains(c, end);
1101 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1102 "," + end + ") = " + b);
1103
1104 b = set.containsNone(c, end);
1105 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1106 "," + end + ") = " + b);
1107
1108 b = set.containsSome(c, end);
1109 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1110 "," + end + ") = " + b);
1111
1112 int32_t index = set.indexOf(c);
1113 if ((index >= 0) == valid) {
1114 logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1115 ") = " + index);
1116 } else {
1117 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1118 ") = " + index);
1119 }
1120 }
1121 }
1122
1123 void UnicodeSetTest::TestExhaustive() {
1124 // exhaustive tests. Simulate UnicodeSets with integers.
1125 // That gives us very solid tests (except for large memory tests).
1126
1127 int32_t limit = 128;
1128
1129 UnicodeSet x, y, z, aa;
1130
1131 for (int32_t i = 0; i < limit; ++i) {
1132 bitsToSet(i, x);
1133 logln((UnicodeString)"Testing " + i + ", " + x);
1134 _testComplement(i, x, y);
1135
1136 // AS LONG AS WE ARE HERE, check roundtrip
1137 checkRoundTrip(bitsToSet(i, aa));
1138
1139 for (int32_t j = 0; j < limit; ++j) {
1140 _testAdd(i,j, x,y,z);
1141 _testXor(i,j, x,y,z);
1142 _testRetain(i,j, x,y,z);
1143 _testRemove(i,j, x,y,z);
1144 }
1145 }
1146 }
1147
1148 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1149 bitsToSet(a, x);
1150 z = x;
1151 z.complement();
1152 int32_t c = setToBits(z);
1153 if (c != (~a)) {
1154 errln((UnicodeString)"FAILED: add: ~" + x + " != " + z);
1155 errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1156 }
1157 checkCanonicalRep(z, (UnicodeString)"complement " + a);
1158 }
1159
1160 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1161 bitsToSet(a, x);
1162 bitsToSet(b, y);
1163 z = x;
1164 z.addAll(y);
1165 int32_t c = setToBits(z);
1166 if (c != (a | b)) {
1167 errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1168 errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1169 }
1170 checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1171 }
1172
1173 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1174 bitsToSet(a, x);
1175 bitsToSet(b, y);
1176 z = x;
1177 z.retainAll(y);
1178 int32_t c = setToBits(z);
1179 if (c != (a & b)) {
1180 errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1181 errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1182 }
1183 checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1184 }
1185
1186 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1187 bitsToSet(a, x);
1188 bitsToSet(b, y);
1189 z = x;
1190 z.removeAll(y);
1191 int32_t c = setToBits(z);
1192 if (c != (a &~ b)) {
1193 errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1194 errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1195 }
1196 checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1197 }
1198
1199 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1200 bitsToSet(a, x);
1201 bitsToSet(b, y);
1202 z = x;
1203 z.complementAll(y);
1204 int32_t c = setToBits(z);
1205 if (c != (a ^ b)) {
1206 errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1207 errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1208 }
1209 checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1210 }
1211
1212 /**
1213 * Check that ranges are monotonically increasing and non-
1214 * overlapping.
1215 */
1216 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1217 int32_t n = set.getRangeCount();
1218 if (n < 0) {
1219 errln((UnicodeString)"FAIL result of " + msg +
1220 ": range count should be >= 0 but is " +
1221 n /*+ " for " + set.toPattern())*/);
1222 return;
1223 }
1224 UChar32 last = 0;
1225 for (int32_t i=0; i<n; ++i) {
1226 UChar32 start = set.getRangeStart(i);
1227 UChar32 end = set.getRangeEnd(i);
1228 if (start > end) {
1229 errln((UnicodeString)"FAIL result of " + msg +
1230 ": range " + (i+1) +
1231 " start > end: " + (int)start + ", " + (int)end +
1232 " for " + set);
1233 }
1234 if (i > 0 && start <= last) {
1235 errln((UnicodeString)"FAIL result of " + msg +
1236 ": range " + (i+1) +
1237 " overlaps previous range: " + (int)start + ", " + (int)end +
1238 " for " + set);
1239 }
1240 last = end;
1241 }
1242 }
1243
1244 /**
1245 * Convert a bitmask to a UnicodeSet.
1246 */
1247 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1248 result.clear();
1249 for (UChar32 i = 0; i < 32; ++i) {
1250 if ((a & (1<<i)) != 0) {
1251 result.add(i);
1252 }
1253 }
1254 return result;
1255 }
1256
1257 /**
1258 * Convert a UnicodeSet to a bitmask. Only the characters
1259 * U+0000 to U+0020 are represented in the bitmask.
1260 */
1261 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1262 int32_t result = 0;
1263 for (int32_t i = 0; i < 32; ++i) {
1264 if (x.contains((UChar32)i)) {
1265 result |= (1<<i);
1266 }
1267 }
1268 return result;
1269 }
1270
1271 /**
1272 * Return the representation of an inversion list based UnicodeSet
1273 * as a pairs list. Ranges are listed in ascending Unicode order.
1274 * For example, the set [a-zA-M3] is represented as "33AMaz".
1275 */
1276 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1277 UnicodeString pairs;
1278 for (int32_t i=0; i<set.getRangeCount(); ++i) {
1279 UChar32 start = set.getRangeStart(i);
1280 UChar32 end = set.getRangeEnd(i);
1281 if (end > 0xFFFF) {
1282 end = 0xFFFF;
1283 i = set.getRangeCount(); // Should be unnecessary
1284 }
1285 pairs.append((UChar)start).append((UChar)end);
1286 }
1287 return pairs;
1288 }
1289
1290 /**
1291 * Basic consistency check for a few items.
1292 * That the iterator works, and that we can create a pattern and
1293 * get the same thing back
1294 */
1295 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1296 UErrorCode ec = U_ZERO_ERROR;
1297
1298 UnicodeSet t(s);
1299 checkEqual(s, t, "copy ct");
1300
1301 t = s;
1302 checkEqual(s, t, "operator=");
1303
1304 copyWithIterator(t, s, FALSE);
1305 checkEqual(s, t, "iterator roundtrip");
1306
1307 copyWithIterator(t, s, TRUE); // try range
1308 checkEqual(s, t, "iterator roundtrip");
1309
1310 UnicodeString pat; s.toPattern(pat, FALSE);
1311 t.applyPattern(pat, ec);
1312 if (U_FAILURE(ec)) {
1313 errln("FAIL: applyPattern");
1314 return;
1315 } else {
1316 checkEqual(s, t, "toPattern(false)");
1317 }
1318
1319 s.toPattern(pat, TRUE);
1320 t.applyPattern(pat, ec);
1321 if (U_FAILURE(ec)) {
1322 errln("FAIL: applyPattern");
1323 return;
1324 } else {
1325 checkEqual(s, t, "toPattern(true)");
1326 }
1327 }
1328
1329 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
1330 t.clear();
1331 UnicodeSetIterator it(s);
1332 if (withRange) {
1333 while (it.nextRange()) {
1334 if (it.isString()) {
1335 t.add(it.getString());
1336 } else {
1337 t.add(it.getCodepoint(), it.getCodepointEnd());
1338 }
1339 }
1340 } else {
1341 while (it.next()) {
1342 if (it.isString()) {
1343 t.add(it.getString());
1344 } else {
1345 t.add(it.getCodepoint());
1346 }
1347 }
1348 }
1349 }
1350
1351 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
1352 UnicodeString source; s.toPattern(source, TRUE);
1353 UnicodeString result; t.toPattern(result, TRUE);
1354 if (s != t) {
1355 errln((UnicodeString)"FAIL: " + message
1356 + "; source = " + source
1357 + "; result = " + result
1358 );
1359 return FALSE;
1360 } else {
1361 logln((UnicodeString)"Ok: " + message
1362 + "; source = " + source
1363 + "; result = " + result
1364 );
1365 }
1366 return TRUE;
1367 }
1368
1369 void
1370 UnicodeSetTest::expectContainment(const UnicodeString& pat,
1371 const UnicodeString& charsIn,
1372 const UnicodeString& charsOut) {
1373 UErrorCode ec = U_ZERO_ERROR;
1374 UnicodeSet set(pat, ec);
1375 if (U_FAILURE(ec)) {
1376 errln((UnicodeString)"FAIL: pattern \"" +
1377 pat + "\" => " + u_errorName(ec));
1378 return;
1379 }
1380 expectContainment(set, pat, charsIn, charsOut);
1381 }
1382
1383 void
1384 UnicodeSetTest::expectContainment(const UnicodeSet& set,
1385 const UnicodeString& charsIn,
1386 const UnicodeString& charsOut) {
1387 UnicodeString pat;
1388 set.toPattern(pat);
1389 expectContainment(set, pat, charsIn, charsOut);
1390 }
1391
1392 void
1393 UnicodeSetTest::expectContainment(const UnicodeSet& set,
1394 const UnicodeString& setName,
1395 const UnicodeString& charsIn,
1396 const UnicodeString& charsOut) {
1397 UnicodeString bad;
1398 UChar32 c;
1399 int32_t i;
1400
1401 for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
1402 c = charsIn.char32At(i);
1403 if (!set.contains(c)) {
1404 bad.append(c);
1405 }
1406 }
1407 if (bad.length() > 0) {
1408 errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
1409 ", expected containment of " + prettify(charsIn));
1410 } else {
1411 logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
1412 }
1413
1414 bad.truncate(0);
1415 for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
1416 c = charsOut.char32At(i);
1417 if (set.contains(c)) {
1418 bad.append(c);
1419 }
1420 }
1421 if (bad.length() > 0) {
1422 errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
1423 ", expected non-containment of " + prettify(charsOut));
1424 } else {
1425 logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
1426 }
1427 }
1428
1429 void
1430 UnicodeSetTest::expectPattern(UnicodeSet& set,
1431 const UnicodeString& pattern,
1432 const UnicodeString& expectedPairs){
1433 UErrorCode status = U_ZERO_ERROR;
1434 set.applyPattern(pattern, status);
1435 if (U_FAILURE(status)) {
1436 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
1437 "\") failed");
1438 return;
1439 } else {
1440 if (getPairs(set) != expectedPairs ) {
1441 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
1442 "\") => pairs \"" +
1443 escape(getPairs(set)) + "\", expected \"" +
1444 escape(expectedPairs) + "\"");
1445 } else {
1446 logln(UnicodeString("Ok: applyPattern(\"") + pattern +
1447 "\") => pairs \"" +
1448 escape(getPairs(set)) + "\"");
1449 }
1450 }
1451 // the result of calling set.toPattern(), which is the string representation of
1452 // this set(set), is passed to a UnicodeSet constructor, and tested that it
1453 // will produce another set that is equal to this one.
1454 UnicodeString temppattern;
1455 set.toPattern(temppattern);
1456 UnicodeSet *tempset=new UnicodeSet(temppattern, status);
1457 if (U_FAILURE(status)) {
1458 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
1459 return;
1460 }
1461 if(*tempset != set || getPairs(*tempset) != getPairs(set)){
1462 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
1463 escape(getPairs(set)) + "\""));
1464 } else{
1465 logln(UnicodeString("Ok: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
1466 }
1467
1468 delete tempset;
1469
1470 }
1471
1472 void
1473 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
1474 if (getPairs(set) != expectedPairs) {
1475 errln(UnicodeString("FAIL: Expected pair list \"") +
1476 escape(expectedPairs) + "\", got \"" +
1477 escape(getPairs(set)) + "\"");
1478 }
1479 }
1480
1481 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
1482 const UnicodeString& expPat,
1483 const char** expStrings) {
1484 UnicodeString pat;
1485 set.toPattern(pat, TRUE);
1486 if (pat == expPat) {
1487 logln((UnicodeString)"Ok: toPattern() => \"" + pat + "\"");
1488 } else {
1489 errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
1490 return;
1491 }
1492 UBool in = TRUE;
1493 for (int32_t i=0; expStrings[i] != NULL; ++i) {
1494 if (expStrings[i] == NOT) { // sic; pointer comparison
1495 in = FALSE;
1496 continue;
1497 }
1498 UnicodeString s = CharsToUnicodeString(expStrings[i]);
1499 UBool contained = set.contains(s);
1500 if (contained == in) {
1501 logln((UnicodeString)"Ok: " + expPat +
1502 (contained ? " contains {" : " does not contain {") +
1503 escape(expStrings[i]) + "}");
1504 } else {
1505 errln((UnicodeString)"FAIL: " + expPat +
1506 (contained ? " contains {" : " does not contain {") +
1507 escape(expStrings[i]) + "}");
1508 }
1509 }
1510 }
1511
1512 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
1513
1514 void
1515 UnicodeSetTest::doAssert(UBool condition, const char *message)
1516 {
1517 if (!condition) {
1518 errln(UnicodeString("ERROR : ") + message);
1519 }
1520 }
1521
1522 UnicodeString
1523 UnicodeSetTest::escape(const UnicodeString& s) {
1524 UnicodeString buf;
1525 for (int32_t i=0; i<s.length(); )
1526 {
1527 UChar32 c = s.char32At(i);
1528 if (0x0020 <= c && c <= 0x007F) {
1529 buf += c;
1530 } else {
1531 if (c <= 0xFFFF) {
1532 buf += (UChar)0x5c; buf += (UChar)0x75;
1533 } else {
1534 buf += (UChar)0x5c; buf += (UChar)0x55;
1535 buf += toHexString((c & 0xF0000000) >> 28);
1536 buf += toHexString((c & 0x0F000000) >> 24);
1537 buf += toHexString((c & 0x00F00000) >> 20);
1538 buf += toHexString((c & 0x000F0000) >> 16);
1539 }
1540 buf += toHexString((c & 0xF000) >> 12);
1541 buf += toHexString((c & 0x0F00) >> 8);
1542 buf += toHexString((c & 0x00F0) >> 4);
1543 buf += toHexString(c & 0x000F);
1544 }
1545 i += U16_LENGTH(c);
1546 }
1547 return buf;
1548 }