2 **********************************************************************
3 * Copyright (C) 1999-2004 Alan Liu ,International Business Machines Corporation and
4 * others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 10/20/99 alan Creation.
8 * 03/22/2000 Madhu Added additional tests
9 **********************************************************************
12 #include "unicode/utypes.h"
14 #include "unicode/uniset.h"
15 #include "unicode/uchar.h"
16 #include "unicode/usetiter.h"
17 #include "unicode/ustring.h"
18 #include "unicode/parsepos.h"
19 #include "unicode/symtable.h"
// Concatenation helper for building log/error messages: returns `left`
// followed by UnicodeSetTest::escape(pat). The statement that produces `pat`
// is not visible in this excerpt (presumably a pattern generated from `set`
// -- confirm against the full file).
22 UnicodeString
operator+(const UnicodeString
& left
, const UnicodeSet
& set
) {
25 return left
+ UnicodeSetTest::escape(pat
);
// Helper macro used by runIndexedTest: expands to one `case id:` arm of the
// dispatch. Only part of the multi-line macro body is visible in this excerpt.
28 #define CASE(id,test) case id: \
32 logln((UnicodeString)""); \
// Test dispatcher: maps `index` to one of the test methods through the CASE
// macro. An index with no matching case sets `name` to the empty string.
// The fourth parameter is unnamed and unused.
// NOTE(review): case ids 0, 5, 8 and 11-13 are not visible in this excerpt.
38 UnicodeSetTest::runIndexedTest(int32_t index
, UBool exec
,
39 const char* &name
, char* /*par*/) {
40 // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
43 CASE(1,TestAddRemove
);
44 CASE(2,TestCategories
);
45 CASE(3,TestCloneEqualHash
);
46 CASE(4,TestMinimalRep
);
48 CASE(6,TestScriptSet
);
49 CASE(7,TestPropertySet
);
51 CASE(9,TestExhaustive
);
52 CASE(10,TestToPattern
);
56 CASE(14,TestCloseOver
);
57 CASE(15,TestEscapePattern
);
58 CASE(16,TestInvalidCodePoint
);
59 CASE(17,TestSymbolTable
);
60 CASE(18,TestSurrogate
);
// Unknown index: report an empty test name.
61 default: name
= ""; break;
// Sentinel string placed inside the exp1..exp6 arrays of TestToPattern to
// split each array into two groups. Presumably strings before it must be in
// the set and strings after it must not -- confirm against expectToPattern.
65 static const char NOT
[] = "%%%%";
68 * UVector was improperly copying contents
69 * This code will crash if this is still true
// Crash regression test: adds the string "abc" to `t`, then asks `test` for
// its escaped pattern. There are no assertions in the visible lines.
// The declarations of `t` and `test` are not visible in this excerpt.
71 void UnicodeSetTest::Testj2268() {
73 t
.add(UnicodeString("abc"));
75 UnicodeString ustrPat
;
76 test
.toPattern(ustrPat
, TRUE
);
// Exercises UnicodeSet::toPattern() in four stages:
//   1. a fixed table of patterns that are parsed and fed to checkPat();
//   2. every code point 0..0x10FFFF that is a non-alphabetic character
//      <= 0xFF or is whitespace, via toPatternAux() in several range forms;
//   3. sets containing multi-character strings, via expectToPattern();
//   4. the two-character-range formatting rule (JB#3400).
82 void UnicodeSetTest::TestToPattern() {
83 UErrorCode ec
= U_ZERO_ERROR
;
85 // Test that toPattern() round trips with syntax characters and
// NULL-terminated table (the loop below stops at the first NULL entry;
// the terminator itself is not visible in this excerpt).
88 static const char* OTHER_TOPATTERN_TESTS
[] = {
89 "[[:latin:]&[:greek:]]",
90 "[[:latin:]-[:greek:]]",
91 "[:nonspacing mark:]",
95 for (int32_t j
=0; OTHER_TOPATTERN_TESTS
[j
]!=NULL
; ++j
) {
97 UnicodeSet
s(OTHER_TOPATTERN_TESTS
[j
], ec
);
99 errln((UnicodeString
)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS
[j
]);
102 checkPat(OTHER_TOPATTERN_TESTS
[j
], s
);
// Walk the whole code point space, testing only the characters selected
// by the condition below.
105 for (UChar32 i
= 0; i
<= 0x10FFFF; ++i
) {
106 if ((i
<= 0xFF && !u_isalpha(i
)) || u_isspace(i
)) {
108 // check various combinations to make sure they all work.
109 if (i
!= 0 && !toPatternAux(i
, i
)){
112 if (!toPatternAux(0, i
)){
115 if (!toPatternAux(i
, 0xFFFF)){
122 // Test pattern behavior of multicharacter strings.
125 UnicodeSet
* s
= new UnicodeSet("[a-z {aa} {ab}]", ec
);
127 // This loop isn't a loop. It's here to make the compiler happy.
128 // If you're curious, try removing it and changing the 'break'
129 // statements (except for the last) to goto's.
131 if (U_FAILURE(ec
)) break;
132 const char* exp1
[] = {"aa", "ab", NOT
, "ac", NULL
};
133 expectToPattern(*s
, "[a-z{aa}{ab}]", exp1
);
136 const char* exp2
[] = {"aa", "ab", "ac", NOT
, "xy", NULL
};
137 expectToPattern(*s
, "[a-z{aa}{ab}{ac}]", exp2
);
// Strings that contain the syntax characters '{' and '}'.
139 s
->applyPattern("[a-z {\\{l} {r\\}}]", ec
);
140 if (U_FAILURE(ec
)) break;
141 const char* exp3
[] = {"{l", "r}", NOT
, "xy", NULL
};
142 expectToPattern(*s
, "[a-z{r\\}}{\\{l}]", exp3
);
145 const char* exp4
[] = {"{l", "r}", "[]", NOT
, "xy", NULL
};
146 expectToPattern(*s
, "[a-z{\\[\\]}{r\\}}{\\{l}]", exp4
);
// Strings made of escaped characters; the expected pattern uses \u escapes.
148 s
->applyPattern("[a-z {\\u4E01\\u4E02}{\\n\\r}]", ec
);
149 if (U_FAILURE(ec
)) break;
150 const char* exp5
[] = {"\\u4E01\\u4E02", "\n\r", NULL
};
151 expectToPattern(*s
, "[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]", exp5
);
// The same string is added twice; the expected pattern below lists {abc}
// only once.
155 s
->add(UnicodeString("abc", ""));
156 s
->add(UnicodeString("abc", ""));
157 const char* exp6
[] = {"abc", NOT
, "ab", NULL
};
158 expectToPattern(*s
, "[{abc}]", exp6
);
163 if (U_FAILURE(ec
)) errln("FAIL: pattern parse error");
167 // JB#3400: For 2 character ranges prefer [ab] to [a-b]
169 s
.add((UChar
)97, (UChar
)98); // 'a', 'b'
170 expectToPattern(s
, "[ab]", NULL
);
// Helper for TestToPattern: builds a label from `start` (and "..end"), adds
// the range [start, end] to `testSet`, and returns the result of
// checkPat(source, testSet).
// The declaration of `testSet` and the condition guarding the "..end" suffix
// are not visible in this excerpt. The commented-out lines are leftovers from
// the Java version of this test.
173 UBool
UnicodeSetTest::toPatternAux(UChar32 start
, UChar32 end
) {
175 // use Integer.toString because Utility.hex doesn't handle ints
176 UnicodeString pat
= "";
177 // TODO do these in hex
178 //String source = "0x" + Integer.toString(start,16).toUpperCase();
179 //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
180 UnicodeString source
;
181 source
= source
+ (uint32_t)start
;
183 source
= source
+ ".." + (uint32_t)end
;
185 testSet
.add(start
, end
);
186 return checkPat(source
, testSet
);
// Two-argument overload: generates the pattern of `testSet` twice, once with
// toPattern(..., TRUE) and once with toPattern(..., FALSE), and passes each
// to the three-argument checkPat(). Returns FALSE as soon as one check fails;
// otherwise logs both patterns. `source` is only a label for messages.
// The declarations of `pat0`/`pat2` and the final return are not visible in
// this excerpt.
189 UBool
UnicodeSetTest::checkPat(const UnicodeString
& source
,
190 const UnicodeSet
& testSet
) {
191 // What we want to make sure of is that a pattern generated
192 // by toPattern(), with or without escaped unprintables, can
193 // be passed back into the UnicodeSet constructor.
196 testSet
.toPattern(pat0
, TRUE
);
198 if (!checkPat(source
+ " (escaped)", testSet
, pat0
)) return FALSE
;
200 //String pat1 = unescapeLeniently(pat0);
201 //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
204 testSet
.toPattern(pat2
, FALSE
);
205 if (!checkPat(source
, testSet
, pat2
)) return FALSE
;
207 //String pat3 = unescapeLeniently(pat2);
208 // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
210 //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
211 logln((UnicodeString
)source
+ " => " + pat0
+ ", " + pat2
);
// Three-argument overload: constructs a new UnicodeSet from `pat` and reports
// an error if it differs from `testSet`. `source` is only a label used in the
// error message.
// NOTE(review): `ec` is not inspected in the visible lines, so a parse
// failure would only show up through the set comparison -- confirm against
// the full file.
215 UBool
UnicodeSetTest::checkPat(const UnicodeString
& source
,
216 const UnicodeSet
& testSet
,
217 const UnicodeString
& pat
) {
218 UErrorCode ec
= U_ZERO_ERROR
;
219 UnicodeSet
testSet2(pat
, ec
);
220 if (testSet2
!= testSet
) {
221 errln((UnicodeString
)"Fail toPattern: " + source
+ " => " + pat
);
// Parses a series of patterns (intersection, difference, literal '-',
// nesting) through expectPattern(), passing the pattern and an expected
// "pairs" string (third argument). Ends with a check of `set` against a
// pairs string built in `exp`.
// The declarations of `set` and `exp`, and the operation applied before the
// final expectPairs(), are not visible in this excerpt.
228 UnicodeSetTest::TestPatterns(void) {
230 expectPattern(set
, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km");
231 expectPattern(set
, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz");
232 expectPattern(set
, UnicodeString("[a\\-z]", ""), "--aazz");
233 expectPattern(set
, UnicodeString("[-az]", ""), "--aazz");
234 expectPattern(set
, UnicodeString("[az-]", ""), "--aazz");
235 expectPattern(set
, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
237 // Throw in a test of complement
240 exp
.append((UChar
)0x0000).append("aeeoouu").append((UChar
)(0x007a+1)).append((UChar
)0xFFFF);
241 expectPairs(set
, exp
);
// Checks general-category patterns:
//   - " [:Lu:] " must parse (whitespace outside the brackets is accepted)
//     and is passed to expectContainment() with "ABC" and "abc";
//   - for code points 0..0x1FF, [:L:] must agree with u_isalpha() and
//     [:Lu:] must agree with u_charType() == U_UPPERCASE_LETTER.
// `failures` is shared by both loops: the first loop stops at 10 failures
// and the second at a running total of 20.
245 UnicodeSetTest::TestCategories(void) {
246 UErrorCode status
= U_ZERO_ERROR
;
247 const char* pat
= " [:Lu:] "; // Whitespace ok outside [:..:]
248 UnicodeSet
set(pat
, status
);
249 if (U_FAILURE(status
)) {
250 errln((UnicodeString
)"Fail: Can't construct set with " + pat
);
252 expectContainment(set
, pat
, "ABC", "abc");
256 int32_t failures
= 0;
257 // Make sure generation of L doesn't pollute cached Lu set
258 // First generate L, then Lu
259 set
.applyPattern("[:L:]", status
);
260 if (U_FAILURE(status
)) { errln("FAIL"); return; }
261 for (i
=0; i
<0x200; ++i
) {
262 UBool l
= u_isalpha((UChar
)i
);
263 if (l
!= set
.contains(i
)) {
264 errln((UnicodeString
)"FAIL: L contains " + (unsigned short)i
+ " = " +
266 if (++failures
== 10) break;
270 set
.applyPattern("[:Lu:]", status
);
271 if (U_FAILURE(status
)) { errln("FAIL"); return; }
272 for (i
=0; i
<0x200; ++i
) {
273 UBool lu
= (u_charType((UChar
)i
) == U_UPPERCASE_LETTER
);
274 if (lu
!= set
.contains(i
)) {
275 errln((UnicodeString
)"FAIL: Lu contains " + (unsigned short)i
+ " = " +
277 if (++failures
== 20) break;
// Verifies that copy construction, copy-initialization ("=operator"),
// clone(), operator==/!= and hashCode() agree with one another, using two
// different sets: lowercase letters (Ll) and decimal digits (Nd). Each is
// built from two equivalent patterns (\p{...} and [:..:]).
// NOTE(review): the sets are allocated with `new`; the matching deletes are
// not visible in this excerpt -- confirm they exist on every exit path.
282 UnicodeSetTest::TestCloneEqualHash(void) {
283 UErrorCode status
= U_ZERO_ERROR
;
284 // set1 and set2 used to be built with the obsolete constructor taking
285 // UCharCategory values; replaced with pattern constructors
287 UnicodeSet
*set1
=new UnicodeSet("\\p{Lowercase Letter}", status
); // :Ll: Letter, lowercase
288 UnicodeSet
*set1a
=new UnicodeSet("[:Ll:]", status
); // Letter, lowercase
289 if (U_FAILURE(status
)){
// NOTE(review): the message below contains the typo "construst".
290 errln((UnicodeString
)"FAIL: Can't construst set with category->Ll");
293 UnicodeSet
*set2
=new UnicodeSet("\\p{Decimal Number}", status
); //Number, Decimal digit
294 UnicodeSet
*set2a
=new UnicodeSet("[:Nd:]", status
); //Number, Decimal digit
295 if (U_FAILURE(status
)){
296 errln((UnicodeString
)"FAIL: Can't construct set with category->Nd");
// The long and short property spellings must produce equal sets.
300 if (*set1
!= *set1a
) {
301 errln("FAIL: category constructor for Ll broken");
303 if (*set2
!= *set2a
) {
304 errln("FAIL: category constructor for Nd broken");
309 logln("Testing copy construction");
310 UnicodeSet
*set1copy
=new UnicodeSet(*set1
);
311 if(*set1
!= *set1copy
|| *set1
== *set2
||
312 getPairs(*set1
) != getPairs(*set1copy
) ||
313 set1
->hashCode() != set1copy
->hashCode()){
314 errln("FAIL : Error in copy construction");
318 logln("Testing =operator");
319 UnicodeSet set1equal
=*set1
;
320 UnicodeSet set2equal
=*set2
;
321 if(set1equal
!= *set1
|| set1equal
!= *set1copy
|| set2equal
!= *set2
||
322 set2equal
== *set1
|| set2equal
== *set1copy
|| set2equal
== set1equal
){
323 errln("FAIL: Error in =operator");
326 logln("Testing clone()");
327 UnicodeSet
*set1clone
=(UnicodeSet
*)set1
->clone();
328 UnicodeSet
*set2clone
=(UnicodeSet
*)set2
->clone();
329 if(*set1clone
!= *set1
|| *set1clone
!= *set1copy
|| *set1clone
!= set1equal
||
330 *set2clone
!= *set2
|| *set2clone
== *set1copy
|| *set2clone
!= set2equal
||
331 *set2clone
== *set1
|| *set2clone
== set1equal
|| *set2clone
== *set1clone
){
332 errln("FAIL: Error in clone");
335 logln("Testing hashcode");
// NOTE(review): the last two lines of this condition treat equal hash codes
// of the two different sets as a failure, i.e. the test relies on these
// particular sets not colliding.
336 if(set1
->hashCode() != set1equal
.hashCode() || set1
->hashCode() != set1clone
->hashCode() ||
337 set2
->hashCode() != set2equal
.hashCode() || set2
->hashCode() != set2clone
->hashCode() ||
338 set1copy
->hashCode() != set1equal
.hashCode() || set1copy
->hashCode() != set1clone
->hashCode() ||
339 set1
->hashCode() == set2
->hashCode() || set1copy
->hashCode() == set2
->hashCode() ||
340 set2
->hashCode() == set1clone
->hashCode() || set2
->hashCode() == set1equal
.hashCode() ){
341 errln("FAIL: Error in hashCode()");
// Walks a set through a sequence of add/remove operations and, after each
// step, checks its contents with expectPairs() (a string of range start/end
// characters) and its size()/isEmpty() with doAssert(). The second half
// checks set-against-set operations and containsAll().
// Several mutating calls (complement, single-character add, addAll,
// removeAll, retainAll, clear) sit on lines not visible in this excerpt;
// only the checks that follow them are shown.
353 UnicodeSetTest::TestAddRemove(void) {
354 UnicodeSet set
; // Construct empty set
355 doAssert(set
.isEmpty() == TRUE
, "set should be empty");
356 doAssert(set
.size() == 0, "size should be 0");
358 doAssert(set
.size() == 0x110000, "size should be 0x110000");
// Start from [a-z] and carve ranges out of it.
360 set
.add(0x0061, 0x007a);
361 expectPairs(set
, "az");
362 doAssert(set
.isEmpty() == FALSE
, "set should not be empty");
363 doAssert(set
.size() != 0, "size should not be equal to 0");
364 doAssert(set
.size() == 26, "size should be equal to 26");
365 set
.remove(0x006d, 0x0070);
366 expectPairs(set
, "alqz");
367 doAssert(set
.size() == 22, "size should be equal to 22");
368 set
.remove(0x0065, 0x0067);
369 expectPairs(set
, "adhlqz");
370 doAssert(set
.size() == 19, "size should be equal to 19");
371 set
.remove(0x0064, 0x0069);
372 expectPairs(set
, "acjlqz");
373 doAssert(set
.size() == 16, "size should be equal to 16");
374 set
.remove(0x0063, 0x0072);
375 expectPairs(set
, "absz");
376 doAssert(set
.size() == 10, "size should be equal to 10");
377 set
.add(0x0066, 0x0071);
378 expectPairs(set
, "abfqsz");
379 doAssert(set
.size() == 22, "size should be equal to 22");
380 set
.remove(0x0061, 0x0067);
381 expectPairs(set
, "hqsz");
382 set
.remove(0x0061, 0x007a);
383 expectPairs(set
, "");
384 doAssert(set
.isEmpty() == TRUE
, "set should be empty");
385 doAssert(set
.size() == 0, "size should be 0");
387 doAssert(set
.isEmpty() == FALSE
, "set should not be empty");
// NOTE(review): the next three size messages say "should not be equal to N"
// while the asserted condition is size() == N; the message text looks
// inverted -- confirm how doAssert reports its message.
388 doAssert(set
.size() == 1, "size should not be equal to 1");
391 expectPairs(set
, "ac");
392 doAssert(set
.size() == 3, "size should not be equal to 3");
395 expectPairs(set
, "acpq");
396 doAssert(set
.size() == 5, "size should not be equal to 5");
398 expectPairs(set
, "");
399 doAssert(set
.isEmpty() == TRUE
, "set should be empty");
400 doAssert(set
.size() == 0, "size should be 0");
402 // Try removing an entire set from another set
403 expectPattern(set
, "[c-x]", "cx");
405 expectPattern(set2
, "[f-ky-za-bc[vw]]", "acfkvwyz");
407 expectPairs(set
, "deluxx");
409 // Try adding an entire set to another set
410 expectPattern(set
, "[jackiemclean]", "aacceein");
411 expectPattern(set2
, "[hitoshinamekatajamesanderson]", "aadehkmort");
413 expectPairs(set
, "aacehort");
414 doAssert(set
.containsAll(set2
) == TRUE
, "set should contain all the elements in set2");
416 // Try retaining a set of elements contained in another set (intersection)
418 expectPattern(set3
, "[a-c]", "ac");
419 doAssert(set
.containsAll(set3
) == FALSE
, "set doesn't contain all the elements in set3");
421 expectPairs(set3
, "aacc");
422 doAssert(set
.containsAll(set3
) == TRUE
, "set should contain all the elements in set3");
424 expectPairs(set
, "aacc");
425 doAssert(set
.size() == set3
.size(), "set.size() should be set3.size()");
426 doAssert(set
.containsAll(set3
) == TRUE
, "set should contain all the elements in set3");
428 doAssert(set
.size() != set3
.size(), "set.size() != set3.size()");
430 // Test commutativity
431 expectPattern(set
, "[hitoshinamekatajamesanderson]", "aadehkmort");
432 expectPattern(set2
, "[jackiemclean]", "aacceein");
434 expectPairs(set
, "aacehort");
435 doAssert(set
.containsAll(set2
) == TRUE
, "set should contain all the elements in set2");
443 * Make sure minimal representation is maintained.
// Spot checks that overlapping ranges end up as a single range: for each pair
// of patterns loaded into `set` and `set2`, expectPairs() requires `set` to
// be one range (e.g. "hq", "ao", "aw").
// The operation that combines set2 into set between each setup and its check
// is not visible in this excerpt.
445 void UnicodeSetTest::TestMinimalRep() {
446 UErrorCode status
= U_ZERO_ERROR
;
447 // This is pretty thoroughly tested by checkCanonicalRep()
448 // run against the exhaustive operation results. Use the code
449 // here for debugging specific spot problems.
451 // 1 overlap against 2
452 UnicodeSet
set("[h-km-q]", status
);
453 if (U_FAILURE(status
)) { errln("FAIL"); return; }
454 UnicodeSet
set2("[i-o]", status
);
455 if (U_FAILURE(status
)) { errln("FAIL"); return; }
457 expectPairs(set
, "hq");
459 set
.applyPattern("[a-m]", status
);
460 if (U_FAILURE(status
)) { errln("FAIL"); return; }
461 set2
.applyPattern("[e-o]", status
);
462 if (U_FAILURE(status
)) { errln("FAIL"); return; }
464 expectPairs(set
, "ao");
// Same pair as above with the operands swapped.
466 set
.applyPattern("[e-o]", status
);
467 if (U_FAILURE(status
)) { errln("FAIL"); return; }
468 set2
.applyPattern("[a-m]", status
);
469 if (U_FAILURE(status
)) { errln("FAIL"); return; }
471 expectPairs(set
, "ao");
472 // 1 overlap against 3
473 set
.applyPattern("[a-eg-mo-w]", status
);
474 if (U_FAILURE(status
)) { errln("FAIL"); return; }
475 set2
.applyPattern("[d-q]", status
);
476 if (U_FAILURE(status
)) { errln("FAIL"); return; }
478 expectPairs(set
, "aw");
// Broad API coverage test. In order, the visible checks cover:
//   - isEmpty()/getRangeCount()/size() on an empty and a small set;
//   - contains(first, last) on each range, plus the ranges widened by one
//     code point on either side (which must not be contained);
//   - ported InversionList checks on sets a=[3-10], b=[7-15] and `c`
//     (add, complement, exclusiveOr, bitsToSet/setToBits round trip);
//   - JB#2118 coverage: complement(string), complementAll, containsNone,
//     containsSome, removeAll, retainAll, retain, serialize, and
//     UnicodeSetIterator::getString.
// Declarations of `set`, `c`, `buf` and many of the mutating calls sit on
// lines not visible in this excerpt.
481 void UnicodeSetTest::TestAPI() {
482 UErrorCode status
= U_ZERO_ERROR
;
485 if (!set
.isEmpty() || set
.getRangeCount() != 0) {
486 errln((UnicodeString
)"FAIL, set should be empty but isn't: " +
490 // clear(), isEmpty()
493 errln((UnicodeString
)"FAIL, set shouldn't be empty but is: " +
497 if (!set
.isEmpty()) {
498 errln((UnicodeString
)"FAIL, set should be empty but isn't: " +
504 if (set
.size() != 0) {
505 errln((UnicodeString
)"FAIL, size should be 0, but is " + set
.size() +
509 if (set
.size() != 1) {
510 errln((UnicodeString
)"FAIL, size should be 1, but is " + set
.size() +
513 set
.add(0x0031, 0x0039);
514 if (set
.size() != 10) {
515 errln((UnicodeString
)"FAIL, size should be 10, but is " + set
.size() +
519 // contains(first, last)
521 set
.applyPattern("[A-Y 1-8 b-d l-y]", status
);
522 if (U_FAILURE(status
)) { errln("FAIL"); return; }
// For every range [a, b] of the set: [a, b] must be contained, while
// [a-1, b] and [a, b+1] must not be.
523 for (int32_t i
= 0; i
<set
.getRangeCount(); ++i
) {
524 UChar32 a
= set
.getRangeStart(i
);
525 UChar32 b
= set
.getRangeEnd(i
);
526 if (!set
.contains(a
, b
)) {
527 errln((UnicodeString
)"FAIL, should contain " + (unsigned short)a
+ '-' + (unsigned short)b
+
528 " but doesn't: " + set
);
530 if (set
.contains((UChar32
)(a
-1), b
)) {
531 errln((UnicodeString
)"FAIL, shouldn't contain " +
532 (unsigned short)(a
-1) + '-' + (unsigned short)b
+
533 " but does: " + set
);
535 if (set
.contains(a
, (UChar32
)(b
+1))) {
536 errln((UnicodeString
)"FAIL, shouldn't contain " +
537 (unsigned short)a
+ '-' + (unsigned short)(b
+1) +
538 " but does: " + set
);
542 // Ported InversionList test.
543 UnicodeSet
a((UChar32
)3,(UChar32
)10);
544 UnicodeSet
b((UChar32
)7,(UChar32
)15);
547 logln((UnicodeString
)"a [3-10]: " + a
);
548 logln((UnicodeString
)"b [7-15]: " + b
);
// Union of a and b is expected to be [3-15].
551 UnicodeSet
exp((UChar32
)3,(UChar32
)15);
553 logln((UnicodeString
)"c.set(a).add(b): " + c
);
555 errln((UnicodeString
)"FAIL: c.set(a).add(b) = " + c
+ ", expect " + exp
);
// Complement of [3-15] is expected to be [0-2] plus [16-MAX_VALUE].
558 exp
.set((UChar32
)0, (UChar32
)2);
559 exp
.add((UChar32
)16, UnicodeSet::MAX_VALUE
);
561 logln((UnicodeString
)"c.complement(): " + c
);
563 errln((UnicodeString
)"FAIL: c.complement() = " + c
+ ", expect " + exp
);
// Complementing again is expected to restore [3-15].
566 exp
.set((UChar32
)3, (UChar32
)15);
568 logln((UnicodeString
)"c.complement(): " + c
);
570 errln((UnicodeString
)"FAIL: c.complement() = " + c
+ ", expect " + exp
);
// Exclusive-or of a and b is expected to be [3-6] plus [11-15].
574 exp
.set((UChar32
)3,(UChar32
)6);
575 exp
.add((UChar32
)11,(UChar32
) 15);
577 logln((UnicodeString
)"c.set(a).exclusiveOr(b): " + c
);
579 errln((UnicodeString
)"FAIL: c.set(a).exclusiveOr(b) = " + c
+ ", expect " + exp
);
// Round trip c through setToBits()/bitsToSet().
583 bitsToSet(setToBits(c
), c
);
585 logln((UnicodeString
)"bitsToSet(setToBits(c)): " + c
);
587 errln((UnicodeString
)"FAIL: bitsToSet(setToBits(c)) = " + c
+ ", expect " + exp
);
590 // Additional tests for coverage JB#2118
591 //UnicodeSet::complement(class UnicodeString const &)
592 //UnicodeSet::complementAll(class UnicodeString const &)
593 //UnicodeSet::containsNone(class UnicodeSet const &)
594 //UnicodeSet::containsNone(long,long)
595 //UnicodeSet::containsSome(class UnicodeSet const &)
596 //UnicodeSet::containsSome(long,long)
597 //UnicodeSet::removeAll(class UnicodeString const &)
598 //UnicodeSet::retain(long)
599 //UnicodeSet::retainAll(class UnicodeString const &)
600 //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
601 //UnicodeSetIterator::getString(void)
603 set
.complement("ab");
604 exp
.applyPattern("[{ab}]", status
);
605 if (U_FAILURE(status
)) { errln("FAIL"); return; }
606 if (set
!= exp
) { errln("FAIL: complement(\"ab\")"); return; }
// The only element of the set is the string "ab", so the first iterator
// step must yield a string.
608 UnicodeSetIterator
iset(set
);
609 if (!iset
.next() || !iset
.isString()) {
610 errln("FAIL: UnicodeSetIterator::next/isString");
611 } else if (iset
.getString() != "ab") {
612 errln("FAIL: UnicodeSetIterator::getString");
615 set
.add((UChar32
)0x61, (UChar32
)0x7A);
616 set
.complementAll("alan");
617 exp
.applyPattern("[{ab}b-kmo-z]", status
);
618 if (U_FAILURE(status
)) { errln("FAIL"); return; }
619 if (set
!= exp
) { errln("FAIL: complementAll(\"alan\")"); return; }
621 exp
.applyPattern("[a-z]", status
);
622 if (U_FAILURE(status
)) { errln("FAIL"); return; }
623 if (set
.containsNone(exp
)) { errln("FAIL: containsNone(UnicodeSet)"); }
624 if (!set
.containsSome(exp
)) { errln("FAIL: containsSome(UnicodeSet)"); }
625 exp
.applyPattern("[aln]", status
);
626 if (U_FAILURE(status
)) { errln("FAIL"); return; }
627 if (!set
.containsNone(exp
)) { errln("FAIL: containsNone(UnicodeSet)"); }
628 if (set
.containsSome(exp
)) { errln("FAIL: containsSome(UnicodeSet)"); }
// Range forms: a-z overlaps the set, A-Z does not.
630 if (set
.containsNone((UChar32
)0x61, (UChar32
)0x7A)) {
631 errln("FAIL: containsNone(UChar32, UChar32)");
633 if (!set
.containsSome((UChar32
)0x61, (UChar32
)0x7A)) {
634 errln("FAIL: containsSome(UChar32, UChar32)");
636 if (!set
.containsNone((UChar32
)0x41, (UChar32
)0x5A)) {
637 errln("FAIL: containsNone(UChar32, UChar32)");
639 if (set
.containsSome((UChar32
)0x41, (UChar32
)0x5A)) {
640 errln("FAIL: containsSome(UChar32, UChar32)");
643 set
.removeAll("liu");
644 exp
.applyPattern("[{ab}b-hj-kmo-tv-z]", status
);
645 if (U_FAILURE(status
)) { errln("FAIL"); return; }
646 if (set
!= exp
) { errln("FAIL: removeAll(\"liu\")"); return; }
648 set
.retainAll("star");
649 exp
.applyPattern("[rst]", status
);
650 if (U_FAILURE(status
)) { errln("FAIL"); return; }
651 if (set
!= exp
) { errln("FAIL: retainAll(\"star\")"); return; }
653 set
.retain((UChar32
)0x73);
654 exp
.applyPattern("[s]", status
);
655 if (U_FAILURE(status
)) { errln("FAIL"); return; }
656 if (set
!= exp
) { errln("FAIL: retain('s')"); return; }
// Serialize the one-character set [s]: the check below expects three units,
// namely 2, 0x73 and 0x74.
659 int32_t slen
= set
.serialize(buf
, sizeof(buf
)/sizeof(buf
[0]), status
);
660 if (U_FAILURE(status
)) { errln("FAIL: serialize"); return; }
661 if (slen
!= 3 || buf
[0] != 2 || buf
[1] != 0x73 || buf
[2] != 0x74) {
662 errln("FAIL: serialize");
// Compares sets built through the programmatic API (createFromAll,
// createFrom, add, retain, complement) with sets parsed from the equivalent
// pattern. `testList` holds the sets in pairs; the loop walks it two entries
// at a time until a NULL entry and compares each pair with operator==.
// The NULL terminator of testList is not visible in this excerpt.
667 void UnicodeSetTest::TestStrings() {
668 UErrorCode ec
= U_ZERO_ERROR
;
670 UnicodeSet
* testList
[] = {
671 UnicodeSet::createFromAll("abc"),
672 new UnicodeSet("[a-c]", ec
),
674 &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
675 new UnicodeSet("[{ll}{ch}a-z]", ec
),
677 UnicodeSet::createFrom("ab}c"),
678 new UnicodeSet("[{ab\\}c}]", ec
),
680 &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
681 new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec
),
687 errln("FAIL: couldn't construct test sets");
690 for (int32_t i
= 0; testList
[i
] != NULL
; i
+=2) {
692 UnicodeString pat0
, pat1
;
693 testList
[i
]->toPattern(pat0
, TRUE
);
694 testList
[i
+1]->toPattern(pat1
, TRUE
);
695 if (*testList
[i
] == *testList
[i
+1]) {
696 logln((UnicodeString
)"Ok: " + pat0
+ " == " + pat1
);
// NOTE(review): the mismatch case is reported through logln(), not errln(),
// even though the message starts with "FAIL". If logln() is informational,
// as its use elsewhere in this file suggests, a mismatch would not fail the
// test -- confirm intent.
698 logln((UnicodeString
)"FAIL: " + pat0
+ " != " + pat1
);
702 delete testList
[i
+1];
707 * Test the [:Latin:] syntax.
// Checks script patterns through expectContainment(pattern, a, b): the
// arguments are swapped between the Latin and Greek cases ("aA" versus
// U+0391 U+03B1), and a third case unions [:Common:] with [:Inherited:].
// Presumably the first string lists characters that must be in the set and
// the second characters that must not -- confirm against expectContainment.
709 void UnicodeSetTest::TestScriptSet() {
710 expectContainment("[:Latin:]", "aA", CharsToUnicodeString("\\u0391\\u03B1"));
712 expectContainment("[:Greek:]", CharsToUnicodeString("\\u0391\\u03B1"), "aA");
715 expectContainment("[[:Common:][:Inherited:]]", CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
720 * Test Unicode property-set patterns ([:...:], \p{...}, \P{...}, \N{...}).
// Table-driven test of property and regex-compatible set patterns. DATA is
// read as triples (pattern, characters IN, characters NOT in); each triple is
// passed to expectContainment() after unescaping the two character strings.
// Many table rows are not visible in this excerpt.
722 void UnicodeSetTest::TestPropertySet() {
723 static const char* DATA
[] = {
724 // Pattern, Chars IN, Chars NOT in
734 "\\P{ GENERAL Category = upper case letter }",
738 // Combining class: @since ICU 2.2
739 // Check both symbolic and numeric
744 "\\p{Canonical Combining Class = 11}",
748 "[:c c c = iota subscript :]",
752 // Bidi class: @since ICU 2.2
753 "\\p{bidiclass=lefttoright}",
757 // Binary properties: @since ICU 2.2
764 // weiv: )(and * were removed from math in Unicode 4.0.1
768 // JB#1767 \N{}, \p{ASCII}
773 "[\\N{ latin small letter a }[:name= latin small letter z:]]",
789 "\\u03D8\\u03D9", // 3.2
792 "\\u1800\\u3400\\U0002f800",
793 "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
795 // JB#2350: Case_Sensitive
796 "[:Case Sensitive:]",
797 "A\\u1FFC\\U00010410",
798 ";\\u00B4\\U00010500",
800 // JB#2832: C99-compatibility props
807 " \\u0003\\u0007\\u0009\\u000A\\u000D",
810 "!@#%&*()[]{}-_\\/;:,.?'\"",
817 // Regex compatibility test
818 "[-b]", // leading '-' is literal
822 "[^-b]", // leading '-' is literal
826 "[b-]", // trailing '-' is literal
830 "[^b-]", // trailing '-' is literal
834 "[a-b-]", // trailing '-' is literal
838 "[[a-q]&[p-z]-]", // trailing '-' is literal
842 "[\\s|\\)|:|$|\\>]", // from regex tests
846 "[\\uDC00cd]", // JB#2906: isolated trail at start
848 "ab\\uD800\\U00010000",
850 "[ab\\uD800]", // JB#2906: isolated lead at end
852 "cd\\uDC00\\U00010000",
854 "[ab\\uD800cd]", // JB#2906: isolated lead in middle
856 "ef\\uDC00\\U00010000",
858 "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
860 "ef\\uD800\\U00010000",
862 "[:^lccc=0:]", // Lead canonical class
864 "abcd\\u00c0\\u00c5",
866 "[:^tccc=0:]", // Trail canonical class
867 "\\u0300\\u0301\\u00c0\\u00c5",
870 "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
871 "\\u0300\\u0301\\u00c0\\u00c5",
874 "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
876 "abcd\\u0300\\u0301\\u00c0\\u00c5",
878 "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
879 "\\u0F73\\u0F75\\u0F81",
880 "abcd\\u0300\\u0301\\u00c0\\u00c5",
884 static const int32_t DATA_LEN
= sizeof(DATA
)/sizeof(DATA
[0]);
// Step through the table one triple at a time. DATA[i+2] is read on every
// iteration, so the table length must be a multiple of 3.
886 for (int32_t i
=0; i
<DATA_LEN
; i
+=3) {
887 expectContainment(DATA
[i
], CharsToUnicodeString(DATA
[i
+1]),
888 CharsToUnicodeString(DATA
[i
+2]));
893 * Test cloning of UnicodeSet. For C++, we test the copy constructor.
// Builds the set [abcxyz] in `s` and passes `t` to expectContainment() with
// "abc" and "def". The declaration of `t` is not visible in this excerpt
// (presumably a copy of `s`, per the comment above the function -- confirm).
895 void UnicodeSetTest::TestClone() {
896 UErrorCode ec
= U_ZERO_ERROR
;
897 UnicodeSet
s("[abcxyz]", ec
);
899 expectContainment(t
, "abc", "def");
903 * Test the indexOf() and charAt() methods.
// Checks that charAt() and indexOf() are inverses for every index in
// [0, size()), then probes charAt(size()), which is out of range, and
// indexOf('q') for a character that is not in the set [a-cx-y3578].
// The conditions guarding the last two error reports are not visible in this
// excerpt.
905 void UnicodeSetTest::TestIndexOf() {
906 UErrorCode ec
= U_ZERO_ERROR
;
907 UnicodeSet
set("[a-cx-y3578]", ec
);
909 errln("FAIL: UnicodeSet constructor");
912 for (int32_t i
=0; i
<set
.size(); ++i
) {
913 UChar32 c
= set
.charAt(i
);
914 if (set
.indexOf(c
) != i
) {
915 errln("FAIL: charAt(%d) = %X => indexOf() => %d",
916 i
, c
, set
.indexOf(c
));
// One past the last valid index.
919 UChar32 c
= set
.charAt(set
.size());
921 errln("FAIL: charAt(<out of range>) = %X", c
);
923 int32_t j
= set
.indexOf((UChar32
)0x71/*'q'*/);
925 errln((UnicodeString
)"FAIL: indexOf('q') = " + j
);
// Table-driven test of UnicodeSet::closeOver(). DATA is read as triples
// (selector, input pattern, expected pattern) up to a NULL entry. The
// selector is a one-element char array (CASE or CASE_MAPPINGS) whose first
// element is the closeOver() attribute value. After the table, the
// pattern-level options USET_CASE_INSENSITIVE and USET_ADD_CASE_MAPPINGS are
// exercised through applyPattern() and the UnicodeSet constructor.
// Many table rows and the declarations of `s`, `t`, `buf` are not visible in
// this excerpt.
932 void UnicodeSetTest::TestCloseOver() {
933 UErrorCode ec
= U_ZERO_ERROR
;
935 char CASE
[] = {(char)USET_CASE
};
936 char CASE_MAPPINGS
[] = {(char)USET_ADD_CASE_MAPPINGS
};
937 const char* DATA
[] = {
938 // selector, input, output
940 "[aq\\u00DF{Bc}{bC}{Fi}]",
941 "[aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]",
945 "[\\u01F1\\u01F2\\u01F3]",
949 "[\\u1FB4{\\u03AC\\u03B9}]",
955 CASE
, // make sure binary search finds limits
957 "[aA\\uFF3A\\uFF5A]",
960 "[a-z]","[A-Za-z\\u017F\\u212A]",
967 "[aq\\u00DF{Bc}{bC}{Fi}]",
968 "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
972 "[\\u01F1\\u01F2\\u01F3]",
983 for (int32_t i
=0; DATA
[i
]!=NULL
; i
+=3) {
// The selector value is the first char of the selector entry.
984 int32_t selector
= DATA
[i
][0];
985 UnicodeString
pat(DATA
[i
+1]);
986 UnicodeString
exp(DATA
[i
+2]);
987 s
.applyPattern(pat
, ec
);
988 s
.closeOver(selector
);
989 t
.applyPattern(exp
, ec
);
991 errln("FAIL: applyPattern failed");
995 logln((UnicodeString
)"Ok: " + pat
+ ".closeOver(" + selector
+ ") => " + exp
);
998 errln((UnicodeString
)"FAIL: " + pat
+ ".closeOver(" + selector
+ ") => " +
999 s
.toPattern(buf
, TRUE
) + ", expected " + exp
);
1003 // Test the pattern API
1004 s
.applyPattern("[abc]", USET_CASE_INSENSITIVE
, NULL
, ec
);
1005 if (U_FAILURE(ec
)) {
1006 errln("FAIL: applyPattern failed");
1008 expectContainment(s
, "abcABC", "defDEF");
1010 UnicodeSet
v("[^abc]", USET_CASE_INSENSITIVE
, NULL
, ec
);
1011 if (U_FAILURE(ec
)) {
1012 errln("FAIL: constructor failed");
1014 expectContainment(v
, "defDEF", "abcABC");
1016 UnicodeSet
cm("[abck]", USET_ADD_CASE_MAPPINGS
, NULL
, ec
);
1017 if (U_FAILURE(ec
)) {
1018 errln("FAIL: construct w/case mappings failed");
1020 expectContainment(cm
, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
// Parses a pattern made of \u/\U escapes in two passes. In pass 2 the pattern
// is unescaped before parsing and is expected to be rejected (see the
// original comment below). When parsing succeeds, the set is checked for an
// unexpected member, its escaped toPattern() output is compared with `exp`,
// and every range start is checked to be non-negative.
// The declaration line of `exp` (whose string literal is the second literal
// below) and the pass-2 guard around unescape() are not visible in this
// excerpt.
1024 void UnicodeSetTest::TestEscapePattern() {
1025 const char pattern
[] =
1026 "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1028 "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1029 // We test this with two passes; in the second pass we
1030 // pre-unescape the pattern. Since U+200E is rule whitespace,
1031 // this fails -- which is what we expect.
1032 for (int32_t pass
=1; pass
<=2; ++pass
) {
1033 UErrorCode ec
= U_ZERO_ERROR
;
1034 UnicodeString
pat(pattern
);
1036 pat
= pat
.unescape();
1038 // Pattern is only good for pass 1
1039 UBool isPatternValid
= (pass
==1);
1041 UnicodeSet
set(pat
, ec
);
1042 if (U_SUCCESS(ec
) != isPatternValid
){
1043 errln((UnicodeString
)"FAIL: applyPattern(" +
1044 escape(pat
) + ") => " +
1048 if (U_FAILURE(ec
)) {
// NOTE(review): the code point tested is 0x0644, but the message below
// reports U+0664 -- one of the two looks like a typo.
1051 if (set
.contains((UChar
)0x0644)){
1052 errln((UnicodeString
)"FAIL: " + escape(pat
) + " contains(U+0664)");
1055 UnicodeString newpat
;
1056 set
.toPattern(newpat
, TRUE
);
1057 if (newpat
== exp
) {
1058 logln(escape(pat
) + " => " + newpat
);
1060 errln((UnicodeString
)"FAIL: " + escape(pat
) + " => " + newpat
);
// Build a description of each range; the (UChar)(0x30 + i) append yields a
// single digit character only while i < 10.
1063 for (int32_t i
=0; i
<set
.getRangeCount(); ++i
) {
1064 UnicodeString
str("Range ");
1065 str
.append((UChar
)(0x30 + i
))
1067 .append((UChar32
)set
.getRangeStart(i
))
1069 .append((UChar32
)set
.getRangeEnd(i
));
1070 str
= str
+ " (" + set
.getRangeStart(i
) + " - " +
1071 set
.getRangeEnd(i
) + ")";
1072 if (set
.getRangeStart(i
) < 0) {
1073 errln((UnicodeString
)"FAIL: " + escape(str
));
// Helper for TestInvalidCodePoint: builds the expected set [start, end] and
// either logs `set` or reports an error showing both the actual and the
// expected pattern. `label` prefixes the message.
// The comparison that selects between the two branches and the declarations
// of `pat`/`xpat` are not visible in this excerpt.
1081 void UnicodeSetTest::expectRange(const UnicodeString
& label
,
1082 const UnicodeSet
& set
,
1083 UChar32 start
, UChar32 end
) {
1084 UnicodeSet
exp(start
, end
);
1087 logln(label
+ " => " + set
.toPattern(pat
, TRUE
));
1090 errln((UnicodeString
)"FAIL: " + label
+ " => " +
1091 set
.toPattern(pat
, TRUE
) +
1092 ", expected " + exp
.toPattern(xpat
, TRUE
));
// Feeds out-of-range code points to the range-taking UnicodeSet API.
// Part 1: DATA rows are (test start, test end, expected start, expected end);
// the visible rows expect an out-of-range bound to be pinned to 0 or
// 0x10FFFF. Each row is run through the constructor, set(), add(), retain(),
// remove() and complement(), with results checked by expectRange().
// Part 2: DATA2 holds single code points; contains(c) and indexOf(c) on the
// full set must succeed exactly when c is within 0..0x10FFFF.
// The expectRange() argument tails, the DATA2 contents and some conditions
// are not visible in this excerpt.
1096 void UnicodeSetTest::TestInvalidCodePoint() {
1098 const UChar32 DATA
[] = {
1099 // Test range Expected range
1100 0, 0x10FFFF, 0, 0x10FFFF,
1101 (UChar32
)-1, 8, 0, 8,
1102 8, 0x110000, 8, 0x10FFFF
1104 const int32_t DATA_LENGTH
= sizeof(DATA
)/sizeof(DATA
[0]);
// One table row (four values) per iteration.
1109 for (i
=0; i
<DATA_LENGTH
; i
+=4) {
1110 UChar32 start
= DATA
[i
];
1111 UChar32 end
= DATA
[i
+1];
1112 UChar32 xstart
= DATA
[i
+2];
1113 UChar32 xend
= DATA
[i
+3];
1115 // Try various API using the test code points
1117 UnicodeSet
set(start
, end
);
1118 expectRange((UnicodeString
)"ct(" + start
+ "," + end
+ ")",
1122 set
.set(start
, end
);
1123 expectRange((UnicodeString
)"set(" + start
+ "," + end
+ ")",
// These calls are made with the raw test values; their results are not
// checked in the visible lines.
1126 UBool b
= set
.contains(start
);
1127 b
= set
.contains(start
, end
);
1128 b
= set
.containsNone(start
, end
);
1129 b
= set
.containsSome(start
, end
);
1131 /*int32_t index = set.indexOf(start);*/
1135 set
.add(start
, end
);
1136 expectRange((UnicodeString
)"add(" + start
+ "," + end
+ ")",
1139 set
.set(0, 0x10FFFF);
1140 set
.retain(start
, end
);
1141 expectRange((UnicodeString
)"retain(" + start
+ "," + end
+ ")",
1145 set
.set(0, 0x10FFFF);
1147 set
.remove(start
, end
);
1149 expectRange((UnicodeString
)"!remove(" + start
+ "," + end
+ ")",
1152 set
.set(0, 0x10FFFF);
1153 set
.complement(start
, end
);
1155 expectRange((UnicodeString
)"!complement(" + start
+ "," + end
+ ")",
1157 set
.complement(start
);
1160 const UChar32 DATA2
[] = {
1166 const int32_t DATA2_LENGTH
= sizeof(DATA2
)/sizeof(DATA2
[0]);
1168 for (i
=0; i
<DATA2_LENGTH
; ++i
) {
1169 UChar32 c
= DATA2
[i
], end
= 0x10FFFF;
// `valid` is true exactly for code points in 0..0x10FFFF.
1170 UBool valid
= (c
>= 0 && c
<= 0x10FFFF);
1172 UnicodeSet
set(0, 0x10FFFF);
1174 // For single-codepoint contains, invalid codepoints are NOT contained
1175 UBool b
= set
.contains(c
);
1177 logln((UnicodeString
)"[\\u0000-\\U0010FFFF].contains(" + c
+
1180 errln((UnicodeString
)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c
+
1184 // For codepoint range contains, containsNone, and containsSome,
1185 // invalid or empty (start > end) ranges have UNDEFINED behavior.
1186 b
= set
.contains(c
, end
);
1187 logln((UnicodeString
)"* [\\u0000-\\U0010FFFF].contains(" + c
+
1188 "," + end
+ ") = " + b
);
1190 b
= set
.containsNone(c
, end
);
1191 logln((UnicodeString
)"* [\\u0000-\\U0010FFFF].containsNone(" + c
+
1192 "," + end
+ ") = " + b
);
1194 b
= set
.containsSome(c
, end
);
1195 logln((UnicodeString
)"* [\\u0000-\\U0010FFFF].containsSome(" + c
+
1196 "," + end
+ ") = " + b
);
// indexOf() must return a non-negative index exactly when c is valid.
1198 int32_t index
= set
.indexOf(c
);
1199 if ((index
>= 0) == valid
) {
1200 logln((UnicodeString
)"[\\u0000-\\U0010FFFF].indexOf(" + c
+
1203 errln((UnicodeString
)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c
+
1209 // Used by TestSymbolTable
// Minimal SymbolTable implementation backed by `contents`, a table mapping
// variable names to heap-allocated UnicodeString values. The constructor
// registers uhash_deleteUnicodeString as the value deleter.
// The declaration of `contents` and several closing braces/returns are not
// visible in this excerpt.
1210 class TokenSymbolTable
: public SymbolTable
{
1214 TokenSymbolTable(UErrorCode
& ec
) : contents(FALSE
, ec
) {
1215 contents
.setValueDeleter(uhash_deleteUnicodeString
);
1218 ~TokenSymbolTable() {}
1221 * (Non-SymbolTable API) Add the given variable and value to
1222 * the table. Variable should NOT contain leading '$'.
1224 void add(const UnicodeString
& var
, const UnicodeString
& value
,
// Nothing is stored when `ec` already indicates failure; otherwise a copy of
// `value` is allocated and put into the table under `var`.
1226 if (U_SUCCESS(ec
)) {
1227 contents
.put(var
, new UnicodeString(value
), ec
);
// Returns whatever `contents` holds for `s`, cast to a UnicodeString pointer.
1234 virtual const UnicodeString
* lookup(const UnicodeString
& s
) const {
1235 return (const UnicodeString
*) contents
.get(s
);
1241 virtual const UnicodeFunctor
* lookupMatcher(UChar32
/*ch*/) const {
// Scans identifier characters starting at pos.getIndex(): the first must
// satisfy u_isIDStart and all must satisfy u_isIDPart. Returns an empty
// string when no valid name character is found, otherwise extracts the
// scanned span into `result`.
1248 virtual UnicodeString
parseReference(const UnicodeString
& text
,
1249 ParsePosition
& pos
, int32_t limit
) const {
1250 int32_t start
= pos
.getIndex();
1252 UnicodeString result
;
1254 UChar c
= text
.charAt(i
);
1255 if ((i
==start
&& !u_isIDStart(c
)) || !u_isIDPart(c
)) {
1260 if (i
== start
) { // No valid name chars
1261 return result
; // Indicate failure with empty string
1264 text
.extractBetween(start
, i
, result
);
// Exercises UnicodeSet pattern parsing with a caller-supplied symbol table.
// For each DATA record the test fills a TokenSymbolTable with the variable
// definitions, parses the input pattern through the ParsePosition-based
// UnicodeSet constructor (USET_IGNORE_SPACE, &sym), checks that the parse
// consumed the whole input string, builds a second set from the expected
// pattern, and reports the toPattern() text of both sets.
1269 void UnicodeSetTest::TestSymbolTable() {
1270 // Multiple test cases can be set up here. Each test case
1271 // is terminated by null:
1272 // var, value, var, value,..., input pat., exp. output pat., null
1273 const char* DATA
[] = {
// The three records define $us as a bare range, as a bracketed set, and as
// backslash-escaped literal characters respectively.
1274 "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL
,
1275 "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL
,
1276 "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL
,
1280 for (int32_t i
=0; DATA
[i
]!=NULL
; ++i
) {
1281 UErrorCode ec
= U_ZERO_ERROR
;
1282 TokenSymbolTable
sym(ec
);
1283 if (U_FAILURE(ec
)) {
1284 errln("FAIL: couldn't construct TokenSymbolTable");
// While a third entry follows the current pair, the pair is a variable
// definition (not yet the input/expected patterns) and is added to the table.
// NOTE(review): the statement that advances i past each pair is not shown -- confirm.
1289 while (DATA
[i
+2] != NULL
) {
1290 sym
.add(DATA
[i
], DATA
[i
+1], ec
);
1291 if (U_FAILURE(ec
)) {
1292 errln("FAIL: couldn't add to TokenSymbolTable");
1298 // Input pattern and expected output pattern
1299 UnicodeString inpat
= DATA
[i
], exppat
= DATA
[i
+1];
1302 ParsePosition
pos(0);
1303 UnicodeSet
us(inpat
, pos
, USET_IGNORE_SPACE
, &sym
, ec
);
1304 if (U_FAILURE(ec
)) {
1305 errln("FAIL: couldn't construct UnicodeSet");
// The parse position must land exactly at the end of the input pattern.
1310 if (pos
.getIndex() != inpat
.length()) {
1311 errln((UnicodeString
)"Failed to read to end of string \""
1312 + inpat
+ "\": read to "
1313 + pos
.getIndex() + ", length is "
1317 UnicodeSet
us2(exppat
, ec
);
1318 if (U_FAILURE(ec
)) {
1319 errln("FAIL: couldn't construct expected UnicodeSet");
// NOTE(review): the comparison of us against us2 that selects between the
// "Failed" and "Ok" messages is not shown here -- confirm it compares the sets
// themselves rather than the pattern strings.
1325 errln((UnicodeString
)"Failed, got " + us
.toPattern(a
, TRUE
) +
1326 ", expected " + us2
.toPattern(b
, TRUE
));
1328 logln((UnicodeString
)"Ok, got " + us
.toPattern(a
, TRUE
));
// Verifies that a supplementary character written in a pattern is treated as
// a single code point. Every pattern in DATA must yield a set that contains
// 'a', 'b', 'c' and U+10000, contains none of the characters in the string
// "\uD800;\uDC00" (the two lone surrogates and the ';' between them), and
// reports size() == 4.
1333 void UnicodeSetTest::TestSurrogate() {
1334 const char* DATA
[] = {
1335 // These should all behave identically
1336 "[abc\\uD800\\uDC00]",
1337 // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
// DATA is scanned until a 0 entry is reached.
1341 for (int i
=0; DATA
[i
] != 0; ++i
) {
1342 UErrorCode ec
= U_ZERO_ERROR
;
1343 logln((UnicodeString
)"Test pattern " + i
+ " :" + DATA
[i
]);
1344 UnicodeSet
set(DATA
[i
], ec
);
1345 if (U_FAILURE(ec
)) {
1346 errln("FAIL: UnicodeSet constructor");
1349 expectContainment(set
,
1350 CharsToUnicodeString("abc\\U00010000"),
1351 CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
// Four elements are expected: a, b, c and the one supplementary code point.
1352 if (set
.size() != 4) {
1353 errln((UnicodeString
)"FAIL: " + DATA
[i
] + ".size() == " +
1354 set
.size() + ", expected 4");
// Exhaustive small-universe test. Every integer below `limit` (128) is treated
// as a bitmask describing a set. Each value i is passed to _testComplement and
// checkRoundTrip, and each ordered pair (i, j) to _testAdd, _testXor,
// _testRetain and _testRemove, giving limit * limit pair checks.
// x, y, z and aa are scratch sets reused across iterations.
1359 void UnicodeSetTest::TestExhaustive() {
1360 // exhaustive tests. Simulate UnicodeSets with integers.
1361 // That gives us very solid tests (except for large memory tests).
1363 int32_t limit
= 128;
1365 UnicodeSet x
, y
, z
, aa
;
1367 for (int32_t i
= 0; i
< limit
; ++i
) {
1369 logln((UnicodeString
)"Testing " + i
+ ", " + x
);
1370 _testComplement(i
, x
, y
);
1372 // AS LONG AS WE ARE HERE, check roundtrip
1373 checkRoundTrip(bitsToSet(i
, aa
));
1375 for (int32_t j
= 0; j
< limit
; ++j
) {
1376 _testAdd(i
,j
, x
,y
,z
);
1377 _testXor(i
,j
, x
,y
,z
);
1378 _testRetain(i
,j
, x
,y
,z
);
1379 _testRemove(i
,j
, x
,y
,z
);
// Complement check for the bitmask `a`, using x and z as scratch sets.
// The result set z is read back as a bitmask with setToBits(); on failure both
// the set form (x, z) and the integer form (a, c) are logged, and z is then
// checked for canonical form under the label "complement <a>".
// NOTE(review): the statements that fill x and z, and the comparison guarding
// the errln calls, are not shown here -- confirm.
// NOTE(review): the failure messages are prefixed "add:" although this is the
// complement check; this looks like a copy-paste slip in the runtime text.
1384 void UnicodeSetTest::_testComplement(int32_t a
, UnicodeSet
& x
, UnicodeSet
& z
) {
1388 int32_t c
= setToBits(z
);
1390 errln((UnicodeString
)"FAILED: add: ~" + x
+ " != " + z
);
1391 errln((UnicodeString
)"FAILED: add: ~" + a
+ " != " + c
);
1393 checkCanonicalRep(z
, (UnicodeString
)"complement " + a
);
// Union check for the bitmasks `a` and `b`, using x, y and z as scratch sets.
// The result set z is read back as a bitmask with setToBits(); on failure the
// operands and result are logged both as sets (x | y != z) and as integers
// (a | b != c), and z is then checked for canonical form under the label
// "add <a>,<b>".
// NOTE(review): the statements that fill x, y and z, and the comparison
// guarding the errln calls, are not shown here -- confirm.
1396 void UnicodeSetTest::_testAdd(int32_t a
, int32_t b
, UnicodeSet
& x
, UnicodeSet
& y
, UnicodeSet
& z
) {
1401 int32_t c
= setToBits(z
);
1403 errln((UnicodeString
)"FAILED: add: " + x
+ " | " + y
+ " != " + z
);
1404 errln((UnicodeString
)"FAILED: add: " + a
+ " | " + b
+ " != " + c
);
1406 checkCanonicalRep(z
, (UnicodeString
)"add " + a
+ "," + b
);
// Intersection check for the bitmasks `a` and `b`, using x, y and z as scratch
// sets. The result set z is read back as a bitmask with setToBits(); on
// failure the operands and result are logged both as sets (x & y != z) and as
// integers (a & b != c), and z is then checked for canonical form under the
// label "retain <a>,<b>".
// NOTE(review): the statements that fill x, y and z, and the comparison
// guarding the errln calls, are not shown here -- confirm.
1409 void UnicodeSetTest::_testRetain(int32_t a
, int32_t b
, UnicodeSet
& x
, UnicodeSet
& y
, UnicodeSet
& z
) {
1414 int32_t c
= setToBits(z
);
1416 errln((UnicodeString
)"FAILED: retain: " + x
+ " & " + y
+ " != " + z
);
1417 errln((UnicodeString
)"FAILED: retain: " + a
+ " & " + b
+ " != " + c
);
1419 checkCanonicalRep(z
, (UnicodeString
)"retain " + a
+ "," + b
);
// Set-difference check for the bitmasks `a` and `b`, using x, y and z as
// scratch sets. The result set z is read back as a bitmask and must equal
// (a &~ b); on mismatch the operands and result are logged both as sets and as
// integers. z is then checked for canonical form under the label
// "remove <a>,<b>".
// NOTE(review): the statements that fill x, y and z are not shown here -- confirm.
1422 void UnicodeSetTest::_testRemove(int32_t a
, int32_t b
, UnicodeSet
& x
, UnicodeSet
& y
, UnicodeSet
& z
) {
1427 int32_t c
= setToBits(z
);
// Integer model of the expected result: bits of a that are not set in b.
1428 if (c
!= (a
&~ b
)) {
1429 errln((UnicodeString
)"FAILED: remove: " + x
+ " &~ " + y
+ " != " + z
);
1430 errln((UnicodeString
)"FAILED: remove: " + a
+ " &~ " + b
+ " != " + c
);
1432 checkCanonicalRep(z
, (UnicodeString
)"remove " + a
+ "," + b
);
// Symmetric-difference check for the bitmasks `a` and `b`, using x, y and z as
// scratch sets. The result set z is read back as a bitmask with setToBits();
// on failure the operands and result are logged both as sets (x ^ y != z) and
// as integers (a ^ b != c). The messages and the checkCanonicalRep label use
// the word "complement" for this operation.
// NOTE(review): the statements that fill x, y and z, and the comparison
// guarding the errln calls, are not shown here -- confirm.
1435 void UnicodeSetTest::_testXor(int32_t a
, int32_t b
, UnicodeSet
& x
, UnicodeSet
& y
, UnicodeSet
& z
) {
1440 int32_t c
= setToBits(z
);
1442 errln((UnicodeString
)"FAILED: complement: " + x
+ " ^ " + y
+ " != " + z
);
1443 errln((UnicodeString
)"FAILED: complement: " + a
+ " ^ " + b
+ " != " + c
);
1445 checkCanonicalRep(z
, (UnicodeString
)"complement " + a
+ "," + b
);
// Structural sanity check of a set's range list. `msg` names the operation
// that produced `set` and is embedded in every failure message.
// Reported failures: a range count that violates the ">= 0" expectation, a
// range whose start exceeds its end, and a range (other than the first) whose
// start is not greater than `last`.
// NOTE(review): the declaration and update of `last` are not shown here; it is
// presumably the end of the previous range -- confirm.
1449 * Check that ranges are monotonically increasing and non-
1452 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet
& set
, const UnicodeString
& msg
) {
1453 int32_t n
= set
.getRangeCount();
1455 errln((UnicodeString
)"FAIL result of " + msg
+
1456 ": range count should be >= 0 but is " +
1457 n
/*+ " for " + set.toPattern())*/);
1461 for (int32_t i
=0; i
<n
; ++i
) {
1462 UChar32 start
= set
.getRangeStart(i
);
1463 UChar32 end
= set
.getRangeEnd(i
);
// Range numbers in the messages are 1-based (i+1).
1465 errln((UnicodeString
)"FAIL result of " + msg
+
1466 ": range " + (i
+1) +
1467 " start > end: " + (int)start
+ ", " + (int)end
+
1470 if (i
> 0 && start
<= last
) {
1471 errln((UnicodeString
)"FAIL result of " + msg
+
1472 ": range " + (i
+1) +
1473 " overlaps previous range: " + (int)start
+ ", " + (int)end
+
// Fills `result` from the bitmask `a` and returns a reference to a UnicodeSet
// (TestExhaustive passes the return value straight to checkRoundTrip).
// Bits 0..31 of `a` are tested one by one.
// NOTE(review): the statements that clear `result` and add code point i for a
// set bit are not shown here -- confirm.
// NOTE(review): for i == 31 the expression 1<<i shifts into the sign bit of
// int. TestExhaustive only passes masks below 128, but confirm before reusing
// this helper with larger masks.
1481 * Convert a bitmask to a UnicodeSet.
1483 UnicodeSet
& UnicodeSetTest::bitsToSet(int32_t a
, UnicodeSet
& result
) {
1485 for (UChar32 i
= 0; i
< 32; ++i
) {
1486 if ((a
& (1<<i
)) != 0) {
// Inverse of bitsToSet: probes x.contains() for each of the 32 code points
// 0..31 and returns an int32_t bitmask.
// NOTE(review): the loop runs for i < 32, i.e. U+0000..U+001F; the header text
// below says "U+0000 to U+0020", which is one code point too many.
// NOTE(review): the accumulation of the result bitmask is not shown here --
// confirm bit i corresponds to code point i.
1494 * Convert a UnicodeSet to a bitmask. Only the characters
1495 * U+0000 to U+0020 are represented in the bitmask.
1497 int32_t UnicodeSetTest::setToBits(const UnicodeSet
& x
) {
1499 for (int32_t i
= 0; i
< 32; ++i
) {
1500 if (x
.contains((UChar32
)i
)) {
// Builds a compact string describing the ranges of `set`: for every range, in
// index order, the start and end code points are appended as two UChar values.
// Both values are cast to UChar before appending, so only 16 bits of each
// code point reach the string.
// NOTE(review): the condition that triggers the early loop exit below (setting
// i to the range count) is not shown here; presumably it handles ranges
// reaching beyond the BMP -- confirm.
1508 * Return the representation of an inversion list based UnicodeSet
1509 * as a pairs list. Ranges are listed in ascending Unicode order.
1510 * For example, the set [a-zA-M3] is represented as "33AMaz".
1512 UnicodeString
UnicodeSetTest::getPairs(const UnicodeSet
& set
) {
1513 UnicodeString pairs
;
1514 for (int32_t i
=0; i
<set
.getRangeCount(); ++i
) {
1515 UChar32 start
= set
.getRangeStart(i
);
1516 UChar32 end
= set
.getRangeEnd(i
);
1519 i
= set
.getRangeCount(); // Should be unnecessary
1521 pairs
.append((UChar
)start
).append((UChar
)end
);
// Round-trip consistency checks for the set `s` against a working set `t`.
// Six comparisons are made with checkEqual, labelled in order: "copy ct",
// "operator=", "iterator roundtrip" (element-wise copy), "iterator roundtrip"
// (range-wise copy), "toPattern(false)" and "toPattern(true)".
// NOTE(review): the declaration of `t` and the copy/assignment statements are
// not shown here -- confirm.
// NOTE(review): both iterator passes log the same label, so a failure message
// does not say whether the range or the element path failed.
1527 * Basic consistency check for a few items.
1528 * That the iterator works, and that we can create a pattern and
1529 * get the same thing back
1531 void UnicodeSetTest::checkRoundTrip(const UnicodeSet
& s
) {
1532 UErrorCode ec
= U_ZERO_ERROR
;
1535 checkEqual(s
, t
, "copy ct");
1538 checkEqual(s
, t
, "operator=");
1540 copyWithIterator(t
, s
, FALSE
);
1541 checkEqual(s
, t
, "iterator roundtrip");
1543 copyWithIterator(t
, s
, TRUE
); // try range
1544 checkEqual(s
, t
, "iterator roundtrip");
// Pattern round trip without escaping unprintable characters.
1546 UnicodeString pat
; s
.toPattern(pat
, FALSE
);
1547 t
.applyPattern(pat
, ec
);
1548 if (U_FAILURE(ec
)) {
1549 errln("FAIL: applyPattern");
1552 checkEqual(s
, t
, "toPattern(false)");
// Same round trip with unprintable characters escaped in the pattern.
1555 s
.toPattern(pat
, TRUE
);
1556 t
.applyPattern(pat
, ec
);
1557 if (U_FAILURE(ec
)) {
1558 errln("FAIL: applyPattern");
1561 checkEqual(s
, t
, "toPattern(true)");
// Copies the contents of `s` into `t` by walking a UnicodeSetIterator over s.
// Two traversal styles are present: a range walk (nextRange) that adds each
// string or each [getCodepoint(), getCodepointEnd()] range, and an element
// walk that adds each string or single code point.
// NOTE(review): the branch on `withRange`, the header of the element loop, and
// any clearing of `t` beforehand are not shown here -- confirm.
1565 void UnicodeSetTest::copyWithIterator(UnicodeSet
& t
, const UnicodeSet
& s
, UBool withRange
) {
1567 UnicodeSetIterator
it(s
);
1569 while (it
.nextRange()) {
1570 if (it
.isString()) {
1571 t
.add(it
.getString());
1573 t
.add(it
.getCodepoint(), it
.getCodepointEnd());
1578 if (it
.isString()) {
1579 t
.add(it
.getString());
1581 t
.add(it
.getCodepoint());
// Compares sets `s` and `t` and reports the outcome, returning a UBool.
// Both sets are rendered with toPattern(..., TRUE) so the report can show them;
// a failure is reported with errln and a success with logln, each carrying
// `message` plus the source and result patterns.
// NOTE(review): the equality test and the return statements are not shown
// here -- confirm the comparison is on the sets, not the pattern strings.
1587 UBool
UnicodeSetTest::checkEqual(const UnicodeSet
& s
, const UnicodeSet
& t
, const char* message
) {
1588 UnicodeString source
; s
.toPattern(source
, TRUE
);
1589 UnicodeString result
; t
.toPattern(result
, TRUE
);
1591 errln((UnicodeString
)"FAIL: " + message
1592 + "; source = " + source
1593 + "; result = " + result
1597 logln((UnicodeString
)"Ok: " + message
1598 + "; source = " + source
1599 + "; result = " + result
// Pattern-based overload: builds a UnicodeSet from `pat` and delegates to the
// (set, setName, charsIn, charsOut) overload, using the pattern text as the
// set name. A pattern that fails to parse is reported together with
// u_errorName(ec).
1606 UnicodeSetTest::expectContainment(const UnicodeString
& pat
,
1607 const UnicodeString
& charsIn
,
1608 const UnicodeString
& charsOut
) {
1609 UErrorCode ec
= U_ZERO_ERROR
;
1610 UnicodeSet
set(pat
, ec
);
1611 if (U_FAILURE(ec
)) {
1612 errln((UnicodeString
)"FAIL: pattern \"" +
1613 pat
+ "\" => " + u_errorName(ec
));
1616 expectContainment(set
, pat
, charsIn
, charsOut
);
// Set-based overload without an explicit name: delegates to the
// (set, setName, charsIn, charsOut) overload, passing `pat` as the name.
// NOTE(review): the lines that declare and fill `pat` are not shown here;
// presumably it is the set's own toPattern() text -- confirm.
1620 UnicodeSetTest::expectContainment(const UnicodeSet
& set
,
1621 const UnicodeString
& charsIn
,
1622 const UnicodeString
& charsOut
) {
1625 expectContainment(set
, pat
, charsIn
, charsOut
);
// Core containment check. Every code point of `charsIn` must be contained in
// `set` and every code point of `charsOut` must not be; `setName` only labels
// the messages. Offending code points are collected in `bad`, and each of the
// two passes ends with one errln (if bad is non-empty) or one logln.
// Both strings are walked by code point: char32At(i) reads the code point and
// the index advances by U16_LENGTH(c), so a surrogate pair counts once.
// NOTE(review): the declarations of i, c and bad, and the resetting of bad
// between the two passes, are not shown here -- confirm.
1629 UnicodeSetTest::expectContainment(const UnicodeSet
& set
,
1630 const UnicodeString
& setName
,
1631 const UnicodeString
& charsIn
,
1632 const UnicodeString
& charsOut
) {
// Pass 1: characters that must be members of the set.
1637 for (i
=0; i
<charsIn
.length(); i
+=U16_LENGTH(c
)) {
1638 c
= charsIn
.char32At(i
);
1639 if (!set
.contains(c
)) {
1643 if (bad
.length() > 0) {
1644 errln((UnicodeString
)"Fail: set " + setName
+ " does not contain " + prettify(bad
) +
1645 ", expected containment of " + prettify(charsIn
));
1647 logln((UnicodeString
)"Ok: set " + setName
+ " contains " + prettify(charsIn
));
// Pass 2: characters that must not be members of the set.
1651 for (i
=0; i
<charsOut
.length(); i
+=U16_LENGTH(c
)) {
1652 c
= charsOut
.char32At(i
);
1653 if (set
.contains(c
)) {
1657 if (bad
.length() > 0) {
1658 errln((UnicodeString
)"Fail: set " + setName
+ " contains " + prettify(bad
) +
1659 ", expected non-containment of " + prettify(charsOut
));
1661 logln((UnicodeString
)"Ok: set " + setName
+ " does not contain " + prettify(charsOut
));
// Applies `pattern` to `set` (modifying the caller's set) and checks the
// result in two stages:
//   1. getPairs(set) must equal `expectedPairs`;
//   2. a second set built from set.toPattern() must compare equal to `set`
//      and produce the same pairs string.
// Pairs strings are passed through escape() before being logged.
// NOTE(review): tempset is allocated with `new` below and no matching delete
// is shown here -- confirm it is released on every exit path, including the
// U_FAILURE branch.
1666 UnicodeSetTest::expectPattern(UnicodeSet
& set
,
1667 const UnicodeString
& pattern
,
1668 const UnicodeString
& expectedPairs
){
1669 UErrorCode status
= U_ZERO_ERROR
;
1670 set
.applyPattern(pattern
, status
);
1671 if (U_FAILURE(status
)) {
1672 errln(UnicodeString("FAIL: applyPattern(\"") + pattern
+
1676 if (getPairs(set
) != expectedPairs
) {
1677 errln(UnicodeString("FAIL: applyPattern(\"") + pattern
+
1679 escape(getPairs(set
)) + "\", expected \"" +
1680 escape(expectedPairs
) + "\"");
1682 logln(UnicodeString("Ok: applyPattern(\"") + pattern
+
1684 escape(getPairs(set
)) + "\"");
1687 // the result of calling set.toPattern(), which is the string representation of
1688 // this set(set), is passed to a UnicodeSet constructor, and tested that it
1689 // will produce another set that is equal to this one.
1690 UnicodeString temppattern
;
1691 set
.toPattern(temppattern
);
1692 UnicodeSet
*tempset
=new UnicodeSet(temppattern
, status
);
1693 if (U_FAILURE(status
)) {
1694 errln(UnicodeString("FAIL: applyPattern(\""+ pattern
+ "\").toPattern() => " + temppattern
+ " => invalid pattern"));
// Equality is checked twice: with the set comparison operator and by
// comparing the pairs strings.
1697 if(*tempset
!= set
|| getPairs(*tempset
) != getPairs(set
)){
1698 errln(UnicodeString("FAIL: applyPattern(\""+ pattern
+ "\").toPattern() => " + temppattern
+ " => pairs \""+ escape(getPairs(*tempset
)) + "\", expected pairs \"" +
1699 escape(getPairs(set
)) + "\""));
1701 logln(UnicodeString("Ok: applyPattern(\""+ pattern
+ "\").toPattern() => " + temppattern
+ " => pairs \"" + escape(getPairs(*tempset
)) + "\""));
// Reports an error when the pairs string of `set` (see getPairs) differs from
// `expectedPairs`. Both strings are passed through escape() for the message.
// Nothing is logged in the lines shown when the two strings match.
1709 UnicodeSetTest::expectPairs(const UnicodeSet
& set
, const UnicodeString
& expectedPairs
) {
1710 if (getPairs(set
) != expectedPairs
) {
1711 errln(UnicodeString("FAIL: Expected pair list \"") +
1712 escape(expectedPairs
) + "\", got \"" +
1713 escape(getPairs(set
)) + "\"");
// Checks that set.toPattern(pat, TRUE) yields exactly `expPat`, then checks
// string membership for each entry of `expStrings`, a NULL-terminated array
// that may itself be NULL.
// Each entry is converted with CharsToUnicodeString and tested with
// set.contains(s); the result must match the current expectation `in`.
// The sentinel NOT is recognised by pointer identity, so callers must pass the
// NOT array itself rather than an equal string literal.
// NOTE(review): the declaration of `in` and what happens when the sentinel is
// met are not shown here; presumably entries after NOT are expected to be
// absent from the set -- confirm.
1717 void UnicodeSetTest::expectToPattern(const UnicodeSet
& set
,
1718 const UnicodeString
& expPat
,
1719 const char** expStrings
) {
1721 set
.toPattern(pat
, TRUE
);
1722 if (pat
== expPat
) {
1723 logln((UnicodeString
)"Ok: toPattern() => \"" + pat
+ "\"");
1725 errln((UnicodeString
)"FAIL: toPattern() => \"" + pat
+ "\", expected \"" + expPat
+ "\"");
1728 if (expStrings
== NULL
) {
1732 for (int32_t i
=0; expStrings
[i
] != NULL
; ++i
) {
1733 if (expStrings
[i
] == NOT
) { // sic; pointer comparison
1737 UnicodeString s
= CharsToUnicodeString(expStrings
[i
]);
1738 UBool contained
= set
.contains(s
);
// Success and failure messages share the same wording; only the logln/errln
// channel differs.
1739 if (contained
== in
) {
1740 logln((UnicodeString
)"Ok: " + expPat
+
1741 (contained
? " contains {" : " does not contain {") +
1742 escape(expStrings
[i
]) + "}");
1744 errln((UnicodeString
)"FAIL: " + expPat
+
1745 (contained
? " contains {" : " does not contain {") +
1746 escape(expStrings
[i
]) + "}");
// Returns the UChar for a single hexadecimal digit. Values below 10 are offset
// by 0x30 ('0'); all other values are offset by 0x41 - 10, so 10 maps to 0x41
// ('A') and the digits come out in upper case.
// The argument is not range-checked; callers (see escape) pass one masked and
// shifted nibble at a time.
1751 static UChar
toHexString(int32_t i
) { return (UChar
)(i
+ (i
< 10 ? 0x30 : (0x41 - 10))); }
// Assertion helper: reports "ERROR : <message>" through errln.
// NOTE(review): the test of `condition` is not shown here; presumably the error
// is reported only when condition is FALSE -- confirm.
1754 UnicodeSetTest::doAssert(UBool condition
, const char *message
)
1757 errln(UnicodeString("ERROR : ") + message
);
1762 UnicodeSetTest::escape(const UnicodeString
& s
) {
1764 for (int32_t i
=0; i
<s
.length(); )
1766 UChar32 c
= s
.char32At(i
);
1767 if (0x0020 <= c
&& c
<= 0x007F) {
1771 buf
+= (UChar
)0x5c; buf
+= (UChar
)0x75;
1773 buf
+= (UChar
)0x5c; buf
+= (UChar
)0x55;
1774 buf
+= toHexString((c
& 0xF0000000) >> 28);
1775 buf
+= toHexString((c
& 0x0F000000) >> 24);
1776 buf
+= toHexString((c
& 0x00F00000) >> 20);
1777 buf
+= toHexString((c
& 0x000F0000) >> 16);
1779 buf
+= toHexString((c
& 0xF000) >> 12);
1780 buf
+= toHexString((c
& 0x0F00) >> 8);
1781 buf
+= toHexString((c
& 0x00F0) >> 4);
1782 buf
+= toHexString(c
& 0x000F);