2 **************************************************************************************
3 * Copyright (C) 1999-2006 International Business Machines Corporation and
4 * others. All Rights Reserved.
5 **************************************************************************************
6 * Date Name Description
7 * 10/20/99 alan Creation.
8 * 03/22/2000 Madhu Added additional tests
9 **************************************************************************************
12 #include "unicode/utypes.h"
14 #include "unicode/uniset.h"
15 #include "unicode/uchar.h"
16 #include "unicode/usetiter.h"
17 #include "unicode/ustring.h"
18 #include "unicode/parsepos.h"
19 #include "unicode/symtable.h"
20 #include "unicode/uversion.h"
24 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
25 errln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \
26 u_errorName(status));}}
28 #define TEST_ASSERT(expr) {if (!(expr)) { \
29 errln("fail in file \"%s\", line %d", __FILE__, __LINE__); }}
31 UnicodeString
operator+(const UnicodeString
& left
, const UnicodeSet
& set
) {
34 return left
+ UnicodeSetTest::escape(pat
);
37 #define CASE(id,test) case id: \
41 logln((UnicodeString)""); \
47 UnicodeSetTest::runIndexedTest(int32_t index
, UBool exec
,
48 const char* &name
, char* /*par*/) {
49 // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
52 CASE(1,TestAddRemove
);
53 CASE(2,TestCategories
);
54 CASE(3,TestCloneEqualHash
);
55 CASE(4,TestMinimalRep
);
57 CASE(6,TestScriptSet
);
58 CASE(7,TestPropertySet
);
60 CASE(9,TestExhaustive
);
61 CASE(10,TestToPattern
);
65 CASE(14,TestCloseOver
);
66 CASE(15,TestEscapePattern
);
67 CASE(16,TestInvalidCodePoint
);
68 CASE(17,TestSymbolTable
);
69 CASE(18,TestSurrogate
);
70 CASE(19,TestPosixClasses
);
71 CASE(20,TestIteration
);
72 default: name
= ""; break;
76 static const char NOT
[] = "%%%%";
79 * UVector was improperly copying contents
80 * This code will crash if this is still true
82 void UnicodeSetTest::Testj2268() {
84 t
.add(UnicodeString("abc"));
86 UnicodeString ustrPat
;
87 test
.toPattern(ustrPat
, TRUE
);
93 void UnicodeSetTest::TestToPattern() {
94 UErrorCode ec
= U_ZERO_ERROR
;
96 // Test that toPattern() round trips with syntax characters and
99 static const char* OTHER_TOPATTERN_TESTS
[] = {
100 "[[:latin:]&[:greek:]]",
101 "[[:latin:]-[:greek:]]",
102 "[:nonspacing mark:]",
106 for (int32_t j
=0; OTHER_TOPATTERN_TESTS
[j
]!=NULL
; ++j
) {
108 UnicodeSet
s(OTHER_TOPATTERN_TESTS
[j
], ec
);
110 errln((UnicodeString
)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS
[j
]);
113 checkPat(OTHER_TOPATTERN_TESTS
[j
], s
);
116 for (UChar32 i
= 0; i
<= 0x10FFFF; ++i
) {
117 if ((i
<= 0xFF && !u_isalpha(i
)) || u_isspace(i
)) {
119 // check various combinations to make sure they all work.
120 if (i
!= 0 && !toPatternAux(i
, i
)){
123 if (!toPatternAux(0, i
)){
126 if (!toPatternAux(i
, 0xFFFF)){
133 // Test pattern behavior of multicharacter strings.
136 UnicodeSet
* s
= new UnicodeSet("[a-z {aa} {ab}]", ec
);
138 // This loop isn't a loop. It's here to make the compiler happy.
139 // If you're curious, try removing it and changing the 'break'
140 // statements (except for the last) to goto's.
142 if (U_FAILURE(ec
)) break;
143 const char* exp1
[] = {"aa", "ab", NOT
, "ac", NULL
};
144 expectToPattern(*s
, "[a-z{aa}{ab}]", exp1
);
147 const char* exp2
[] = {"aa", "ab", "ac", NOT
, "xy", NULL
};
148 expectToPattern(*s
, "[a-z{aa}{ab}{ac}]", exp2
);
150 s
->applyPattern("[a-z {\\{l} {r\\}}]", ec
);
151 if (U_FAILURE(ec
)) break;
152 const char* exp3
[] = {"{l", "r}", NOT
, "xy", NULL
};
153 expectToPattern(*s
, "[a-z{r\\}}{\\{l}]", exp3
);
156 const char* exp4
[] = {"{l", "r}", "[]", NOT
, "xy", NULL
};
157 expectToPattern(*s
, "[a-z{\\[\\]}{r\\}}{\\{l}]", exp4
);
159 s
->applyPattern("[a-z {\\u4E01\\u4E02}{\\n\\r}]", ec
);
160 if (U_FAILURE(ec
)) break;
161 const char* exp5
[] = {"\\u4E01\\u4E02", "\n\r", NULL
};
162 expectToPattern(*s
, "[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]", exp5
);
166 s
->add(UnicodeString("abc", ""));
167 s
->add(UnicodeString("abc", ""));
168 const char* exp6
[] = {"abc", NOT
, "ab", NULL
};
169 expectToPattern(*s
, "[{abc}]", exp6
);
174 if (U_FAILURE(ec
)) errln("FAIL: pattern parse error");
178 // JB#3400: For 2 character ranges prefer [ab] to [a-b]
180 s
.add((UChar
)97, (UChar
)98); // 'a', 'b'
181 expectToPattern(s
, "[ab]", NULL
);
184 UBool
UnicodeSetTest::toPatternAux(UChar32 start
, UChar32 end
) {
186 // use Integer.toString because Utility.hex doesn't handle ints
187 UnicodeString pat
= "";
188 // TODO do these in hex
189 //String source = "0x" + Integer.toString(start,16).toUpperCase();
190 //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
191 UnicodeString source
;
192 source
= source
+ (uint32_t)start
;
194 source
= source
+ ".." + (uint32_t)end
;
196 testSet
.add(start
, end
);
197 return checkPat(source
, testSet
);
200 UBool
UnicodeSetTest::checkPat(const UnicodeString
& source
,
201 const UnicodeSet
& testSet
) {
202 // What we want to make sure of is that a pattern generated
203 // by toPattern(), with or without escaped unprintables, can
204 // be passed back into the UnicodeSet constructor.
207 testSet
.toPattern(pat0
, TRUE
);
209 if (!checkPat(source
+ " (escaped)", testSet
, pat0
)) return FALSE
;
211 //String pat1 = unescapeLeniently(pat0);
212 //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
215 testSet
.toPattern(pat2
, FALSE
);
216 if (!checkPat(source
, testSet
, pat2
)) return FALSE
;
218 //String pat3 = unescapeLeniently(pat2);
219 // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
221 //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
222 logln((UnicodeString
)source
+ " => " + pat0
+ ", " + pat2
);
// Round-trip check for one pattern string: builds a second UnicodeSet from
// pat and reports an error through errln when it is not equal to testSet.
//   source  - label used only in the failure message
//   testSet - the set that produced pat
//   pat     - the pattern text to parse back
226 UBool
UnicodeSetTest::checkPat(const UnicodeString
& source
,
227 const UnicodeSet
& testSet
,
228 const UnicodeString
& pat
) {
229 UErrorCode ec
= U_ZERO_ERROR
;
// NOTE(review): ec is not inspected between this constructor call and the
// comparison below, so a pattern that fails to parse is only caught
// indirectly through the inequality check - confirm that is intended.
230 UnicodeSet
testSet2(pat
, ec
);
231 if (testSet2
!= testSet
) {
232 errln((UnicodeString
)"Fail toPattern: " + source
+ " => " + pat
);
239 UnicodeSetTest::TestPatterns(void) {
241 expectPattern(set
, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km");
242 expectPattern(set
, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz");
243 expectPattern(set
, UnicodeString("[a\\-z]", ""), "--aazz");
244 expectPattern(set
, UnicodeString("[-az]", ""), "--aazz");
245 expectPattern(set
, UnicodeString("[az-]", ""), "--aazz");
246 expectPattern(set
, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
248 // Throw in a test of complement
251 exp
.append((UChar
)0x0000).append("aeeoouu").append((UChar
)(0x007a+1)).append((UChar
)0xFFFF);
252 expectPairs(set
, exp
);
256 UnicodeSetTest::TestCategories(void) {
257 UErrorCode status
= U_ZERO_ERROR
;
258 const char* pat
= " [:Lu:] "; // Whitespace ok outside [:..:]
259 UnicodeSet
set(pat
, status
);
260 if (U_FAILURE(status
)) {
261 errln((UnicodeString
)"Fail: Can't construct set with " + pat
);
263 expectContainment(set
, pat
, "ABC", "abc");
267 int32_t failures
= 0;
268 // Make sure generation of L doesn't pollute cached Lu set
269 // First generate L, then Lu
270 set
.applyPattern("[:L:]", status
);
271 if (U_FAILURE(status
)) { errln("FAIL"); return; }
272 for (i
=0; i
<0x200; ++i
) {
273 UBool l
= u_isalpha((UChar
)i
);
274 if (l
!= set
.contains(i
)) {
275 errln((UnicodeString
)"FAIL: L contains " + (unsigned short)i
+ " = " +
277 if (++failures
== 10) break;
281 set
.applyPattern("[:Lu:]", status
);
282 if (U_FAILURE(status
)) { errln("FAIL"); return; }
283 for (i
=0; i
<0x200; ++i
) {
284 UBool lu
= (u_charType((UChar
)i
) == U_UPPERCASE_LETTER
);
285 if (lu
!= set
.contains(i
)) {
286 errln((UnicodeString
)"FAIL: Lu contains " + (unsigned short)i
+ " = " +
288 if (++failures
== 20) break;
293 UnicodeSetTest::TestCloneEqualHash(void) {
294 UErrorCode status
= U_ZERO_ERROR
;
295 // set1 and set2 used to be built with the obsolete constructor taking
296 // UCharCategory values; replaced with pattern constructors
298 UnicodeSet
*set1
=new UnicodeSet("\\p{Lowercase Letter}", status
); // :Ll: Letter, lowercase
299 UnicodeSet
*set1a
=new UnicodeSet("[:Ll:]", status
); // Letter, lowercase
300 if (U_FAILURE(status
)){
301 errln((UnicodeString
)"FAIL: Can't construst set with category->Ll");
304 UnicodeSet
*set2
=new UnicodeSet("\\p{Decimal Number}", status
); //Number, Decimal digit
305 UnicodeSet
*set2a
=new UnicodeSet("[:Nd:]", status
); //Number, Decimal digit
306 if (U_FAILURE(status
)){
307 errln((UnicodeString
)"FAIL: Can't construct set with category->Nd");
311 if (*set1
!= *set1a
) {
312 errln("FAIL: category constructor for Ll broken");
314 if (*set2
!= *set2a
) {
315 errln("FAIL: category constructor for Nd broken");
320 logln("Testing copy construction");
321 UnicodeSet
*set1copy
=new UnicodeSet(*set1
);
322 if(*set1
!= *set1copy
|| *set1
== *set2
||
323 getPairs(*set1
) != getPairs(*set1copy
) ||
324 set1
->hashCode() != set1copy
->hashCode()){
325 errln("FAIL : Error in copy construction");
329 logln("Testing =operator");
330 UnicodeSet set1equal
=*set1
;
331 UnicodeSet set2equal
=*set2
;
332 if(set1equal
!= *set1
|| set1equal
!= *set1copy
|| set2equal
!= *set2
||
333 set2equal
== *set1
|| set2equal
== *set1copy
|| set2equal
== set1equal
){
334 errln("FAIL: Error in =operator");
337 logln("Testing clone()");
338 UnicodeSet
*set1clone
=(UnicodeSet
*)set1
->clone();
339 UnicodeSet
*set2clone
=(UnicodeSet
*)set2
->clone();
340 if(*set1clone
!= *set1
|| *set1clone
!= *set1copy
|| *set1clone
!= set1equal
||
341 *set2clone
!= *set2
|| *set2clone
== *set1copy
|| *set2clone
!= set2equal
||
342 *set2clone
== *set1
|| *set2clone
== set1equal
|| *set2clone
== *set1clone
){
343 errln("FAIL: Error in clone");
346 logln("Testing hashcode");
347 if(set1
->hashCode() != set1equal
.hashCode() || set1
->hashCode() != set1clone
->hashCode() ||
348 set2
->hashCode() != set2equal
.hashCode() || set2
->hashCode() != set2clone
->hashCode() ||
349 set1copy
->hashCode() != set1equal
.hashCode() || set1copy
->hashCode() != set1clone
->hashCode() ||
350 set1
->hashCode() == set2
->hashCode() || set1copy
->hashCode() == set2
->hashCode() ||
351 set2
->hashCode() == set1clone
->hashCode() || set2
->hashCode() == set1equal
.hashCode() ){
352 errln("FAIL: Error in hashCode()");
// Exercises range add/remove on a UnicodeSet and then whole-set operations
// against set2 and set3. After each step the contents are checked with
// expectPairs / expectPattern and the element count with size().
// The expected strings list range endpoints in pairs: alqz means a-l plus q-z.
364 UnicodeSetTest::TestAddRemove(void) {
365 UnicodeSet set
; // Construct empty set
366 doAssert(set
.isEmpty() == TRUE
, "set should be empty");
367 doAssert(set
.size() == 0, "size should be 0");
// 0x110000 is the count of all code points 0..0x10FFFF.
369 doAssert(set
.size() == 0x110000, "size should be 0x110000");
// Start from a-z (26 elements) and carve ranges out of it.
371 set
.add(0x0061, 0x007a);
372 expectPairs(set
, "az");
373 doAssert(set
.isEmpty() == FALSE
, "set should not be empty");
374 doAssert(set
.size() != 0, "size should not be equal to 0");
375 doAssert(set
.size() == 26, "size should be equal to 26");
// Remove m-p: 26 - 4 = 22.
376 set
.remove(0x006d, 0x0070);
377 expectPairs(set
, "alqz");
378 doAssert(set
.size() == 22, "size should be equal to 22");
// Remove e-g: 22 - 3 = 19.
379 set
.remove(0x0065, 0x0067);
380 expectPairs(set
, "adhlqz");
381 doAssert(set
.size() == 19, "size should be equal to 19");
// Remove d-i; only d, h and i are still present, so 19 - 3 = 16.
382 set
.remove(0x0064, 0x0069);
383 expectPairs(set
, "acjlqz");
384 doAssert(set
.size() == 16, "size should be equal to 16");
// Remove c-r; this drops c, j-l, q and r, so 16 - 6 = 10.
385 set
.remove(0x0063, 0x0072);
386 expectPairs(set
, "absz");
387 doAssert(set
.size() == 10, "size should be equal to 10");
// Add f-q back: 10 + 12 = 22.
388 set
.add(0x0066, 0x0071);
389 expectPairs(set
, "abfqsz");
390 doAssert(set
.size() == 22, "size should be equal to 22");
391 set
.remove(0x0061, 0x0067);
392 expectPairs(set
, "hqsz");
// Removing the whole a-z range leaves the set empty again.
393 set
.remove(0x0061, 0x007a);
394 expectPairs(set
, "");
395 doAssert(set
.isEmpty() == TRUE
, "set should be empty");
396 doAssert(set
.size() == 0, "size should be 0");
398 doAssert(set
.isEmpty() == FALSE
, "set should not be empty");
// NOTE(review): the message text below says the size should NOT be equal to 1,
// but the condition asserts size() == 1. Assuming doAssert prints the message
// when the condition is false, the wording is inverted relative to the check.
399 doAssert(set
.size() == 1, "size should not be equal to 1");
402 expectPairs(set
, "ac");
// NOTE(review): same inverted wording - the check is size() == 3.
403 doAssert(set
.size() == 3, "size should not be equal to 3");
406 expectPairs(set
, "acpq");
// NOTE(review): same inverted wording - the check is size() == 5.
407 doAssert(set
.size() == 5, "size should not be equal to 5");
409 expectPairs(set
, "");
410 doAssert(set
.isEmpty() == TRUE
, "set should be empty");
411 doAssert(set
.size() == 0, "size should be 0");
413 // Try removing an entire set from another set
414 expectPattern(set
, "[c-x]", "cx");
416 expectPattern(set2
, "[f-ky-za-bc[vw]]", "acfkvwyz");
418 expectPairs(set
, "deluxx");
420 // Try adding an entire set to another set
421 expectPattern(set
, "[jackiemclean]", "aacceein");
422 expectPattern(set2
, "[hitoshinamekatajamesanderson]", "aadehkmort");
424 expectPairs(set
, "aacehort");
425 doAssert(set
.containsAll(set2
) == TRUE
, "set should contain all the elements in set2");
427 // Try retaining a set of elements contained in another set (intersection)
429 expectPattern(set3
, "[a-c]", "ac");
430 doAssert(set
.containsAll(set3
) == FALSE
, "set doesn't contain all the elements in set3");
432 expectPairs(set3
, "aacc");
433 doAssert(set
.containsAll(set3
) == TRUE
, "set should contain all the elements in set3");
435 expectPairs(set
, "aacc");
436 doAssert(set
.size() == set3
.size(), "set.size() should be set3.size()");
437 doAssert(set
.containsAll(set3
) == TRUE
, "set should contain all the elements in set3");
439 doAssert(set
.size() != set3
.size(), "set.size() != set3.size()");
441 // Test commutativity
// Same two patterns as the add-an-entire-set step above, with the roles of
// set and set2 swapped; the expected result aacehort is the same.
442 expectPattern(set
, "[hitoshinamekatajamesanderson]", "aadehkmort");
443 expectPattern(set2
, "[jackiemclean]", "aacceein");
445 expectPairs(set
, "aacehort");
446 doAssert(set
.containsAll(set2
) == TRUE
, "set should contain all the elements in set2");
454 * Make sure minimal representation is maintained.
456 void UnicodeSetTest::TestMinimalRep() {
457 UErrorCode status
= U_ZERO_ERROR
;
458 // This is pretty thoroughly tested by checkCanonicalRep()
459 // run against the exhaustive operation results. Use the code
460 // here for debugging specific spot problems.
462 // 1 overlap against 2
463 UnicodeSet
set("[h-km-q]", status
);
464 if (U_FAILURE(status
)) { errln("FAIL"); return; }
465 UnicodeSet
set2("[i-o]", status
);
466 if (U_FAILURE(status
)) { errln("FAIL"); return; }
468 expectPairs(set
, "hq");
470 set
.applyPattern("[a-m]", status
);
471 if (U_FAILURE(status
)) { errln("FAIL"); return; }
472 set2
.applyPattern("[e-o]", status
);
473 if (U_FAILURE(status
)) { errln("FAIL"); return; }
475 expectPairs(set
, "ao");
477 set
.applyPattern("[e-o]", status
);
478 if (U_FAILURE(status
)) { errln("FAIL"); return; }
479 set2
.applyPattern("[a-m]", status
);
480 if (U_FAILURE(status
)) { errln("FAIL"); return; }
482 expectPairs(set
, "ao");
483 // 1 overlap against 3
484 set
.applyPattern("[a-eg-mo-w]", status
);
485 if (U_FAILURE(status
)) { errln("FAIL"); return; }
486 set2
.applyPattern("[d-q]", status
);
487 if (U_FAILURE(status
)) { errln("FAIL"); return; }
489 expectPairs(set
, "aw");
492 void UnicodeSetTest::TestAPI() {
493 UErrorCode status
= U_ZERO_ERROR
;
496 if (!set
.isEmpty() || set
.getRangeCount() != 0) {
497 errln((UnicodeString
)"FAIL, set should be empty but isn't: " +
501 // clear(), isEmpty()
504 errln((UnicodeString
)"FAIL, set shouldn't be empty but is: " +
508 if (!set
.isEmpty()) {
509 errln((UnicodeString
)"FAIL, set should be empty but isn't: " +
515 if (set
.size() != 0) {
516 errln((UnicodeString
)"FAIL, size should be 0, but is " + set
.size() +
520 if (set
.size() != 1) {
521 errln((UnicodeString
)"FAIL, size should be 1, but is " + set
.size() +
524 set
.add(0x0031, 0x0039);
525 if (set
.size() != 10) {
526 errln((UnicodeString
)"FAIL, size should be 10, but is " + set
.size() +
530 // contains(first, last)
532 set
.applyPattern("[A-Y 1-8 b-d l-y]", status
);
533 if (U_FAILURE(status
)) { errln("FAIL"); return; }
534 for (int32_t i
= 0; i
<set
.getRangeCount(); ++i
) {
535 UChar32 a
= set
.getRangeStart(i
);
536 UChar32 b
= set
.getRangeEnd(i
);
537 if (!set
.contains(a
, b
)) {
538 errln((UnicodeString
)"FAIL, should contain " + (unsigned short)a
+ '-' + (unsigned short)b
+
539 " but doesn't: " + set
);
541 if (set
.contains((UChar32
)(a
-1), b
)) {
542 errln((UnicodeString
)"FAIL, shouldn't contain " +
543 (unsigned short)(a
-1) + '-' + (unsigned short)b
+
544 " but does: " + set
);
546 if (set
.contains(a
, (UChar32
)(b
+1))) {
547 errln((UnicodeString
)"FAIL, shouldn't contain " +
548 (unsigned short)a
+ '-' + (unsigned short)(b
+1) +
549 " but does: " + set
);
553 // Ported InversionList test.
554 UnicodeSet
a((UChar32
)3,(UChar32
)10);
555 UnicodeSet
b((UChar32
)7,(UChar32
)15);
558 logln((UnicodeString
)"a [3-10]: " + a
);
559 logln((UnicodeString
)"b [7-15]: " + b
);
562 UnicodeSet
exp((UChar32
)3,(UChar32
)15);
564 logln((UnicodeString
)"c.set(a).add(b): " + c
);
566 errln((UnicodeString
)"FAIL: c.set(a).add(b) = " + c
+ ", expect " + exp
);
569 exp
.set((UChar32
)0, (UChar32
)2);
570 exp
.add((UChar32
)16, UnicodeSet::MAX_VALUE
);
572 logln((UnicodeString
)"c.complement(): " + c
);
574 errln((UnicodeString
)"FAIL: c.complement() = " + c
+ ", expect " + exp
);
577 exp
.set((UChar32
)3, (UChar32
)15);
579 logln((UnicodeString
)"c.complement(): " + c
);
581 errln((UnicodeString
)"FAIL: c.complement() = " + c
+ ", expect " + exp
);
585 exp
.set((UChar32
)3,(UChar32
)6);
586 exp
.add((UChar32
)11,(UChar32
) 15);
588 logln((UnicodeString
)"c.set(a).exclusiveOr(b): " + c
);
590 errln((UnicodeString
)"FAIL: c.set(a).exclusiveOr(b) = " + c
+ ", expect " + exp
);
594 bitsToSet(setToBits(c
), c
);
596 logln((UnicodeString
)"bitsToSet(setToBits(c)): " + c
);
598 errln((UnicodeString
)"FAIL: bitsToSet(setToBits(c)) = " + c
+ ", expect " + exp
);
601 // Additional tests for coverage JB#2118
602 //UnicodeSet::complement(class UnicodeString const &)
603 //UnicodeSet::complementAll(class UnicodeString const &)
604 //UnicodeSet::containsNone(class UnicodeSet const &)
605 //UnicodeSet::containsNone(long,long)
606 //UnicodeSet::containsSome(class UnicodeSet const &)
607 //UnicodeSet::containsSome(long,long)
608 //UnicodeSet::removeAll(class UnicodeString const &)
609 //UnicodeSet::retain(long)
610 //UnicodeSet::retainAll(class UnicodeString const &)
611 //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
612 //UnicodeSetIterator::getString(void)
614 set
.complement("ab");
615 exp
.applyPattern("[{ab}]", status
);
616 if (U_FAILURE(status
)) { errln("FAIL"); return; }
617 if (set
!= exp
) { errln("FAIL: complement(\"ab\")"); return; }
619 UnicodeSetIterator
iset(set
);
620 if (!iset
.next() || !iset
.isString()) {
621 errln("FAIL: UnicodeSetIterator::next/isString");
622 } else if (iset
.getString() != "ab") {
623 errln("FAIL: UnicodeSetIterator::getString");
626 set
.add((UChar32
)0x61, (UChar32
)0x7A);
627 set
.complementAll("alan");
628 exp
.applyPattern("[{ab}b-kmo-z]", status
);
629 if (U_FAILURE(status
)) { errln("FAIL"); return; }
630 if (set
!= exp
) { errln("FAIL: complementAll(\"alan\")"); return; }
632 exp
.applyPattern("[a-z]", status
);
633 if (U_FAILURE(status
)) { errln("FAIL"); return; }
634 if (set
.containsNone(exp
)) { errln("FAIL: containsNone(UnicodeSet)"); }
635 if (!set
.containsSome(exp
)) { errln("FAIL: containsSome(UnicodeSet)"); }
636 exp
.applyPattern("[aln]", status
);
637 if (U_FAILURE(status
)) { errln("FAIL"); return; }
638 if (!set
.containsNone(exp
)) { errln("FAIL: containsNone(UnicodeSet)"); }
639 if (set
.containsSome(exp
)) { errln("FAIL: containsSome(UnicodeSet)"); }
641 if (set
.containsNone((UChar32
)0x61, (UChar32
)0x7A)) {
642 errln("FAIL: containsNone(UChar32, UChar32)");
644 if (!set
.containsSome((UChar32
)0x61, (UChar32
)0x7A)) {
645 errln("FAIL: containsSome(UChar32, UChar32)");
647 if (!set
.containsNone((UChar32
)0x41, (UChar32
)0x5A)) {
648 errln("FAIL: containsNone(UChar32, UChar32)");
650 if (set
.containsSome((UChar32
)0x41, (UChar32
)0x5A)) {
651 errln("FAIL: containsSome(UChar32, UChar32)");
654 set
.removeAll("liu");
655 exp
.applyPattern("[{ab}b-hj-kmo-tv-z]", status
);
656 if (U_FAILURE(status
)) { errln("FAIL"); return; }
657 if (set
!= exp
) { errln("FAIL: removeAll(\"liu\")"); return; }
659 set
.retainAll("star");
660 exp
.applyPattern("[rst]", status
);
661 if (U_FAILURE(status
)) { errln("FAIL"); return; }
662 if (set
!= exp
) { errln("FAIL: retainAll(\"star\")"); return; }
664 set
.retain((UChar32
)0x73);
665 exp
.applyPattern("[s]", status
);
666 if (U_FAILURE(status
)) { errln("FAIL"); return; }
667 if (set
!= exp
) { errln("FAIL: retain('s')"); return; }
670 int32_t slen
= set
.serialize(buf
, sizeof(buf
)/sizeof(buf
[0]), status
);
671 if (U_FAILURE(status
)) { errln("FAIL: serialize"); return; }
672 if (slen
!= 3 || buf
[0] != 2 || buf
[1] != 0x73 || buf
[2] != 0x74) {
673 errln("FAIL: serialize");
678 void UnicodeSetTest::TestIteration() {
679 UErrorCode ec
= U_ZERO_ERROR
;
683 // 6 code points, 3 ranges, 2 strings, 8 total elements
684 // Iteration will access them in sorted order - a, b, c, y, z, U0001abcd, "str1", "str2"
685 UnicodeSet
set("[zabyc\\U0001abcd{str1}{str2}]", ec
);
686 TEST_ASSERT_SUCCESS(ec
);
687 UnicodeSetIterator
it(set
);
689 for (outerLoop
=0; outerLoop
<3; outerLoop
++) {
690 // Run the test multiple times, to check that iterator.reset() is working.
691 for (i
=0; i
<10; i
++) {
692 UBool nextv
= it
.next();
693 UBool isString
= it
.isString();
694 int32_t codePoint
= it
.getCodepoint();
695 //int32_t codePointEnd = it.getCodepointEnd();
696 UnicodeString s
= it
.getString();
699 TEST_ASSERT(nextv
== TRUE
);
700 TEST_ASSERT(isString
== FALSE
);
701 TEST_ASSERT(codePoint
==0x61);
702 TEST_ASSERT(s
== "a");
705 TEST_ASSERT(nextv
== TRUE
);
706 TEST_ASSERT(isString
== FALSE
);
707 TEST_ASSERT(codePoint
==0x62);
708 TEST_ASSERT(s
== "b");
711 TEST_ASSERT(nextv
== TRUE
);
712 TEST_ASSERT(isString
== FALSE
);
713 TEST_ASSERT(codePoint
==0x63);
714 TEST_ASSERT(s
== "c");
717 TEST_ASSERT(nextv
== TRUE
);
718 TEST_ASSERT(isString
== FALSE
);
719 TEST_ASSERT(codePoint
==0x79);
720 TEST_ASSERT(s
== "y");
723 TEST_ASSERT(nextv
== TRUE
);
724 TEST_ASSERT(isString
== FALSE
);
725 TEST_ASSERT(codePoint
==0x7a);
726 TEST_ASSERT(s
== "z");
729 TEST_ASSERT(nextv
== TRUE
);
730 TEST_ASSERT(isString
== FALSE
);
731 TEST_ASSERT(codePoint
==0x1abcd);
732 TEST_ASSERT(s
== UnicodeString((UChar32
)0x1abcd));
735 TEST_ASSERT(nextv
== TRUE
);
736 TEST_ASSERT(isString
== TRUE
);
737 TEST_ASSERT(s
== "str1");
740 TEST_ASSERT(nextv
== TRUE
);
741 TEST_ASSERT(isString
== TRUE
);
742 TEST_ASSERT(s
== "str2");
745 TEST_ASSERT(nextv
== FALSE
);
748 TEST_ASSERT(nextv
== FALSE
);
752 it
.reset(); // prepare to run the iteration again.
// Compares sets built through the programmatic API (createFrom, createFromAll,
// add, retain, complement) with sets built from pattern strings, pair by pair.
759 void UnicodeSetTest::TestStrings() {
760 UErrorCode ec
= U_ZERO_ERROR
;
// Entries come in pairs: testList[i] is built through the API and
// testList[i+1] from a pattern; the loop below compares the two.
762 UnicodeSet
* testList
[] = {
763 UnicodeSet::createFromAll("abc"),
764 new UnicodeSet("[a-c]", ec
),
766 &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
767 new UnicodeSet("[{ll}{ch}a-z]", ec
),
769 UnicodeSet::createFrom("ab}c"),
770 new UnicodeSet("[{ab\\}c}]", ec
),
772 &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
773 new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec
),
779 errln("FAIL: couldn't construct test sets");
// Walk the table two entries at a time until the NULL terminator.
782 for (int32_t i
= 0; testList
[i
] != NULL
; i
+=2) {
784 UnicodeString pat0
, pat1
;
785 testList
[i
]->toPattern(pat0
, TRUE
);
786 testList
[i
+1]->toPattern(pat1
, TRUE
);
787 if (*testList
[i
] == *testList
[i
+1]) {
788 logln((UnicodeString
)"Ok: " + pat0
+ " == " + pat1
);
// NOTE(review): the FAIL message below goes through logln, while other
// failures in this file use errln. If logln is informational only, a
// mismatching pair would not fail the test - presumably errln was intended;
// confirm.
790 logln((UnicodeString
)"FAIL: " + pat0
+ " != " + pat1
);
794 delete testList
[i
+1];
799 * Test the [:Latin:] syntax.
801 void UnicodeSetTest::TestScriptSet() {
802 expectContainment("[:Latin:]", "aA", CharsToUnicodeString("\\u0391\\u03B1"));
804 expectContainment("[:Greek:]", CharsToUnicodeString("\\u0391\\u03B1"), "aA");
807 expectContainment("[[:Common:][:Inherited:]]", CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
812 * Test property-based patterns (\p{...}, \P{...}, [:...:], \N{...}) and related set syntax.
// Table-driven containment test: each pattern in DATA is passed to
// expectContainment together with a string of characters expected to be in
// the set and a string of characters expected not to be in it.
814 void UnicodeSetTest::TestPropertySet() {
815 static const char* DATA
[] = {
816 // Pattern, Chars IN, Chars NOT in
826 "\\P{ GENERAL Category = upper case letter }",
830 // Combining class: @since ICU 2.2
831 // Check both symbolic and numeric
836 "\\p{Canonical Combining Class = 11}",
840 "[:c c c = iota subscript :]",
844 // Bidi class: @since ICU 2.2
845 "\\p{bidiclass=lefttoright}",
849 // Binary properties: @since ICU 2.2
856 // weiv: )(and * were removed from math in Unicode 4.0.1
860 // JB#1767 \N{}, \p{ASCII}
865 "[\\N{ latin small letter a }[:name= latin small letter z:]]",
881 "\\u03D8\\u03D9", // 3.2
884 "\\u1800\\u3400\\U0002f800",
885 "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
887 // JB#2350: Case_Sensitive
888 "[:Case Sensitive:]",
889 "A\\u1FFC\\U00010410",
890 ";\\u00B4\\U00010500",
892 // JB#2832: C99-compatibility props
899 " \\u0003\\u0007\\u0009\\u000A\\u000D",
902 "!@#%&*()[]{}-_\\/;:,.?'\"",
909 // Regex compatibility test
910 "[-b]", // leading '-' is literal
914 "[^-b]", // leading '-' is literal
918 "[b-]", // trailing '-' is literal
922 "[^b-]", // trailing '-' is literal
926 "[a-b-]", // trailing '-' is literal
930 "[[a-q]&[p-z]-]", // trailing '-' is literal
934 "[\\s|\\)|:|$|\\>]", // from regex tests
938 "[\\uDC00cd]", // JB#2906: isolated trail at start
940 "ab\\uD800\\U00010000",
942 "[ab\\uD800]", // JB#2906: isolated lead at end
944 "cd\\uDC00\\U00010000",
946 "[ab\\uD800cd]", // JB#2906: isolated lead in middle
948 "ef\\uDC00\\U00010000",
950 "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
952 "ef\\uD800\\U00010000",
954 "[:^lccc=0:]", // Lead canonical class
956 "abcd\\u00c0\\u00c5",
958 "[:^tccc=0:]", // Trail canonical class
959 "\\u0300\\u0301\\u00c0\\u00c5",
962 "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
963 "\\u0300\\u0301\\u00c0\\u00c5",
966 "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
968 "abcd\\u0300\\u0301\\u00c0\\u00c5",
970 "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
971 "\\u0F73\\u0F75\\u0F81",
972 "abcd\\u0300\\u0301\\u00c0\\u00c5",
975 "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
976 "\\u0888\\uFDD3\\uFFFE\\U00050005"
// Element count of DATA, computed from the array size.
979 static const int32_t DATA_LEN
= sizeof(DATA
)/sizeof(DATA
[0]);
// Consume the table three entries at a time: pattern, chars in, chars not in.
// NOTE(review): the loop reads DATA[i+1] and DATA[i+2] with only i checked
// against DATA_LEN, so the table length must stay a multiple of 3.
981 for (int32_t i
=0; i
<DATA_LEN
; i
+=3) {
982 expectContainment(DATA
[i
], CharsToUnicodeString(DATA
[i
+1]),
983 CharsToUnicodeString(DATA
[i
+2]));
988 * Test that Posix style character classes [:digit:], etc.
989 * have the Unicode definitions from TR 18.
991 void UnicodeSetTest::TestPosixClasses() {
993 UErrorCode status
= U_ZERO_ERROR
;
994 UnicodeSet
s1("[:alpha:]", status
);
995 UnicodeSet
s2("\\p{Alphabetic}", status
);
996 TEST_ASSERT_SUCCESS(status
);
1000 UErrorCode status
= U_ZERO_ERROR
;
1001 UnicodeSet
s1("[:lower:]", status
);
1002 UnicodeSet
s2("\\p{lowercase}", status
);
1003 TEST_ASSERT_SUCCESS(status
);
1004 TEST_ASSERT(s1
==s2
);
1007 UErrorCode status
= U_ZERO_ERROR
;
1008 UnicodeSet
s1("[:upper:]", status
);
1009 UnicodeSet
s2("\\p{Uppercase}", status
);
1010 TEST_ASSERT_SUCCESS(status
);
1011 TEST_ASSERT(s1
==s2
);
1014 UErrorCode status
= U_ZERO_ERROR
;
1015 UnicodeSet
s1("[:punct:]", status
);
1016 UnicodeSet
s2("\\p{gc=Punctuation}", status
);
1017 TEST_ASSERT_SUCCESS(status
);
1018 TEST_ASSERT(s1
==s2
);
1021 UErrorCode status
= U_ZERO_ERROR
;
1022 UnicodeSet
s1("[:digit:]", status
);
1023 UnicodeSet
s2("\\p{gc=DecimalNumber}", status
);
1024 TEST_ASSERT_SUCCESS(status
);
1025 TEST_ASSERT(s1
==s2
);
1028 UErrorCode status
= U_ZERO_ERROR
;
1029 UnicodeSet
s1("[:xdigit:]", status
);
1030 UnicodeSet
s2("[\\p{DecimalNumber}\\p{HexDigit}]", status
);
1031 TEST_ASSERT_SUCCESS(status
);
1032 TEST_ASSERT(s1
==s2
);
1035 UErrorCode status
= U_ZERO_ERROR
;
1036 UnicodeSet
s1("[:alnum:]", status
);
1037 UnicodeSet
s2("[\\p{Alphabetic}\\p{DecimalNumber}]", status
);
1038 TEST_ASSERT_SUCCESS(status
);
1039 TEST_ASSERT(s1
==s2
);
1042 UErrorCode status
= U_ZERO_ERROR
;
1043 UnicodeSet
s1("[:space:]", status
);
1044 UnicodeSet
s2("\\p{Whitespace}", status
);
1045 TEST_ASSERT_SUCCESS(status
);
1046 TEST_ASSERT(s1
==s2
);
1049 UErrorCode status
= U_ZERO_ERROR
;
1050 UnicodeSet
s1("[:blank:]", status
);
1051 TEST_ASSERT_SUCCESS(status
);
1052 UnicodeSet
s2("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]",
1054 TEST_ASSERT_SUCCESS(status
);
1055 TEST_ASSERT(s1
==s2
);
1058 UErrorCode status
= U_ZERO_ERROR
;
1059 UnicodeSet
s1("[:cntrl:]", status
);
1060 TEST_ASSERT_SUCCESS(status
);
1061 UnicodeSet
s2("\\p{Control}", status
);
1062 TEST_ASSERT_SUCCESS(status
);
1063 TEST_ASSERT(s1
==s2
);
1066 UErrorCode status
= U_ZERO_ERROR
;
1067 UnicodeSet
s1("[:graph:]", status
);
1068 TEST_ASSERT_SUCCESS(status
);
1069 UnicodeSet
s2("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]", status
);
1070 TEST_ASSERT_SUCCESS(status
);
1071 TEST_ASSERT(s1
==s2
);
1074 UErrorCode status
= U_ZERO_ERROR
;
1075 UnicodeSet
s1("[:print:]", status
);
1076 TEST_ASSERT_SUCCESS(status
);
1077 UnicodeSet
s2("[[:graph:][:blank:]-[\\p{Control}]]" ,status
);
1078 TEST_ASSERT_SUCCESS(status
);
1079 TEST_ASSERT(s1
==s2
);
1083 * Test cloning of UnicodeSet. For C++, we test the copy constructor.
1085 void UnicodeSetTest::TestClone() {
1086 UErrorCode ec
= U_ZERO_ERROR
;
1087 UnicodeSet
s("[abcxyz]", ec
);
1089 expectContainment(t
, "abc", "def");
1093 * Test the indexOf() and charAt() methods.
1095 void UnicodeSetTest::TestIndexOf() {
1096 UErrorCode ec
= U_ZERO_ERROR
;
1097 UnicodeSet
set("[a-cx-y3578]", ec
);
1098 if (U_FAILURE(ec
)) {
1099 errln("FAIL: UnicodeSet constructor");
1102 for (int32_t i
=0; i
<set
.size(); ++i
) {
1103 UChar32 c
= set
.charAt(i
);
1104 if (set
.indexOf(c
) != i
) {
1105 errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1106 i
, c
, set
.indexOf(c
));
1109 UChar32 c
= set
.charAt(set
.size());
1111 errln("FAIL: charAt(<out of range>) = %X", c
);
1113 int32_t j
= set
.indexOf((UChar32
)0x71/*'q'*/);
1115 errln((UnicodeString
)"FAIL: indexOf('q') = " + j
);
// Tests case closure on UnicodeSet in three parts.
// Part 1 walks the DATA table in steps of three (selector, input pattern,
// expected pattern): the input is applied to s, s.closeOver(selector) is
// called, the expected pattern is applied to t, and the outcome is either
// logged as Ok or reported as FAIL.
// NOTE(review): the comparison between s and t is presumably an equality
// check on the two sets - confirm.
// Part 2 iterates over the [:case_sensitive:] set, calls closeOver(USET_CASE)
// and reports a failure that prints the old (s) and new (t) results when
// they differ.
// Part 3 exercises the pattern APIs that accept USET_CASE_INSENSITIVE and
// USET_ADD_CASE_MAPPINGS and verifies them through expectContainment().
1122 void UnicodeSetTest::TestCloseOver() {
1123 UErrorCode ec
= U_ZERO_ERROR
;
// CASE and CASE_MAPPINGS are one-element char arrays carrying the closeOver
// selector value; the table loop reads only element [0] of the selector entry.
1125 char CASE
[] = {(char)USET_CASE_INSENSITIVE
};
1126 char CASE_MAPPINGS
[] = {(char)USET_ADD_CASE_MAPPINGS
};
1127 const char* DATA
[] = {
1128 // selector, input, output
1130 "[aq\\u00DF{Bc}{bC}{Fi}]",
1131 "[aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]",
1134 "[\\u01F1]", // 'DZ'
1135 "[\\u01F1\\u01F2\\u01F3]",
1139 "[\\u1FB4{\\u03AC\\u03B9}]",
1145 CASE
, // make sure binary search finds limits
1147 "[aA\\uFF3A\\uFF5A]",
1150 "[a-z]","[A-Za-z\\u017F\\u212A]",
1156 CASE
, "[i]", "[iI]",
1158 CASE
, "[\\u0130]", "[\\u0130{i\\u0307}]", // dotted I
1159 CASE
, "[{i\\u0307}]", "[\\u0130{i\\u0307}]", // i with dot
1161 CASE
, "[\\u0131]", "[\\u0131]", // dotless i
1163 CASE
, "[\\u0390]", "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1165 CASE
, "[\\u03c2]", "[\\u03a3\\u03c2\\u03c3]", // sigmas
1167 CASE
, "[\\u03f2]", "[\\u03f2\\u03f9]", // lunate sigmas
1169 CASE
, "[\\u03f7]", "[\\u03f7\\u03f8]",
1171 CASE
, "[\\u1fe3]", "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1173 CASE
, "[\\ufb05]", "[\\ufb05\\ufb06{st}]",
1174 CASE
, "[{st}]", "[\\ufb05\\ufb06{st}]",
1176 CASE
, "[\\U0001044F]", "[\\U00010427\\U0001044F]",
1178 CASE
, "[{a\\u02BE}]", "[\\u1E9A{a\\u02BE}]", // first in sorted table
1180 CASE
, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1183 "[aq\\u00DF{Bc}{bC}{Fi}]",
1184 "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1187 "[\\u01F1]", // 'DZ'
1188 "[\\u01F1\\u01F2\\u01F3]",
// Part 1: the table is NULL-terminated and consumed three entries at a time.
1200 for (int32_t i
=0; DATA
[i
]!=NULL
; i
+=3) {
// The selector is the first char of the selector entry, widened to int32_t.
1201 int32_t selector
= DATA
[i
][0];
1202 UnicodeString
pat(DATA
[i
+1]);
1203 UnicodeString
exp(DATA
[i
+2]);
1204 s
.applyPattern(pat
, ec
);
1205 s
.closeOver(selector
);
1206 t
.applyPattern(exp
, ec
);
// ec is checked once, after both applyPattern calls have run.
1207 if (U_FAILURE(ec
)) {
1208 errln("FAIL: applyPattern failed");
1212 logln((UnicodeString
)"Ok: " + pat
+ ".closeOver(" + selector
+ ") => " + exp
);
1214 errln((UnicodeString
)"FAIL: " + pat
+ ".closeOver(" + selector
+ ") => " +
1215 s
.toPattern(buf
, TRUE
) + ", expected " + exp
);
1222 * This was used to compare the old implementation (using USET_CASE)
1223 * with the new one (using 0x100 temporarily)
1224 * while transitioning from hardcoded case closure tables in uniset.cpp
1225 * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1226 * and using ucase.c functions for closure.
1227 * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1229 * Note: The old and new implementation never fully matched because
1230 * the old implementation turned out to not map U+0130 and U+0131 correctly
1231 * (dotted I and dotless i) and because the old implementation's data tables
1232 * were outdated compared to Unicode 4.0.1 at the time of the change to the
1233 * new implementation. (So sigmas and some other characters were not handled
1234 * according to the newer Unicode version.)
1236 UnicodeSet
sens("[:case_sensitive:]", ec
), sens2
, s2
;
1237 UnicodeSetIterator
si(sens
);
1238 UnicodeString str
, buf2
;
1239 const UnicodeString
*pStr
;
1242 if(!si
.isString()) {
1243 c
=si
.getCodepoint();
1252 s
.closeOver(USET_CASE
);
1255 errln("FAIL: closeOver(U+%04x) differs: ", c
);
1256 errln((UnicodeString
)"old "+s
.toPattern(buf
, TRUE
)+" new: "+t
.toPattern(buf2
, TRUE
));
1260 // remove all code points
1261 // should contain all full case folding mapping strings
1262 sens2
.remove(0, 0x10ffff);
1266 pStr
=&si
.getString();
1270 s
.closeOver(USET_CASE
);
1273 errln((UnicodeString
)"FAIL: closeOver("+s2
.toPattern(buf
, TRUE
)+") differs: ");
1274 errln((UnicodeString
)"old "+s
.toPattern(buf
, TRUE
)+" new: "+t
.toPattern(buf2
, TRUE
));
1280 // Test the pattern API
1281 s
.applyPattern("[abc]", USET_CASE_INSENSITIVE
, NULL
, ec
);
1282 if (U_FAILURE(ec
)) {
1283 errln("FAIL: applyPattern failed");
1285 expectContainment(s
, "abcABC", "defDEF");
1287 UnicodeSet
v("[^abc]", USET_CASE_INSENSITIVE
, NULL
, ec
);
1288 if (U_FAILURE(ec
)) {
1289 errln("FAIL: constructor failed");
1291 expectContainment(v
, "defDEF", "abcABC");
1293 UnicodeSet
cm("[abck]", USET_ADD_CASE_MAPPINGS
, NULL
, ec
);
1294 if (U_FAILURE(ec
)) {
1295 errln("FAIL: construct w/case mappings failed");
1297 expectContainment(cm
, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
// Checks how UnicodeSet handles a pattern written with backslash escapes.
// The pattern is constructed twice: pass 1 uses the escaped text as is,
// pass 2 first runs UnicodeString::unescape() on it. Construction is expected
// to succeed only on pass 1 (isPatternValid); a mismatch is reported as FAIL.
// For a successfully built set the test then checks one code point, compares
// toPattern(newpat, TRUE) with exp, and formats every range of the set,
// reporting a failure for any range whose start is negative.
1301 void UnicodeSetTest::TestEscapePattern() {
1302 const char pattern
[] =
1303 "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1305 "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1306 // We test this with two passes; in the second pass we
1307 // pre-unescape the pattern. Since U+200E is rule whitespace,
1308 // this fails -- which is what we expect.
1309 for (int32_t pass
=1; pass
<=2; ++pass
) {
1310 UErrorCode ec
= U_ZERO_ERROR
;
1311 UnicodeString
pat(pattern
);
1313 pat
= pat
.unescape();
1315 // Pattern is only good for pass 1
1316 UBool isPatternValid
= (pass
==1);
1318 UnicodeSet
set(pat
, ec
);
1319 if (U_SUCCESS(ec
) != isPatternValid
){
1320 errln((UnicodeString
)"FAIL: applyPattern(" +
1321 escape(pat
) + ") => " +
1325 if (U_FAILURE(ec
)) {
// NOTE(review): the check below tests U+0644 while the failure message names
// U+0664; the two disagree - confirm which code point was intended.
1328 if (set
.contains((UChar
)0x0644)){
1329 errln((UnicodeString
)"FAIL: " + escape(pat
) + " contains(U+0664)");
1332 UnicodeString newpat
;
1333 set
.toPattern(newpat
, TRUE
);
1334 if (newpat
== exp
) {
1335 logln(escape(pat
) + " => " + newpat
);
1337 errln((UnicodeString
)"FAIL: " + escape(pat
) + " => " + newpat
);
// Build a description of each range: the range index is appended as a single
// digit character (0x30 + i), followed by the range bounds both as code
// points and as numbers.
1340 for (int32_t i
=0; i
<set
.getRangeCount(); ++i
) {
1341 UnicodeString
str("Range ");
1342 str
.append((UChar
)(0x30 + i
))
1344 .append((UChar32
)set
.getRangeStart(i
))
1346 .append((UChar32
)set
.getRangeEnd(i
));
1347 str
= str
+ " (" + set
.getRangeStart(i
) + " - " +
1348 set
.getRangeEnd(i
) + ")";
1349 if (set
.getRangeStart(i
) < 0) {
1350 errln((UnicodeString
)"FAIL: " + escape(str
));
// Helper: builds the expected set exp from the range [start, end] and reports
// set against it. On success the label and the pattern of set are logged; on
// failure both the pattern of set and the pattern of exp are printed.
// label - text identifying the operation under test, used in the messages
// set   - the set produced by the operation under test
// start - first code point of the expected range
// end   - last code point of the expected range
// NOTE(review): the condition choosing between logln and errln is presumably
// an equality check of set against exp - confirm.
1358 void UnicodeSetTest::expectRange(const UnicodeString
& label
,
1359 const UnicodeSet
& set
,
1360 UChar32 start
, UChar32 end
) {
1361 UnicodeSet
exp(start
, end
);
1364 logln(label
+ " => " + set
.toPattern(pat
, TRUE
));
1367 errln((UnicodeString
)"FAIL: " + label
+ " => " +
1368 set
.toPattern(pat
, TRUE
) +
1369 ", expected " + exp
.toPattern(xpat
, TRUE
));
// Exercises the UnicodeSet API with out-of-range code points.
// Part 1 uses DATA, a flat table of quadruples (test start, test end,
// expected start, expected end), and feeds each test range to the range
// constructor, set(), contains(), containsNone(), containsSome(), add(),
// retain(), remove() and complement(), checking results through
// expectRange().
// Part 2 uses DATA2, a list of single code points, and checks contains(c) and
// indexOf(c) against the validity predicate 0 <= c <= 0x10FFFF.
1373 void UnicodeSetTest::TestInvalidCodePoint() {
1375 const UChar32 DATA
[] = {
1376 // Test range Expected range
1377 0, 0x10FFFF, 0, 0x10FFFF,
1378 (UChar32
)-1, 8, 0, 8,
1379 8, 0x110000, 8, 0x10FFFF
1381 const int32_t DATA_LENGTH
= sizeof(DATA
)/sizeof(DATA
[0]);
// Four table entries are consumed per iteration.
1386 for (i
=0; i
<DATA_LENGTH
; i
+=4) {
1387 UChar32 start
= DATA
[i
];
1388 UChar32 end
= DATA
[i
+1];
1389 UChar32 xstart
= DATA
[i
+2];
1390 UChar32 xend
= DATA
[i
+3];
1392 // Try various API using the test code points
1394 UnicodeSet
set(start
, end
);
1395 expectRange((UnicodeString
)"ct(" + start
+ "," + end
+ ")",
1399 set
.set(start
, end
);
1400 expectRange((UnicodeString
)"set(" + start
+ "," + end
+ ")",
// The query results are stored in b but no check on b is visible here;
// presumably these calls only need to complete without crashing - confirm.
1403 UBool b
= set
.contains(start
);
1404 b
= set
.contains(start
, end
);
1405 b
= set
.containsNone(start
, end
);
1406 b
= set
.containsSome(start
, end
);
1408 /*int32_t index = set.indexOf(start);*/
1412 set
.add(start
, end
);
1413 expectRange((UnicodeString
)"add(" + start
+ "," + end
+ ")",
1416 set
.set(0, 0x10FFFF);
1417 set
.retain(start
, end
);
1418 expectRange((UnicodeString
)"retain(" + start
+ "," + end
+ ")",
1422 set
.set(0, 0x10FFFF);
1424 set
.remove(start
, end
);
1426 expectRange((UnicodeString
)"!remove(" + start
+ "," + end
+ ")",
1429 set
.set(0, 0x10FFFF);
1430 set
.complement(start
, end
);
1432 expectRange((UnicodeString
)"!complement(" + start
+ "," + end
+ ")",
1434 set
.complement(start
);
1437 const UChar32 DATA2
[] = {
1443 const int32_t DATA2_LENGTH
= sizeof(DATA2
)/sizeof(DATA2
[0]);
1445 for (i
=0; i
<DATA2_LENGTH
; ++i
) {
1446 UChar32 c
= DATA2
[i
], end
= 0x10FFFF;
// valid is true exactly for code points inside the Unicode range.
1447 UBool valid
= (c
>= 0 && c
<= 0x10FFFF);
1449 UnicodeSet
set(0, 0x10FFFF);
1451 // For single-codepoint contains, invalid codepoints are NOT contained
1452 UBool b
= set
.contains(c
);
1454 logln((UnicodeString
)"[\\u0000-\\U0010FFFF].contains(" + c
+
1457 errln((UnicodeString
)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c
+
1461 // For codepoint range contains, containsNone, and containsSome,
1462 // invalid or empty (start > end) ranges have UNDEFINED behavior.
1463 b
= set
.contains(c
, end
);
1464 logln((UnicodeString
)"* [\\u0000-\\U0010FFFF].contains(" + c
+
1465 "," + end
+ ") = " + b
);
1467 b
= set
.containsNone(c
, end
);
1468 logln((UnicodeString
)"* [\\u0000-\\U0010FFFF].containsNone(" + c
+
1469 "," + end
+ ") = " + b
);
1471 b
= set
.containsSome(c
, end
);
1472 logln((UnicodeString
)"* [\\u0000-\\U0010FFFF].containsSome(" + c
+
1473 "," + end
+ ") = " + b
);
// indexOf is expected to return a non-negative index exactly when c is valid.
1475 int32_t index
= set
.indexOf(c
);
1476 if ((index
>= 0) == valid
) {
1477 logln((UnicodeString
)"[\\u0000-\\U0010FFFF].indexOf(" + c
+
1480 errln((UnicodeString
)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c
+
1486 // Used by TestSymbolTable
// Minimal SymbolTable implementation used as a test fixture. Variable values
// are kept in the contents table: add() stores them, lookup() retrieves them,
// and parseReference() scans a variable name out of a pattern string.
1487 class TokenSymbolTable
: public SymbolTable
{
// Constructs the contents table and registers uhash_deleteUnicodeString as
// its value deleter; ec receives any construction error.
1491 TokenSymbolTable(UErrorCode
& ec
) : contents(FALSE
, ec
) {
1492 contents
.setValueDeleter(uhash_deleteUnicodeString
);
1495 ~TokenSymbolTable() {}
1498 * (Non-SymbolTable API) Add the given variable and value to
1499 * the table. Variable should NOT contain leading '$'.
// A heap-allocated copy of value is stored under var, and only when ec does
// not already hold a failure. Presumably the registered value deleter frees
// that copy - confirm.
1501 void add(const UnicodeString
& var
, const UnicodeString
& value
,
1503 if (U_SUCCESS(ec
)) {
1504 contents
.put(var
, new UnicodeString(value
), ec
);
// Returns whatever contents.get(s) yields, cast to a const UnicodeString
// pointer.
1511 virtual const UnicodeString
* lookup(const UnicodeString
& s
) const {
1512 return (const UnicodeString
*) contents
.get(s
);
1518 virtual const UnicodeFunctor
* lookupMatcher(UChar32
/*ch*/) const {
// Scans text starting at pos.getIndex(). Scanning stops at a character that
// fails u_isIDPart, or at a first character that fails u_isIDStart. An empty
// string is returned when no valid name characters were found; otherwise the
// text between start and the stopping index is extracted into result.
1525 virtual UnicodeString
parseReference(const UnicodeString
& text
,
1526 ParsePosition
& pos
, int32_t limit
) const {
1527 int32_t start
= pos
.getIndex();
1529 UnicodeString result
;
1531 UChar c
= text
.charAt(i
);
1532 if ((i
==start
&& !u_isIDStart(c
)) || !u_isIDPart(c
)) {
1537 if (i
== start
) { // No valid name chars
1538 return result
; // Indicate failure with empty string
1541 text
.extractBetween(start
, i
, result
);
// Verifies that UnicodeSet resolves variable references through a
// SymbolTable. Each test case in DATA lists variable/value pairs followed by
// an input pattern and the expected output pattern, terminated by NULL.
// For every case a fresh TokenSymbolTable is filled, a set is parsed from the
// input pattern with that table, the parse position is required to reach the
// end of the input, and the result is reported against a set built from the
// expected pattern.
1546 void UnicodeSetTest::TestSymbolTable() {
1547 // Multiple test cases can be set up here. Each test case
1548 // is terminated by null:
1549 // var, value, var, value,..., input pat., exp. output pat., null
1550 const char* DATA
[] = {
1551 "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL
,
1552 "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL
,
1553 "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL
,
1557 for (int32_t i
=0; DATA
[i
]!=NULL
; ++i
) {
1558 UErrorCode ec
= U_ZERO_ERROR
;
1559 TokenSymbolTable
sym(ec
);
1560 if (U_FAILURE(ec
)) {
1561 errln("FAIL: couldn't construct TokenSymbolTable");
// Entries are treated as var/value pairs for as long as the entry two slots
// ahead is non-NULL, which leaves the last two entries before the terminator
// as the input and expected patterns.
1566 while (DATA
[i
+2] != NULL
) {
1567 sym
.add(DATA
[i
], DATA
[i
+1], ec
);
1568 if (U_FAILURE(ec
)) {
1569 errln("FAIL: couldn't add to TokenSymbolTable");
1575 // Input pattern and expected output pattern
1576 UnicodeString inpat
= DATA
[i
], exppat
= DATA
[i
+1];
1579 ParsePosition
pos(0);
1580 UnicodeSet
us(inpat
, pos
, USET_IGNORE_SPACE
, &sym
, ec
);
1581 if (U_FAILURE(ec
)) {
1582 errln("FAIL: couldn't construct UnicodeSet");
// The parser must have consumed the whole input pattern.
1587 if (pos
.getIndex() != inpat
.length()) {
1588 errln((UnicodeString
)"Failed to read to end of string \""
1589 + inpat
+ "\": read to "
1590 + pos
.getIndex() + ", length is "
1594 UnicodeSet
us2(exppat
, ec
);
1595 if (U_FAILURE(ec
)) {
1596 errln("FAIL: couldn't construct expected UnicodeSet");
1602 errln((UnicodeString
)"Failed, got " + us
.toPattern(a
, TRUE
) +
1603 ", expected " + us2
.toPattern(b
, TRUE
));
1605 logln((UnicodeString
)"Ok, got " + us
.toPattern(a
, TRUE
));
// Verifies that a surrogate pair written as two escapes in a pattern is
// treated as one supplementary code point. For each pattern in DATA the set
// must contain a, b, c and U+10000, must not contain the lone surrogates
// U+D800 and U+DC00 (nor the separator character used in the string), and
// must have size 4.
1610 void UnicodeSetTest::TestSurrogate() {
1611 const char* DATA
[] = {
1612 // These should all behave identically
1613 "[abc\\uD800\\uDC00]",
1614 // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
// The table is terminated by a null entry.
1618 for (int i
=0; DATA
[i
] != 0; ++i
) {
1619 UErrorCode ec
= U_ZERO_ERROR
;
1620 logln((UnicodeString
)"Test pattern " + i
+ " :" + DATA
[i
]);
1621 UnicodeSet
set(DATA
[i
], ec
);
1622 if (U_FAILURE(ec
)) {
1623 errln("FAIL: UnicodeSet constructor");
1626 expectContainment(set
,
1627 CharsToUnicodeString("abc\\U00010000"),
1628 CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1629 if (set
.size() != 4) {
1630 errln((UnicodeString
)"FAIL: " + DATA
[i
] + ".size() == " +
1631 set
.size() + ", expected 4");
// Exhaustive set-algebra test driven by small integers used as bitmasks.
// For every i below limit the complement helper and a pattern/iterator
// round trip are run; for every pair (i, j) below limit the add, xor, retain
// and remove helpers are run. The sets x, y, z and aa are scratch objects
// passed to the helpers.
1636 void UnicodeSetTest::TestExhaustive() {
1637 // exhaustive tests. Simulate UnicodeSets with integers.
1638 // That gives us very solid tests (except for large memory tests).
1640 int32_t limit
= 128;
1642 UnicodeSet x
, y
, z
, aa
;
1644 for (int32_t i
= 0; i
< limit
; ++i
) {
1646 logln((UnicodeString
)"Testing " + i
+ ", " + x
);
1647 _testComplement(i
, x
, y
);
1649 // AS LONG AS WE ARE HERE, check roundtrip
1650 checkRoundTrip(bitsToSet(i
, aa
));
// Binary operations are checked for every ordered pair (i, j).
1652 for (int32_t j
= 0; j
< limit
; ++j
) {
1653 _testAdd(i
,j
, x
,y
,z
);
1654 _testXor(i
,j
, x
,y
,z
);
1655 _testRetain(i
,j
, x
,y
,z
);
1656 _testRemove(i
,j
, x
,y
,z
);
// Helper for TestExhaustive: checks the complement operation for bitmask a.
// The result set z is converted back to a bitmask c with setToBits(), any
// mismatch is reported both as sets and as integers, and z is finally checked
// with checkCanonicalRep().
// NOTE(review): the failure messages are labelled add: although this helper
// reports on complement - looks like a copy/paste leftover; confirm.
1661 void UnicodeSetTest::_testComplement(int32_t a
, UnicodeSet
& x
, UnicodeSet
& z
) {
1665 int32_t c
= setToBits(z
);
1667 errln((UnicodeString
)"FAILED: add: ~" + x
+ " != " + z
);
1668 errln((UnicodeString
)"FAILED: add: ~" + a
+ " != " + c
);
1670 checkCanonicalRep(z
, (UnicodeString
)"complement " + a
);
// Helper for TestExhaustive: checks set union for bitmasks a and b.
// The result set z is converted back to a bitmask c with setToBits(); a
// mismatch is reported both as sets (x | y versus z) and as integers
// (a | b versus c), and z is finally checked with checkCanonicalRep().
// NOTE(review): the expected value is presumably (a | b), matching the
// failure text - confirm.
1673 void UnicodeSetTest::_testAdd(int32_t a
, int32_t b
, UnicodeSet
& x
, UnicodeSet
& y
, UnicodeSet
& z
) {
1678 int32_t c
= setToBits(z
);
1680 errln((UnicodeString
)"FAILED: add: " + x
+ " | " + y
+ " != " + z
);
1681 errln((UnicodeString
)"FAILED: add: " + a
+ " | " + b
+ " != " + c
);
1683 checkCanonicalRep(z
, (UnicodeString
)"add " + a
+ "," + b
);
// Helper for TestExhaustive: checks set intersection for bitmasks a and b.
// The result set z is converted back to a bitmask c with setToBits(); a
// mismatch is reported both as sets (x & y versus z) and as integers
// (a & b versus c), and z is finally checked with checkCanonicalRep().
// NOTE(review): the expected value is presumably (a & b), matching the
// failure text - confirm.
1686 void UnicodeSetTest::_testRetain(int32_t a
, int32_t b
, UnicodeSet
& x
, UnicodeSet
& y
, UnicodeSet
& z
) {
1691 int32_t c
= setToBits(z
);
1693 errln((UnicodeString
)"FAILED: retain: " + x
+ " & " + y
+ " != " + z
);
1694 errln((UnicodeString
)"FAILED: retain: " + a
+ " & " + b
+ " != " + c
);
1696 checkCanonicalRep(z
, (UnicodeString
)"retain " + a
+ "," + b
);
// Helper for TestExhaustive: checks set difference for bitmasks a and b.
// The result set z is converted back to a bitmask c with setToBits() and must
// equal a AND NOT b; a mismatch is reported both as sets and as integers.
// z is finally checked with checkCanonicalRep().
1699 void UnicodeSetTest::_testRemove(int32_t a
, int32_t b
, UnicodeSet
& x
, UnicodeSet
& y
, UnicodeSet
& z
) {
1704 int32_t c
= setToBits(z
);
1705 if (c
!= (a
&~ b
)) {
1706 errln((UnicodeString
)"FAILED: remove: " + x
+ " &~ " + y
+ " != " + z
);
1707 errln((UnicodeString
)"FAILED: remove: " + a
+ " &~ " + b
+ " != " + c
);
1709 checkCanonicalRep(z
, (UnicodeString
)"remove " + a
+ "," + b
);
// Helper for TestExhaustive: checks symmetric difference for bitmasks a and b.
// The result set z is converted back to a bitmask c with setToBits(); a
// mismatch is reported both as sets (x ^ y versus z) and as integers
// (a ^ b versus c), and z is finally checked with checkCanonicalRep().
// NOTE(review): the failure messages and the checkCanonicalRep label say
// complement, which is also what _testComplement uses for its label, so the
// two helpers are hard to tell apart in the output - confirm whether a
// distinct xor label was intended.
1712 void UnicodeSetTest::_testXor(int32_t a
, int32_t b
, UnicodeSet
& x
, UnicodeSet
& y
, UnicodeSet
& z
) {
1717 int32_t c
= setToBits(z
);
1719 errln((UnicodeString
)"FAILED: complement: " + x
+ " ^ " + y
+ " != " + z
);
1720 errln((UnicodeString
)"FAILED: complement: " + a
+ " ^ " + b
+ " != " + c
);
1722 checkCanonicalRep(z
, (UnicodeString
)"complement " + a
+ "," + b
);
1726 * Check that ranges are monotonically increasing and non-
// Validates the range list of set and reports problems with msg as context.
// Three conditions are reported: a range count that should be >= 0 but is
// not, a range whose start is greater than its end, and a range (other than
// the first) whose start is not greater than last.
// set - the set whose ranges are inspected
// msg - description of the operation that produced set, used in messages
// NOTE(review): last is presumably the end of the previously visited range -
// confirm.
1729 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet
& set
, const UnicodeString
& msg
) {
1730 int32_t n
= set
.getRangeCount();
1732 errln((UnicodeString
)"FAIL result of " + msg
+
1733 ": range count should be >= 0 but is " +
1734 n
/*+ " for " + set.toPattern())*/);
1738 for (int32_t i
=0; i
<n
; ++i
) {
1739 UChar32 start
= set
.getRangeStart(i
);
1740 UChar32 end
= set
.getRangeEnd(i
);
// Ranges are numbered from 1 in the messages, hence i+1.
1742 errln((UnicodeString
)"FAIL result of " + msg
+
1743 ": range " + (i
+1) +
1744 " start > end: " + (int)start
+ ", " + (int)end
+
1747 if (i
> 0 && start
<= last
) {
1748 errln((UnicodeString
)"FAIL result of " + msg
+
1749 ": range " + (i
+1) +
1750 " overlaps previous range: " + (int)start
+ ", " + (int)end
+
1758 * Convert a bitmask to a UnicodeSet.
// Converts the bitmask a into the UnicodeSet result and returns a reference
// to a UnicodeSet. The loop visits bit positions 0 through 31 and tests each
// bit of a; presumably code point i is added to result when bit i is set -
// confirm.
// NOTE(review): at i == 31 the expression 1<<i shifts into the sign bit of
// int, which is not portable; an unsigned literal would avoid that - confirm.
1760 UnicodeSet
& UnicodeSetTest::bitsToSet(int32_t a
, UnicodeSet
& result
) {
1762 for (UChar32 i
= 0; i
< 32; ++i
) {
1763 if ((a
& (1<<i
)) != 0) {
1771 * Convert a UnicodeSet to a bitmask. Only the characters
1772 * U+0000 to U+001F are represented in the bitmask.
// Converts the set x into a 32-bit mask. The loop probes code points 0
// through 31 with x.contains(); presumably bit i of the returned value is set
// when code point i is contained - confirm.
1774 int32_t UnicodeSetTest::setToBits(const UnicodeSet
& x
) {
1776 for (int32_t i
= 0; i
< 32; ++i
) {
1777 if (x
.contains((UChar32
)i
)) {
1785 * Return the representation of an inversion list based UnicodeSet
1786 * as a pairs list. Ranges are listed in ascending Unicode order.
1787 * For example, the set [a-zA-M3] is represented as "33AMaz".
// Builds the pairs string for set: for every range, in index order, the range
// start and range end are appended to pairs as UChar values.
// NOTE(review): the (UChar) casts narrow UChar32 values, so bounds above
// U+FFFF cannot be represented as is; the assignment that forces i to the
// range count presumably ends the loop once such a range is met - confirm.
1789 UnicodeString
UnicodeSetTest::getPairs(const UnicodeSet
& set
) {
1790 UnicodeString pairs
;
1791 for (int32_t i
=0; i
<set
.getRangeCount(); ++i
) {
1792 UChar32 start
= set
.getRangeStart(i
);
1793 UChar32 end
= set
.getRangeEnd(i
);
1796 i
= set
.getRangeCount(); // Should be unnecessary
1798 pairs
.append((UChar
)start
).append((UChar
)end
);
1804 * Basic consistency check for a few items.
1805 * That the iterator works, and that we can create a pattern and
1806 * get the same thing back
// Checks that several ways of copying the set s reproduce it exactly.
// Each copy t is compared with s through checkEqual() under a label naming
// the mechanism: copy constructor, operator=, iterator copy without ranges,
// iterator copy with ranges, and a pattern round trip through toPattern()
// and applyPattern() with escapeUnprintable FALSE and then TRUE.
1808 void UnicodeSetTest::checkRoundTrip(const UnicodeSet
& s
) {
1809 UErrorCode ec
= U_ZERO_ERROR
;
1812 checkEqual(s
, t
, "copy ct");
1815 checkEqual(s
, t
, "operator=");
1817 copyWithIterator(t
, s
, FALSE
);
1818 checkEqual(s
, t
, "iterator roundtrip");
1820 copyWithIterator(t
, s
, TRUE
); // try range
1821 checkEqual(s
, t
, "iterator roundtrip");
// Pattern round trip, first without and then with escaping requested.
1823 UnicodeString pat
; s
.toPattern(pat
, FALSE
);
1824 t
.applyPattern(pat
, ec
);
1825 if (U_FAILURE(ec
)) {
1826 errln("FAIL: applyPattern");
1829 checkEqual(s
, t
, "toPattern(false)");
1832 s
.toPattern(pat
, TRUE
);
1833 t
.applyPattern(pat
, ec
);
1834 if (U_FAILURE(ec
)) {
1835 errln("FAIL: applyPattern");
1838 checkEqual(s
, t
, "toPattern(true)");
// Copies the contents of s into t by walking s with a UnicodeSetIterator.
// t         - destination set; elements are added to it
// s         - source set
// withRange - selects the iteration style
// Two code paths are visible: one driven by nextRange() that adds whole
// ranges (getCodepoint() to getCodepointEnd()), and one that adds single code
// points. In both, string elements are added through getString().
// NOTE(review): withRange presumably selects the nextRange() path - confirm.
1842 void UnicodeSetTest::copyWithIterator(UnicodeSet
& t
, const UnicodeSet
& s
, UBool withRange
) {
1844 UnicodeSetIterator
it(s
);
1846 while (it
.nextRange()) {
1847 if (it
.isString()) {
1848 t
.add(it
.getString());
1850 t
.add(it
.getCodepoint(), it
.getCodepointEnd());
1855 if (it
.isString()) {
1856 t
.add(it
.getString());
1858 t
.add(it
.getCodepoint());
// Reports whether the sets s and t agree, using message as context.
// Both sets are rendered with toPattern(..., TRUE) into source and result;
// a failure is reported with errln and success with logln, each showing the
// message and both patterns. The function returns a UBool.
// NOTE(review): the comparison that selects the branch is presumably s
// against t, with the return value reflecting it - confirm.
1864 UBool
UnicodeSetTest::checkEqual(const UnicodeSet
& s
, const UnicodeSet
& t
, const char* message
) {
1865 UnicodeString source
; s
.toPattern(source
, TRUE
);
1866 UnicodeString result
; t
.toPattern(result
, TRUE
);
1868 errln((UnicodeString
)"FAIL: " + message
1869 + "; source = " + source
1870 + "; result = " + result
1874 logln((UnicodeString
)"Ok: " + message
1875 + "; source = " + source
1876 + "; result = " + result
// Overload taking a pattern: builds a UnicodeSet from pat, reports a failure
// with the error name when construction fails, and otherwise delegates to the
// four-argument overload using pat as the set name.
// pat      - pattern to parse
// charsIn  - characters expected to be contained
// charsOut - characters expected not to be contained
1883 UnicodeSetTest::expectContainment(const UnicodeString
& pat
,
1884 const UnicodeString
& charsIn
,
1885 const UnicodeString
& charsOut
) {
1886 UErrorCode ec
= U_ZERO_ERROR
;
1887 UnicodeSet
set(pat
, ec
);
1888 if (U_FAILURE(ec
)) {
1889 errln((UnicodeString
)"FAIL: pattern \"" +
1890 pat
+ "\" => " + u_errorName(ec
));
1893 expectContainment(set
, pat
, charsIn
, charsOut
);
// Overload taking a set without a name: delegates to the four-argument
// overload, passing pat as the set name.
// NOTE(review): pat is presumably obtained from set.toPattern() - confirm.
1897 UnicodeSetTest::expectContainment(const UnicodeSet
& set
,
1898 const UnicodeString
& charsIn
,
1899 const UnicodeString
& charsOut
) {
1902 expectContainment(set
, pat
, charsIn
, charsOut
);
// Core containment check. Walks charsIn code point by code point and looks
// for characters missing from set, then walks charsOut and looks for
// characters that are present in set. Offending characters are gathered in
// bad (presumably appended inside the two conditions - confirm); a non-empty
// bad produces a failure message, otherwise success is logged.
// set      - the set under test
// setName  - label for set used in messages
// charsIn  - characters expected to be contained
// charsOut - characters expected not to be contained
1906 UnicodeSetTest::expectContainment(const UnicodeSet
& set
,
1907 const UnicodeString
& setName
,
1908 const UnicodeString
& charsIn
,
1909 const UnicodeString
& charsOut
) {
// The index advances by U16_LENGTH(c) so that supplementary code points,
// which occupy two code units, are visited once.
1914 for (i
=0; i
<charsIn
.length(); i
+=U16_LENGTH(c
)) {
1915 c
= charsIn
.char32At(i
);
1916 if (!set
.contains(c
)) {
1920 if (bad
.length() > 0) {
1921 errln((UnicodeString
)"Fail: set " + setName
+ " does not contain " + prettify(bad
) +
1922 ", expected containment of " + prettify(charsIn
));
1924 logln((UnicodeString
)"Ok: set " + setName
+ " contains " + prettify(charsIn
));
1928 for (i
=0; i
<charsOut
.length(); i
+=U16_LENGTH(c
)) {
1929 c
= charsOut
.char32At(i
);
1930 if (set
.contains(c
)) {
1934 if (bad
.length() > 0) {
1935 errln((UnicodeString
)"Fail: set " + setName
+ " contains " + prettify(bad
) +
1936 ", expected non-containment of " + prettify(charsOut
));
1938 logln((UnicodeString
)"Ok: set " + setName
+ " does not contain " + prettify(charsOut
));
// Applies pattern to set and verifies the resulting range pairs.
// set           - set that receives the pattern (modified)
// pattern       - pattern to apply
// expectedPairs - expected result of getPairs(set)
// After the pairs check, set.toPattern() is fed to a new UnicodeSet and that
// set must equal set both by operator!= and by getPairs().
// NOTE(review): tempset is allocated with new; verify that a matching delete
// runs on every path, including the early exit after a construction failure.
1943 UnicodeSetTest::expectPattern(UnicodeSet
& set
,
1944 const UnicodeString
& pattern
,
1945 const UnicodeString
& expectedPairs
){
1946 UErrorCode status
= U_ZERO_ERROR
;
1947 set
.applyPattern(pattern
, status
);
1948 if (U_FAILURE(status
)) {
1949 errln(UnicodeString("FAIL: applyPattern(\"") + pattern
+
1953 if (getPairs(set
) != expectedPairs
) {
1954 errln(UnicodeString("FAIL: applyPattern(\"") + pattern
+
1956 escape(getPairs(set
)) + "\", expected \"" +
1957 escape(expectedPairs
) + "\"");
1959 logln(UnicodeString("Ok: applyPattern(\"") + pattern
+
1961 escape(getPairs(set
)) + "\"");
1964 // the result of calling set.toPattern(), which is the string representation of
1965 // this set(set), is passed to a UnicodeSet constructor, and tested that it
1966 // will produce another set that is equal to this one.
1967 UnicodeString temppattern
;
1968 set
.toPattern(temppattern
);
1969 UnicodeSet
*tempset
=new UnicodeSet(temppattern
, status
);
1970 if (U_FAILURE(status
)) {
1971 errln(UnicodeString("FAIL: applyPattern(\""+ pattern
+ "\").toPattern() => " + temppattern
+ " => invalid pattern"));
1974 if(*tempset
!= set
|| getPairs(*tempset
) != getPairs(set
)){
1975 errln(UnicodeString("FAIL: applyPattern(\""+ pattern
+ "\").toPattern() => " + temppattern
+ " => pairs \""+ escape(getPairs(*tempset
)) + "\", expected pairs \"" +
1976 escape(getPairs(set
)) + "\""));
1978 logln(UnicodeString("Ok: applyPattern(\""+ pattern
+ "\").toPattern() => " + temppattern
+ " => pairs \"" + escape(getPairs(*tempset
)) + "\""));
// Reports a failure when getPairs(set) differs from expectedPairs; the
// message shows both pair lists after passing them through escape().
// set           - the set under test
// expectedPairs - expected pairs string
1986 UnicodeSetTest::expectPairs(const UnicodeSet
& set
, const UnicodeString
& expectedPairs
) {
1987 if (getPairs(set
) != expectedPairs
) {
1988 errln(UnicodeString("FAIL: Expected pair list \"") +
1989 escape(expectedPairs
) + "\", got \"" +
1990 escape(getPairs(set
)) + "\"");
// Checks the pattern produced by set and, optionally, string containment.
// set        - the set under test
// expPat     - expected result of set.toPattern(pat, TRUE)
// expStrings - NULL, or a NULL-terminated list of strings to test
// Each listed string is converted with CharsToUnicodeString() and its
// containment in set is compared with the flag in. An entry that is the NOT
// sentinel is recognised by pointer identity rather than by content.
// NOTE(review): the NOT sentinel presumably flips in for the entries that
// follow it - confirm.
1994 void UnicodeSetTest::expectToPattern(const UnicodeSet
& set
,
1995 const UnicodeString
& expPat
,
1996 const char** expStrings
) {
1998 set
.toPattern(pat
, TRUE
);
1999 if (pat
== expPat
) {
2000 logln((UnicodeString
)"Ok: toPattern() => \"" + pat
+ "\"");
2002 errln((UnicodeString
)"FAIL: toPattern() => \"" + pat
+ "\", expected \"" + expPat
+ "\"");
2005 if (expStrings
== NULL
) {
2009 for (int32_t i
=0; expStrings
[i
] != NULL
; ++i
) {
2010 if (expStrings
[i
] == NOT
) { // sic; pointer comparison
2014 UnicodeString s
= CharsToUnicodeString(expStrings
[i
]);
2015 UBool contained
= set
.contains(s
);
2016 if (contained
== in
) {
2017 logln((UnicodeString
)"Ok: " + expPat
+
2018 (contained
? " contains {" : " does not contain {") +
2019 escape(expStrings
[i
]) + "}");
2021 errln((UnicodeString
)"FAIL: " + expPat
+
2022 (contained
? " contains {" : " does not contain {") +
2023 escape(expStrings
[i
]) + "}");
2028 static UChar
toHexString(int32_t i
) { return (UChar
)(i
+ (i
< 10 ? 0x30 : (0x41 - 10))); }
// Reports message through errln with an ERROR prefix.
// condition - the asserted condition
// message   - text to report
// NOTE(review): the report is presumably issued only when condition is
// false - confirm.
2031 UnicodeSetTest::doAssert(UBool condition
, const char *message
)
2034 errln(UnicodeString("ERROR : ") + message
);
2039 UnicodeSetTest::escape(const UnicodeString
& s
) {
2041 for (int32_t i
=0; i
<s
.length(); )
2043 UChar32 c
= s
.char32At(i
);
2044 if (0x0020 <= c
&& c
<= 0x007F) {
2048 buf
+= (UChar
)0x5c; buf
+= (UChar
)0x75;
2050 buf
+= (UChar
)0x5c; buf
+= (UChar
)0x55;
2051 buf
+= toHexString((c
& 0xF0000000) >> 28);
2052 buf
+= toHexString((c
& 0x0F000000) >> 24);
2053 buf
+= toHexString((c
& 0x00F00000) >> 20);
2054 buf
+= toHexString((c
& 0x000F0000) >> 16);
2056 buf
+= toHexString((c
& 0xF000) >> 12);
2057 buf
+= toHexString((c
& 0x0F00) >> 8);
2058 buf
+= toHexString((c
& 0x00F0) >> 4);
2059 buf
+= toHexString(c
& 0x000F);