1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 ********************************************************************************
5 * Copyright (C) 1999-2016 International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************************
8 * Date Name Description
9 * 10/20/99 alan Creation.
10 * 03/22/2000 Madhu Added additional tests
11 ********************************************************************************
17 #include "unicode/utypes.h"
19 #include "unicode/ucnv.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/usetiter.h"
23 #include "unicode/ustring.h"
24 #include "unicode/parsepos.h"
25 #include "unicode/symtable.h"
26 #include "unicode/utf8.h"
27 #include "unicode/utf16.h"
28 #include "unicode/uversion.h"
32 #define TEST_ASSERT_SUCCESS(status) UPRV_BLOCK_MACRO_BEGIN { \
33 if (U_FAILURE(status)) { \
34 dataerrln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \
35 u_errorName(status)); \
37 } UPRV_BLOCK_MACRO_END
39 #define TEST_ASSERT(expr) UPRV_BLOCK_MACRO_BEGIN { \
41 dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); \
43 } UPRV_BLOCK_MACRO_END
45 UnicodeString
operator+(const UnicodeString
& left
, const UnicodeSet
& set
) {
48 return left
+ UnicodeSetTest::escape(pat
);
51 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL
) {
54 UConverter
*UnicodeSetTest::openUTF8Converter() {
56 UErrorCode errorCode
=U_ZERO_ERROR
;
57 utf8Cnv
=ucnv_open("UTF-8", &errorCode
);
62 UnicodeSetTest::~UnicodeSetTest() {
67 UnicodeSetTest::runIndexedTest(int32_t index
, UBool exec
,
68 const char* &name
, char* /*par*/) {
70 logln(u
"TestSuite UnicodeSetTest");
73 TESTCASE_AUTO(TestPatterns
);
74 TESTCASE_AUTO(TestAddRemove
);
75 TESTCASE_AUTO(TestCategories
);
76 TESTCASE_AUTO(TestCloneEqualHash
);
77 TESTCASE_AUTO(TestMinimalRep
);
78 TESTCASE_AUTO(TestAPI
);
79 TESTCASE_AUTO(TestScriptSet
);
80 TESTCASE_AUTO(TestPropertySet
);
81 TESTCASE_AUTO(TestClone
);
82 TESTCASE_AUTO(TestExhaustive
);
83 TESTCASE_AUTO(TestToPattern
);
84 TESTCASE_AUTO(TestIndexOf
);
85 TESTCASE_AUTO(TestStrings
);
86 TESTCASE_AUTO(Testj2268
);
87 TESTCASE_AUTO(TestCloseOver
);
88 TESTCASE_AUTO(TestEscapePattern
);
89 TESTCASE_AUTO(TestInvalidCodePoint
);
90 TESTCASE_AUTO(TestSymbolTable
);
91 TESTCASE_AUTO(TestSurrogate
);
92 TESTCASE_AUTO(TestPosixClasses
);
93 TESTCASE_AUTO(TestIteration
);
94 TESTCASE_AUTO(TestFreezable
);
95 TESTCASE_AUTO(TestSpan
);
96 TESTCASE_AUTO(TestStringSpan
);
97 TESTCASE_AUTO(TestUCAUnsafeBackwards
);
98 TESTCASE_AUTO(TestIntOverflow
);
99 TESTCASE_AUTO(TestUnusedCcc
);
100 TESTCASE_AUTO(TestDeepPattern
);
104 static const char NOT
[] = "%%%%";
107 * UVector was improperly copying contents
108 * This code will crash if this is still true
110 void UnicodeSetTest::Testj2268() {
112 t
.add(UnicodeString("abc"));
114 UnicodeString ustrPat
;
115 test
.toPattern(ustrPat
, TRUE
);
121 void UnicodeSetTest::TestToPattern() {
122 UErrorCode ec
= U_ZERO_ERROR
;
124 // Test that toPattern() round trips with syntax characters and
127 static const char* OTHER_TOPATTERN_TESTS
[] = {
128 "[[:latin:]&[:greek:]]",
129 "[[:latin:]-[:greek:]]",
130 "[:nonspacing mark:]",
134 for (int32_t j
=0; OTHER_TOPATTERN_TESTS
[j
]!=NULL
; ++j
) {
136 UnicodeSet
s(OTHER_TOPATTERN_TESTS
[j
], ec
);
138 dataerrln((UnicodeString
)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS
[j
] + " - " + UnicodeString(u_errorName(ec
)));
141 checkPat(OTHER_TOPATTERN_TESTS
[j
], s
);
144 for (UChar32 i
= 0; i
<= 0x10FFFF; ++i
) {
145 if ((i
<= 0xFF && !u_isalpha(i
)) || u_isspace(i
)) {
147 // check various combinations to make sure they all work.
148 if (i
!= 0 && !toPatternAux(i
, i
)){
151 if (!toPatternAux(0, i
)){
154 if (!toPatternAux(i
, 0xFFFF)){
161 // Test pattern behavior of multicharacter strings.
164 UnicodeSet
* s
= new UnicodeSet("[a-z {aa} {ab}]", ec
);
166 // This loop isn't a loop. It's here to make the compiler happy.
167 // If you're curious, try removing it and changing the 'break'
168 // statements (except for the last) to goto's.
170 if (U_FAILURE(ec
)) break;
171 const char* exp1
[] = {"aa", "ab", NOT
, "ac", NULL
};
172 expectToPattern(*s
, "[a-z{aa}{ab}]", exp1
);
175 const char* exp2
[] = {"aa", "ab", "ac", NOT
, "xy", NULL
};
176 expectToPattern(*s
, "[a-z{aa}{ab}{ac}]", exp2
);
178 s
->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec
);
179 if (U_FAILURE(ec
)) break;
180 const char* exp3
[] = {"{l", "r}", NOT
, "xy", NULL
};
181 expectToPattern(*s
, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3
);
184 const char* exp4
[] = {"{l", "r}", "[]", NOT
, "xy", NULL
};
185 expectToPattern(*s
, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4
);
187 s
->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec
);
188 if (U_FAILURE(ec
)) break;
189 const char* exp5
[] = {"\\u4E01\\u4E02", "\n\r", NULL
};
190 expectToPattern(*s
, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5
);
194 s
->add(UnicodeString("abc", ""));
195 s
->add(UnicodeString("abc", ""));
196 const char* exp6
[] = {"abc", NOT
, "ab", NULL
};
197 expectToPattern(*s
, "[{abc}]", exp6
);
202 if (U_FAILURE(ec
)) errln("FAIL: pattern parse error");
206 // JB#3400: For 2 character ranges prefer [ab] to [a-b]
208 s
.add((UChar
)97, (UChar
)98); // 'a', 'b'
209 expectToPattern(s
, "[ab]", NULL
);
212 UBool
UnicodeSetTest::toPatternAux(UChar32 start
, UChar32 end
) {
214 // use Integer.toString because Utility.hex doesn't handle ints
215 UnicodeString pat
= "";
216 // TODO do these in hex
217 //String source = "0x" + Integer.toString(start,16).toUpperCase();
218 //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
219 UnicodeString source
;
220 source
= source
+ (uint32_t)start
;
222 source
= source
+ ".." + (uint32_t)end
;
224 testSet
.add(start
, end
);
225 return checkPat(source
, testSet
);
228 UBool
UnicodeSetTest::checkPat(const UnicodeString
& source
,
229 const UnicodeSet
& testSet
) {
230 // What we want to make sure of is that a pattern generated
231 // by toPattern(), with or without escaped unprintables, can
232 // be passed back into the UnicodeSet constructor.
235 testSet
.toPattern(pat0
, TRUE
);
237 if (!checkPat(source
+ " (escaped)", testSet
, pat0
)) return FALSE
;
239 //String pat1 = unescapeLeniently(pat0);
240 //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
243 testSet
.toPattern(pat2
, FALSE
);
244 if (!checkPat(source
, testSet
, pat2
)) return FALSE
;
246 //String pat3 = unescapeLeniently(pat2);
247 // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
249 //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
250 logln((UnicodeString
)source
+ " => " + pat0
+ ", " + pat2
);
254 UBool
UnicodeSetTest::checkPat(const UnicodeString
& source
,
255 const UnicodeSet
& testSet
,
256 const UnicodeString
& pat
) {
257 UErrorCode ec
= U_ZERO_ERROR
;
258 UnicodeSet
testSet2(pat
, ec
);
259 if (testSet2
!= testSet
) {
260 errln((UnicodeString
)"Fail toPattern: " + source
+ " => " + pat
);
267 UnicodeSetTest::TestPatterns(void) {
269 expectPattern(set
, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km");
270 expectPattern(set
, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz");
271 expectPattern(set
, UnicodeString("[a\\-z]", ""), "--aazz");
272 expectPattern(set
, UnicodeString("[-az]", ""), "--aazz");
273 expectPattern(set
, UnicodeString("[az-]", ""), "--aazz");
274 expectPattern(set
, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
276 // Throw in a test of complement
279 exp
.append((UChar
)0x0000).append("aeeoouu").append((UChar
)(0x007a+1)).append((UChar
)0xFFFF);
280 expectPairs(set
, exp
);
284 UnicodeSetTest::TestCategories(void) {
285 UErrorCode status
= U_ZERO_ERROR
;
286 const char* pat
= " [:Lu:] "; // Whitespace ok outside [:..:]
287 UnicodeSet
set(pat
, status
);
288 if (U_FAILURE(status
)) {
289 dataerrln((UnicodeString
)"Fail: Can't construct set with " + pat
+ " - " + UnicodeString(u_errorName(status
)));
292 expectContainment(set
, pat
, "ABC", "abc");
296 int32_t failures
= 0;
297 // Make sure generation of L doesn't pollute cached Lu set
298 // First generate L, then Lu
299 set
.applyPattern("[:L:]", status
);
300 if (U_FAILURE(status
)) { errln("FAIL"); return; }
301 for (i
=0; i
<0x200; ++i
) {
302 UBool l
= u_isalpha((UChar
)i
);
303 if (l
!= set
.contains(i
)) {
304 errln((UnicodeString
)"FAIL: L contains " + (unsigned short)i
+ " = " +
306 if (++failures
== 10) break;
310 set
.applyPattern("[:Lu:]", status
);
311 if (U_FAILURE(status
)) { errln("FAIL"); return; }
312 for (i
=0; i
<0x200; ++i
) {
313 UBool lu
= (u_charType((UChar
)i
) == U_UPPERCASE_LETTER
);
314 if (lu
!= set
.contains(i
)) {
315 errln((UnicodeString
)"FAIL: Lu contains " + (unsigned short)i
+ " = " +
317 if (++failures
== 20) break;
322 UnicodeSetTest::TestCloneEqualHash(void) {
323 UErrorCode status
= U_ZERO_ERROR
;
324 // set1 and set2 used to be built with the obsolete constructor taking
325 // UCharCategory values; replaced with pattern constructors
327 UnicodeSet
*set1
=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status
); // :Ll: Letter, lowercase
328 UnicodeSet
*set1a
=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status
); // Letter, lowercase
329 if (U_FAILURE(status
)){
330 dataerrln((UnicodeString
)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status
)));
333 UnicodeSet
*set2
=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status
); //Number, Decimal digit
334 UnicodeSet
*set2a
=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status
); //Number, Decimal digit
335 if (U_FAILURE(status
)){
336 errln((UnicodeString
)"FAIL: Can't construct set with category->Nd");
340 if (*set1
!= *set1a
) {
341 errln("FAIL: category constructor for Ll broken");
343 if (*set2
!= *set2a
) {
344 errln("FAIL: category constructor for Nd broken");
349 logln("Testing copy construction");
350 UnicodeSet
*set1copy
=new UnicodeSet(*set1
);
351 if(*set1
!= *set1copy
|| *set1
== *set2
||
352 getPairs(*set1
) != getPairs(*set1copy
) ||
353 set1
->hashCode() != set1copy
->hashCode()){
354 errln("FAIL : Error in copy construction");
358 logln("Testing =operator");
359 UnicodeSet set1equal
=*set1
;
360 UnicodeSet set2equal
=*set2
;
361 if(set1equal
!= *set1
|| set1equal
!= *set1copy
|| set2equal
!= *set2
||
362 set2equal
== *set1
|| set2equal
== *set1copy
|| set2equal
== set1equal
){
363 errln("FAIL: Error in =operator");
366 logln("Testing clone()");
367 UnicodeSet
*set1clone
=set1
->clone();
368 UnicodeSet
*set2clone
=set2
->clone();
369 if(*set1clone
!= *set1
|| *set1clone
!= *set1copy
|| *set1clone
!= set1equal
||
370 *set2clone
!= *set2
|| *set2clone
== *set1copy
|| *set2clone
!= set2equal
||
371 *set2clone
== *set1
|| *set2clone
== set1equal
|| *set2clone
== *set1clone
){
372 errln("FAIL: Error in clone");
375 logln("Testing hashcode");
376 if(set1
->hashCode() != set1equal
.hashCode() || set1
->hashCode() != set1clone
->hashCode() ||
377 set2
->hashCode() != set2equal
.hashCode() || set2
->hashCode() != set2clone
->hashCode() ||
378 set1copy
->hashCode() != set1equal
.hashCode() || set1copy
->hashCode() != set1clone
->hashCode() ||
379 set1
->hashCode() == set2
->hashCode() || set1copy
->hashCode() == set2
->hashCode() ||
380 set2
->hashCode() == set1clone
->hashCode() || set2
->hashCode() == set1equal
.hashCode() ){
381 errln("FAIL: Error in hashCode()");
393 UnicodeSetTest::TestAddRemove(void) {
394 UnicodeSet set
; // Construct empty set
395 doAssert(set
.isEmpty() == TRUE
, "set should be empty");
396 doAssert(set
.size() == 0, "size should be 0");
398 doAssert(set
.size() == 0x110000, "size should be 0x110000");
400 set
.add(0x0061, 0x007a);
401 expectPairs(set
, "az");
402 doAssert(set
.isEmpty() == FALSE
, "set should not be empty");
403 doAssert(set
.size() != 0, "size should not be equal to 0");
404 doAssert(set
.size() == 26, "size should be equal to 26");
405 set
.remove(0x006d, 0x0070);
406 expectPairs(set
, "alqz");
407 doAssert(set
.size() == 22, "size should be equal to 22");
408 set
.remove(0x0065, 0x0067);
409 expectPairs(set
, "adhlqz");
410 doAssert(set
.size() == 19, "size should be equal to 19");
411 set
.remove(0x0064, 0x0069);
412 expectPairs(set
, "acjlqz");
413 doAssert(set
.size() == 16, "size should be equal to 16");
414 set
.remove(0x0063, 0x0072);
415 expectPairs(set
, "absz");
416 doAssert(set
.size() == 10, "size should be equal to 10");
417 set
.add(0x0066, 0x0071);
418 expectPairs(set
, "abfqsz");
419 doAssert(set
.size() == 22, "size should be equal to 22");
420 set
.remove(0x0061, 0x0067);
421 expectPairs(set
, "hqsz");
422 set
.remove(0x0061, 0x007a);
423 expectPairs(set
, "");
424 doAssert(set
.isEmpty() == TRUE
, "set should be empty");
425 doAssert(set
.size() == 0, "size should be 0");
427 doAssert(set
.isEmpty() == FALSE
, "set should not be empty");
428 doAssert(set
.size() == 1, "size should not be equal to 1");
431 expectPairs(set
, "ac");
432 doAssert(set
.size() == 3, "size should not be equal to 3");
435 expectPairs(set
, "acpq");
436 doAssert(set
.size() == 5, "size should not be equal to 5");
438 expectPairs(set
, "");
439 doAssert(set
.isEmpty() == TRUE
, "set should be empty");
440 doAssert(set
.size() == 0, "size should be 0");
442 // Try removing an entire set from another set
443 expectPattern(set
, "[c-x]", "cx");
445 expectPattern(set2
, "[f-ky-za-bc[vw]]", "acfkvwyz");
447 expectPairs(set
, "deluxx");
449 // Try adding an entire set to another set
450 expectPattern(set
, "[jackiemclean]", "aacceein");
451 expectPattern(set2
, "[hitoshinamekatajamesanderson]", "aadehkmort");
453 expectPairs(set
, "aacehort");
454 doAssert(set
.containsAll(set2
) == TRUE
, "set should contain all the elements in set2");
456 // Try retaining a set of elements contained in another set (intersection)
458 expectPattern(set3
, "[a-c]", "ac");
459 doAssert(set
.containsAll(set3
) == FALSE
, "set doesn't contain all the elements in set3");
461 expectPairs(set3
, "aacc");
462 doAssert(set
.containsAll(set3
) == TRUE
, "set should contain all the elements in set3");
464 expectPairs(set
, "aacc");
465 doAssert(set
.size() == set3
.size(), "set.size() should be set3.size()");
466 doAssert(set
.containsAll(set3
) == TRUE
, "set should contain all the elements in set3");
468 doAssert(set
.size() != set3
.size(), "set.size() != set3.size()");
470 // Test commutativity
471 expectPattern(set
, "[hitoshinamekatajamesanderson]", "aadehkmort");
472 expectPattern(set2
, "[jackiemclean]", "aacceein");
474 expectPairs(set
, "aacehort");
475 doAssert(set
.containsAll(set2
) == TRUE
, "set should contain all the elements in set2");
483 * Make sure minimal representation is maintained.
485 void UnicodeSetTest::TestMinimalRep() {
486 UErrorCode status
= U_ZERO_ERROR
;
487 // This is pretty thoroughly tested by checkCanonicalRep()
488 // run against the exhaustive operation results. Use the code
489 // here for debugging specific spot problems.
491 // 1 overlap against 2
492 UnicodeSet
set("[h-km-q]", status
);
493 if (U_FAILURE(status
)) { errln("FAIL"); return; }
494 UnicodeSet
set2("[i-o]", status
);
495 if (U_FAILURE(status
)) { errln("FAIL"); return; }
497 expectPairs(set
, "hq");
499 set
.applyPattern("[a-m]", status
);
500 if (U_FAILURE(status
)) { errln("FAIL"); return; }
501 set2
.applyPattern("[e-o]", status
);
502 if (U_FAILURE(status
)) { errln("FAIL"); return; }
504 expectPairs(set
, "ao");
506 set
.applyPattern("[e-o]", status
);
507 if (U_FAILURE(status
)) { errln("FAIL"); return; }
508 set2
.applyPattern("[a-m]", status
);
509 if (U_FAILURE(status
)) { errln("FAIL"); return; }
511 expectPairs(set
, "ao");
512 // 1 overlap against 3
513 set
.applyPattern("[a-eg-mo-w]", status
);
514 if (U_FAILURE(status
)) { errln("FAIL"); return; }
515 set2
.applyPattern("[d-q]", status
);
516 if (U_FAILURE(status
)) { errln("FAIL"); return; }
518 expectPairs(set
, "aw");
521 void UnicodeSetTest::TestAPI() {
522 UErrorCode status
= U_ZERO_ERROR
;
525 if (!set
.isEmpty() || set
.getRangeCount() != 0) {
526 errln((UnicodeString
)"FAIL, set should be empty but isn't: " +
530 // clear(), isEmpty()
533 errln((UnicodeString
)"FAIL, set shouldn't be empty but is: " +
537 if (!set
.isEmpty()) {
538 errln((UnicodeString
)"FAIL, set should be empty but isn't: " +
544 if (set
.size() != 0) {
545 errln((UnicodeString
)"FAIL, size should be 0, but is " + set
.size() +
549 if (set
.size() != 1) {
550 errln((UnicodeString
)"FAIL, size should be 1, but is " + set
.size() +
553 set
.add(0x0031, 0x0039);
554 if (set
.size() != 10) {
555 errln((UnicodeString
)"FAIL, size should be 10, but is " + set
.size() +
559 // contains(first, last)
561 set
.applyPattern("[A-Y 1-8 b-d l-y]", status
);
562 if (U_FAILURE(status
)) { errln("FAIL"); return; }
563 for (int32_t i
= 0; i
<set
.getRangeCount(); ++i
) {
564 UChar32 a
= set
.getRangeStart(i
);
565 UChar32 b
= set
.getRangeEnd(i
);
566 if (!set
.contains(a
, b
)) {
567 errln((UnicodeString
)"FAIL, should contain " + (unsigned short)a
+ '-' + (unsigned short)b
+
568 " but doesn't: " + set
);
570 if (set
.contains((UChar32
)(a
-1), b
)) {
571 errln((UnicodeString
)"FAIL, shouldn't contain " +
572 (unsigned short)(a
-1) + '-' + (unsigned short)b
+
573 " but does: " + set
);
575 if (set
.contains(a
, (UChar32
)(b
+1))) {
576 errln((UnicodeString
)"FAIL, shouldn't contain " +
577 (unsigned short)a
+ '-' + (unsigned short)(b
+1) +
578 " but does: " + set
);
582 // Ported InversionList test.
583 UnicodeSet
a((UChar32
)3,(UChar32
)10);
584 UnicodeSet
b((UChar32
)7,(UChar32
)15);
587 logln((UnicodeString
)"a [3-10]: " + a
);
588 logln((UnicodeString
)"b [7-15]: " + b
);
591 UnicodeSet
exp((UChar32
)3,(UChar32
)15);
593 logln((UnicodeString
)"c.set(a).add(b): " + c
);
595 errln((UnicodeString
)"FAIL: c.set(a).add(b) = " + c
+ ", expect " + exp
);
598 exp
.set((UChar32
)0, (UChar32
)2);
599 exp
.add((UChar32
)16, UnicodeSet::MAX_VALUE
);
601 logln((UnicodeString
)"c.complement(): " + c
);
603 errln((UnicodeString
)"FAIL: c.complement() = " + c
+ ", expect " + exp
);
606 exp
.set((UChar32
)3, (UChar32
)15);
608 logln((UnicodeString
)"c.complement(): " + c
);
610 errln((UnicodeString
)"FAIL: c.complement() = " + c
+ ", expect " + exp
);
614 exp
.set((UChar32
)3,(UChar32
)6);
615 exp
.add((UChar32
)11,(UChar32
) 15);
617 logln((UnicodeString
)"c.set(a).exclusiveOr(b): " + c
);
619 errln((UnicodeString
)"FAIL: c.set(a).exclusiveOr(b) = " + c
+ ", expect " + exp
);
623 bitsToSet(setToBits(c
), c
);
625 logln((UnicodeString
)"bitsToSet(setToBits(c)): " + c
);
627 errln((UnicodeString
)"FAIL: bitsToSet(setToBits(c)) = " + c
+ ", expect " + exp
);
630 // Additional tests for coverage JB#2118
631 //UnicodeSet::complement(class UnicodeString const &)
632 //UnicodeSet::complementAll(class UnicodeString const &)
633 //UnicodeSet::containsNone(class UnicodeSet const &)
634 //UnicodeSet::containsNone(long,long)
635 //UnicodeSet::containsSome(class UnicodeSet const &)
636 //UnicodeSet::containsSome(long,long)
637 //UnicodeSet::removeAll(class UnicodeString const &)
638 //UnicodeSet::retain(long)
639 //UnicodeSet::retainAll(class UnicodeString const &)
640 //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
641 //UnicodeSetIterator::getString(void)
643 set
.complement("ab");
644 exp
.applyPattern("[{ab}]", status
);
645 if (U_FAILURE(status
)) { errln("FAIL"); return; }
646 if (set
!= exp
) { errln("FAIL: complement(\"ab\")"); return; }
648 UnicodeSetIterator
iset(set
);
649 if (!iset
.next() || !iset
.isString()) {
650 errln("FAIL: UnicodeSetIterator::next/isString");
651 } else if (iset
.getString() != "ab") {
652 errln("FAIL: UnicodeSetIterator::getString");
655 set
.add((UChar32
)0x61, (UChar32
)0x7A);
656 set
.complementAll("alan");
657 exp
.applyPattern("[{ab}b-kmo-z]", status
);
658 if (U_FAILURE(status
)) { errln("FAIL"); return; }
659 if (set
!= exp
) { errln("FAIL: complementAll(\"alan\")"); return; }
661 exp
.applyPattern("[a-z]", status
);
662 if (U_FAILURE(status
)) { errln("FAIL"); return; }
663 if (set
.containsNone(exp
)) { errln("FAIL: containsNone(UnicodeSet)"); }
664 if (!set
.containsSome(exp
)) { errln("FAIL: containsSome(UnicodeSet)"); }
665 exp
.applyPattern("[aln]", status
);
666 if (U_FAILURE(status
)) { errln("FAIL"); return; }
667 if (!set
.containsNone(exp
)) { errln("FAIL: containsNone(UnicodeSet)"); }
668 if (set
.containsSome(exp
)) { errln("FAIL: containsSome(UnicodeSet)"); }
670 if (set
.containsNone((UChar32
)0x61, (UChar32
)0x7A)) {
671 errln("FAIL: containsNone(UChar32, UChar32)");
673 if (!set
.containsSome((UChar32
)0x61, (UChar32
)0x7A)) {
674 errln("FAIL: containsSome(UChar32, UChar32)");
676 if (!set
.containsNone((UChar32
)0x41, (UChar32
)0x5A)) {
677 errln("FAIL: containsNone(UChar32, UChar32)");
679 if (set
.containsSome((UChar32
)0x41, (UChar32
)0x5A)) {
680 errln("FAIL: containsSome(UChar32, UChar32)");
683 set
.removeAll("liu");
684 exp
.applyPattern("[{ab}b-hj-kmo-tv-z]", status
);
685 if (U_FAILURE(status
)) { errln("FAIL"); return; }
686 if (set
!= exp
) { errln("FAIL: removeAll(\"liu\")"); return; }
688 set
.retainAll("star");
689 exp
.applyPattern("[rst]", status
);
690 if (U_FAILURE(status
)) { errln("FAIL"); return; }
691 if (set
!= exp
) { errln("FAIL: retainAll(\"star\")"); return; }
693 set
.retain((UChar32
)0x73);
694 exp
.applyPattern("[s]", status
);
695 if (U_FAILURE(status
)) { errln("FAIL"); return; }
696 if (set
!= exp
) { errln("FAIL: retain('s')"); return; }
699 int32_t slen
= set
.serialize(buf
, UPRV_LENGTHOF(buf
), status
);
700 if (U_FAILURE(status
)) { errln("FAIL: serialize"); return; }
701 if (slen
!= 3 || buf
[0] != 2 || buf
[1] != 0x73 || buf
[2] != 0x74) {
702 errln("FAIL: serialize");
706 // Conversions to and from USet
707 UnicodeSet
*uniset
= &set
;
708 USet
*uset
= uniset
->toUSet();
709 TEST_ASSERT((void *)uset
== (void *)uniset
);
710 UnicodeSet
*setx
= UnicodeSet::fromUSet(uset
);
711 TEST_ASSERT((void *)setx
== (void *)uset
);
712 const UnicodeSet
*constSet
= uniset
;
713 const USet
*constUSet
= constSet
->toUSet();
714 TEST_ASSERT((void *)constUSet
== (void *)constSet
);
715 const UnicodeSet
*constSetx
= UnicodeSet::fromUSet(constUSet
);
716 TEST_ASSERT((void *)constSetx
== (void *)constUSet
);
718 // span(UnicodeString) and spanBack(UnicodeString) convenience methods
719 UnicodeString longString
=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
720 UnicodeSet
ac(0x61, 0x63);
721 ac
.remove(0x62).freeze();
722 if( ac
.span(longString
, -5, USET_SPAN_CONTAINED
)!=10 ||
723 ac
.span(longString
, 0, USET_SPAN_CONTAINED
)!=10 ||
724 ac
.span(longString
, 5, USET_SPAN_CONTAINED
)!=10 ||
725 ac
.span(longString
, 10, USET_SPAN_CONTAINED
)!=10 ||
726 ac
.span(longString
, 15, USET_SPAN_CONTAINED
)!=15 ||
727 ac
.span(longString
, 20, USET_SPAN_CONTAINED
)!=30 ||
728 ac
.span(longString
, 25, USET_SPAN_CONTAINED
)!=30 ||
729 ac
.span(longString
, 30, USET_SPAN_CONTAINED
)!=30 ||
730 ac
.span(longString
, 35, USET_SPAN_CONTAINED
)!=30 ||
731 ac
.span(longString
, INT32_MAX
, USET_SPAN_CONTAINED
)!=30
733 errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
735 if( ac
.spanBack(longString
, -5, USET_SPAN_CONTAINED
)!=0 ||
736 ac
.spanBack(longString
, 0, USET_SPAN_CONTAINED
)!=0 ||
737 ac
.spanBack(longString
, 5, USET_SPAN_CONTAINED
)!=0 ||
738 ac
.spanBack(longString
, 10, USET_SPAN_CONTAINED
)!=0 ||
739 ac
.spanBack(longString
, 15, USET_SPAN_CONTAINED
)!=15 ||
740 ac
.spanBack(longString
, 20, USET_SPAN_CONTAINED
)!=20 ||
741 ac
.spanBack(longString
, 25, USET_SPAN_CONTAINED
)!=20 ||
742 ac
.spanBack(longString
, 30, USET_SPAN_CONTAINED
)!=20 ||
743 ac
.spanBack(longString
, 35, USET_SPAN_CONTAINED
)!=20 ||
744 ac
.spanBack(longString
, INT32_MAX
, USET_SPAN_CONTAINED
)!=20
746 errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
750 void UnicodeSetTest::TestIteration() {
751 UErrorCode ec
= U_ZERO_ERROR
;
755 // 6 code points, 3 ranges, 2 strings, 8 total elements
756 // Iteration will access them in sorted order - a, b, c, y, z, U0001abcd, "str1", "str2"
757 UnicodeSet
set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec
);
758 TEST_ASSERT_SUCCESS(ec
);
759 UnicodeSetIterator
it(set
);
761 for (outerLoop
=0; outerLoop
<3; outerLoop
++) {
762 // Run the test multiple times, to check that iterator.reset() is working.
763 for (i
=0; i
<10; i
++) {
764 UBool nextv
= it
.next();
765 UBool isString
= it
.isString();
766 int32_t codePoint
= it
.getCodepoint();
767 //int32_t codePointEnd = it.getCodepointEnd();
768 UnicodeString s
= it
.getString();
771 TEST_ASSERT(nextv
== TRUE
);
772 TEST_ASSERT(isString
== FALSE
);
773 TEST_ASSERT(codePoint
==0x61);
774 TEST_ASSERT(s
== "a");
777 TEST_ASSERT(nextv
== TRUE
);
778 TEST_ASSERT(isString
== FALSE
);
779 TEST_ASSERT(codePoint
==0x62);
780 TEST_ASSERT(s
== "b");
783 TEST_ASSERT(nextv
== TRUE
);
784 TEST_ASSERT(isString
== FALSE
);
785 TEST_ASSERT(codePoint
==0x63);
786 TEST_ASSERT(s
== "c");
789 TEST_ASSERT(nextv
== TRUE
);
790 TEST_ASSERT(isString
== FALSE
);
791 TEST_ASSERT(codePoint
==0x79);
792 TEST_ASSERT(s
== "y");
795 TEST_ASSERT(nextv
== TRUE
);
796 TEST_ASSERT(isString
== FALSE
);
797 TEST_ASSERT(codePoint
==0x7a);
798 TEST_ASSERT(s
== "z");
801 TEST_ASSERT(nextv
== TRUE
);
802 TEST_ASSERT(isString
== FALSE
);
803 TEST_ASSERT(codePoint
==0x1abcd);
804 TEST_ASSERT(s
== UnicodeString((UChar32
)0x1abcd));
807 TEST_ASSERT(nextv
== TRUE
);
808 TEST_ASSERT(isString
== TRUE
);
809 TEST_ASSERT(s
== "str1");
812 TEST_ASSERT(nextv
== TRUE
);
813 TEST_ASSERT(isString
== TRUE
);
814 TEST_ASSERT(s
== "str2");
817 TEST_ASSERT(nextv
== FALSE
);
820 TEST_ASSERT(nextv
== FALSE
);
824 it
.reset(); // prepare to run the iteration again.
831 void UnicodeSetTest::TestStrings() {
832 UErrorCode ec
= U_ZERO_ERROR
;
834 UnicodeSet
* testList
[] = {
835 UnicodeSet::createFromAll("abc"),
836 new UnicodeSet("[a-c]", ec
),
838 &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
839 new UnicodeSet("[{ll}{ch}a-z]", ec
),
841 UnicodeSet::createFrom("ab}c"),
842 new UnicodeSet("[{ab\\}c}]", ec
),
844 &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
845 new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec
),
851 errln("FAIL: couldn't construct test sets");
854 for (int32_t i
= 0; testList
[i
] != NULL
; i
+=2) {
856 UnicodeString pat0
, pat1
;
857 testList
[i
]->toPattern(pat0
, TRUE
);
858 testList
[i
+1]->toPattern(pat1
, TRUE
);
859 if (*testList
[i
] == *testList
[i
+1]) {
860 logln((UnicodeString
)"Ok: " + pat0
+ " == " + pat1
);
862 logln((UnicodeString
)"FAIL: " + pat0
+ " != " + pat1
);
866 delete testList
[i
+1];
871 * Test the [:Latin:] syntax.
873 void UnicodeSetTest::TestScriptSet() {
874 expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
876 expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
879 expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
884 * Test property pattern syntax: \p{...}, \P{...}, \N{...} and [:...:].
886 void UnicodeSetTest::TestPropertySet() {
887 static const char* const DATA
[] = {
888 // Pattern, Chars IN, Chars NOT in
898 "\\P{ GENERAL Category = upper case letter }",
902 #if !UCONFIG_NO_NORMALIZATION
903 // Combining class: @since ICU 2.2
904 // Check both symbolic and numeric
909 "\\p{Canonical Combining Class = 11}",
913 "[:c c c = iota subscript :]",
918 // Bidi class: @since ICU 2.2
919 "\\p{bidiclass=lefttoright}",
923 // Binary properties: @since ICU 2.2
930 // weiv: ')', '(' and '*' were removed from math in Unicode 4.0.1
934 // JB#1767 \N{}, \p{ASCII}
939 "[\\N{ latin small letter a }[:name= latin small letter z:]]",
955 "\\u03D8\\u03D9", // 3.2
958 "\\u1800\\u3400\\U0002f800",
959 "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
961 // JB#2350: Case_Sensitive
962 "[:Case Sensitive:]",
963 "A\\u1FFC\\U00010410",
964 ";\\u00B4\\U00010500",
966 // JB#2832: C99-compatibility props
973 " \\u0003\\u0007\\u0009\\u000A\\u000D",
976 "!@#%&*()[]{}-_\\/;:,.?'\"",
983 // Regex compatibility test
984 "[-b]", // leading '-' is literal
988 "[^-b]", // leading '-' is literal
992 "[b-]", // trailing '-' is literal
996 "[^b-]", // trailing '-' is literal
1000 "[a-b-]", // trailing '-' is literal
1004 "[[a-q]&[p-z]-]", // trailing '-' is literal
1008 "[\\s|\\)|:|$|\\>]", // from regex tests
1012 "[\\uDC00cd]", // JB#2906: isolated trail at start
1014 "ab\\uD800\\U00010000",
1016 "[ab\\uD800]", // JB#2906: isolated trail at start
1018 "cd\\uDC00\\U00010000",
1020 "[ab\\uD800cd]", // JB#2906: isolated lead in middle
1022 "ef\\uDC00\\U00010000",
1024 "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
1026 "ef\\uD800\\U00010000",
1028 #if !UCONFIG_NO_NORMALIZATION
1029 "[:^lccc=0:]", // Lead canonical class
1031 "abcd\\u00c0\\u00c5",
1033 "[:^tccc=0:]", // Trail canonical class
1034 "\\u0300\\u0301\\u00c0\\u00c5",
1037 "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
1038 "\\u0300\\u0301\\u00c0\\u00c5",
1041 "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
1043 "abcd\\u0300\\u0301\\u00c0\\u00c5",
1045 "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
1046 "\\u0F73\\u0F75\\u0F81",
1047 "abcd\\u0300\\u0301\\u00c0\\u00c5",
1048 #endif /* !UCONFIG_NO_NORMALIZATION */
1051 "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
1052 "\\u0888\\uFDD3\\uFFFE\\U00050005",
1054 // Script_Extensions, new in Unicode 6.0
1056 "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
1057 "\\u061D\\uFDEF\\uFDFE",
1059 // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
1060 // so scx-sc is missing U+FDF2.
1061 "[[:Script_Extensions=Arabic:]-[:Arab:]]",
1062 "\\u0640\\u064B\\u0650\\u0655",
1066 static const int32_t DATA_LEN
= UPRV_LENGTHOF(DATA
);
1068 for (int32_t i
=0; i
<DATA_LEN
; i
+=3) {
1069 expectContainment(UnicodeString(DATA
[i
], -1, US_INV
), CharsToUnicodeString(DATA
[i
+1]),
1070 CharsToUnicodeString(DATA
[i
+2]));
1075 * Test that Posix style character classes [:digit:], etc.
1076 * have the Unicode definitions from TR 18.
1078 void UnicodeSetTest::TestPosixClasses() {
1080 UErrorCode status
= U_ZERO_ERROR
;
1081 UnicodeSet
s1("[:alpha:]", status
);
1082 UnicodeSet
s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status
);
1083 TEST_ASSERT_SUCCESS(status
);
1084 TEST_ASSERT(s1
==s2
);
1087 UErrorCode status
= U_ZERO_ERROR
;
1088 UnicodeSet
s1("[:lower:]", status
);
1089 UnicodeSet
s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status
);
1090 TEST_ASSERT_SUCCESS(status
);
1091 TEST_ASSERT(s1
==s2
);
1094 UErrorCode status
= U_ZERO_ERROR
;
1095 UnicodeSet
s1("[:upper:]", status
);
1096 UnicodeSet
s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status
);
1097 TEST_ASSERT_SUCCESS(status
);
1098 TEST_ASSERT(s1
==s2
);
1101 UErrorCode status
= U_ZERO_ERROR
;
1102 UnicodeSet
s1("[:punct:]", status
);
1103 UnicodeSet
s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status
);
1104 TEST_ASSERT_SUCCESS(status
);
1105 TEST_ASSERT(s1
==s2
);
1108 UErrorCode status
= U_ZERO_ERROR
;
1109 UnicodeSet
s1("[:digit:]", status
);
1110 UnicodeSet
s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status
);
1111 TEST_ASSERT_SUCCESS(status
);
1112 TEST_ASSERT(s1
==s2
);
1115 UErrorCode status
= U_ZERO_ERROR
;
1116 UnicodeSet
s1("[:xdigit:]", status
);
1117 UnicodeSet
s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status
);
1118 TEST_ASSERT_SUCCESS(status
);
1119 TEST_ASSERT(s1
==s2
);
1122 UErrorCode status
= U_ZERO_ERROR
;
1123 UnicodeSet
s1("[:alnum:]", status
);
1124 UnicodeSet
s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status
);
1125 TEST_ASSERT_SUCCESS(status
);
1126 TEST_ASSERT(s1
==s2
);
1129 UErrorCode status
= U_ZERO_ERROR
;
1130 UnicodeSet
s1("[:space:]", status
);
1131 UnicodeSet
s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status
);
1132 TEST_ASSERT_SUCCESS(status
);
1133 TEST_ASSERT(s1
==s2
);
1136 UErrorCode status
= U_ZERO_ERROR
;
1137 UnicodeSet
s1("[:blank:]", status
);
1138 TEST_ASSERT_SUCCESS(status
);
1139 UnicodeSet
s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
1141 TEST_ASSERT_SUCCESS(status
);
1142 TEST_ASSERT(s1
==s2
);
1145 UErrorCode status
= U_ZERO_ERROR
;
1146 UnicodeSet
s1("[:cntrl:]", status
);
1147 TEST_ASSERT_SUCCESS(status
);
1148 UnicodeSet
s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status
);
1149 TEST_ASSERT_SUCCESS(status
);
1150 TEST_ASSERT(s1
==s2
);
1153 UErrorCode status
= U_ZERO_ERROR
;
1154 UnicodeSet
s1("[:graph:]", status
);
1155 TEST_ASSERT_SUCCESS(status
);
1156 UnicodeSet
s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status
);
1157 TEST_ASSERT_SUCCESS(status
);
1158 TEST_ASSERT(s1
==s2
);
1161 UErrorCode status
= U_ZERO_ERROR
;
1162 UnicodeSet
s1("[:print:]", status
);
1163 TEST_ASSERT_SUCCESS(status
);
1164 UnicodeSet
s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status
);
1165 TEST_ASSERT_SUCCESS(status
);
1166 TEST_ASSERT(s1
==s2
);
1170 * Test cloning of UnicodeSet. For C++, we test the copy constructor.
1172 void UnicodeSetTest::TestClone() {
1173 UErrorCode ec
= U_ZERO_ERROR
;
1174 UnicodeSet
s("[abcxyz]", ec
);
1176 expectContainment(t
, "abc", "def");
1180 * Test the indexOf() and charAt() methods.
1182 void UnicodeSetTest::TestIndexOf() {
1183 UErrorCode ec
= U_ZERO_ERROR
;
1184 UnicodeSet
set("[a-cx-y3578]", ec
);
1185 if (U_FAILURE(ec
)) {
1186 errln("FAIL: UnicodeSet constructor");
1189 for (int32_t i
=0; i
<set
.size(); ++i
) {
1190 UChar32 c
= set
.charAt(i
);
1191 if (set
.indexOf(c
) != i
) {
1192 errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1193 i
, c
, set
.indexOf(c
));
1196 UChar32 c
= set
.charAt(set
.size());
1198 errln("FAIL: charAt(<out of range>) = %X", c
);
1200 int32_t j
= set
.indexOf((UChar32
)0x71/*'q'*/);
1202 errln((UnicodeString
)"FAIL: indexOf('q') = " + j
);
// Tests case closure on UnicodeSet: closeOver() driven by a data table, then
// the applyPattern() and constructor overloads that take closure options.
// NOTE(review): several original lines of this function (table rows, braces,
// the set comparison) are not visible in this excerpt.
1209 void UnicodeSetTest::TestCloseOver() {
1210 UErrorCode ec
= U_ZERO_ERROR
;
// One-element char arrays so an option value can sit in the const char*
// table below; the loop reads the value back as DATA[i][0].
1212 char CASE
[] = {(char)USET_CASE_INSENSITIVE
};
1213 char CASE_MAPPINGS
[] = {(char)USET_ADD_CASE_MAPPINGS
};
// Table of triples, consumed three entries at a time until a NULL entry.
1214 const char* DATA
[] = {
1215 // selector, input, output
1217 "[aq\\u00DF{Bc}{bC}{Fi}]",
1218 "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
1221 "[\\u01F1]", // 'DZ'
1222 "[\\u01F1\\u01F2\\u01F3]",
1226 "[\\u1FB4{\\u03AC\\u03B9}]",
1232 CASE
, // make sure binary search finds limits
1234 "[aA\\uFF3A\\uFF5A]",
1237 "[a-z]","[A-Za-z\\u017F\\u212A]",
1243 CASE
, "[i]", "[iI]",
1245 CASE
, "[\\u0130]", "[\\u0130{i\\u0307}]", // dotted I
1246 CASE
, "[{i\\u0307}]", "[\\u0130{i\\u0307}]", // i with dot
1248 CASE
, "[\\u0131]", "[\\u0131]", // dotless i
1250 CASE
, "[\\u0390]", "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1252 CASE
, "[\\u03c2]", "[\\u03a3\\u03c2\\u03c3]", // sigmas
1254 CASE
, "[\\u03f2]", "[\\u03f2\\u03f9]", // lunate sigmas
1256 CASE
, "[\\u03f7]", "[\\u03f7\\u03f8]",
1258 CASE
, "[\\u1fe3]", "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1260 CASE
, "[\\ufb05]", "[\\ufb05\\ufb06{st}]",
1261 CASE
, "[{st}]", "[\\ufb05\\ufb06{st}]",
1263 CASE
, "[\\U0001044F]", "[\\U00010427\\U0001044F]",
1265 CASE
, "[{a\\u02BE}]", "[\\u1E9A{a\\u02BE}]", // first in sorted table
1267 CASE
, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
// The rows below are only compiled when file I/O is available.
1269 #if !UCONFIG_NO_FILE_IO
1271 "[aq\\u00DF{Bc}{bC}{Fi}]",
1272 "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1276 "[\\u01F1]", // 'DZ'
1277 "[\\u01F1\\u01F2\\u01F3]",
// For each triple: parse the input into s, close it over with the selector,
// parse the expected pattern into t, then log "Ok" or report "FAIL".
1289 for (int32_t i
=0; DATA
[i
]!=NULL
; i
+=3) {
1290 int32_t selector
= DATA
[i
][0];
1291 UnicodeString
pat(DATA
[i
+1], -1, US_INV
);
1292 UnicodeString
exp(DATA
[i
+2], -1, US_INV
);
1293 s
.applyPattern(pat
, ec
);
1294 s
.closeOver(selector
);
1295 t
.applyPattern(exp
, ec
);
1296 if (U_FAILURE(ec
)) {
1297 errln("FAIL: applyPattern failed");
1301 logln((UnicodeString
)"Ok: " + pat
+ ".closeOver(" + selector
+ ") => " + exp
);
1303 dataerrln((UnicodeString
)"FAIL: " + pat
+ ".closeOver(" + selector
+ ") => " +
1304 s
.toPattern(buf
, TRUE
) + ", expected " + exp
);
// NOTE(review): the text and code from here to the "Test the pattern API"
// section describe a historical old-vs-new closure comparison; the comment
// and preprocessor delimiters around it are not visible in this excerpt, so
// whether this section is compiled cannot be confirmed from here.
1311 * This was used to compare the old implementation (using USET_CASE)
1312 * with the new one (using 0x100 temporarily)
1313 * while transitioning from hardcoded case closure tables in uniset.cpp
1314 * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1315 * and using ucase.c functions for closure.
1316 * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1318 * Note: The old and new implementation never fully matched because
1319 * the old implementation turned out to not map U+0130 and U+0131 correctly
1320 * (dotted I and dotless i) and because the old implementation's data tables
1321 * were outdated compared to Unicode 4.0.1 at the time of the change to the
1322 * new implementation. (So sigmas and some other characters were not handled
1323 * according to the newer Unicode version.)
1325 UnicodeSet
sens("[:case_sensitive:]", ec
), sens2
, s2
;
1326 UnicodeSetIterator
si(sens
);
1327 UnicodeString str
, buf2
;
1328 const UnicodeString
*pStr
;
1331 if(!si
.isString()) {
1332 c
=si
.getCodepoint();
1341 s
.closeOver(USET_CASE
);
1344 errln("FAIL: closeOver(U+%04x) differs: ", c
);
1345 errln((UnicodeString
)"old "+s
.toPattern(buf
, TRUE
)+" new: "+t
.toPattern(buf2
, TRUE
));
1349 // remove all code points
1350 // should contain all full case folding mapping strings
1351 sens2
.remove(0, 0x10ffff);
1355 pStr
=&si
.getString();
1359 s
.closeOver(USET_CASE
);
1362 errln((UnicodeString
)"FAIL: closeOver("+s2
.toPattern(buf
, TRUE
)+") differs: ");
1363 errln((UnicodeString
)"old "+s
.toPattern(buf
, TRUE
)+" new: "+t
.toPattern(buf2
, TRUE
));
1369 // Test the pattern API
1370 s
.applyPattern("[abc]", USET_CASE_INSENSITIVE
, NULL
, ec
);
1371 if (U_FAILURE(ec
)) {
1372 errln("FAIL: applyPattern failed");
1374 expectContainment(s
, "abcABC", "defDEF");
// Same option on a negated pattern: the expected containment is inverted.
1376 UnicodeSet
v("[^abc]", USET_CASE_INSENSITIVE
, NULL
, ec
);
1377 if (U_FAILURE(ec
)) {
1378 errln("FAIL: constructor failed");
1380 expectContainment(v
, "defDEF", "abcABC");
// With USET_ADD_CASE_MAPPINGS the set is expected to hold k/K but not U+212A,
// whereas the case-insensitive row for [a-z] in DATA above lists U+212A.
1382 UnicodeSet
cm("[abck]", USET_ADD_CASE_MAPPINGS
, NULL
, ec
);
1383 if (U_FAILURE(ec
)) {
1384 errln("FAIL: construct w/case mappings failed");
1386 expectContainment(cm
, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1390 void UnicodeSetTest::TestEscapePattern() {
1391 const char pattern
[] =
1392 "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1394 "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1395 // We test this with two passes; in the second pass we
1396 // pre-unescape the pattern. Since U+200E is Pattern_White_Space,
1397 // this fails -- which is what we expect.
1398 for (int32_t pass
=1; pass
<=2; ++pass
) {
1399 UErrorCode ec
= U_ZERO_ERROR
;
1400 UnicodeString
pat(pattern
, -1, US_INV
);
1402 pat
= pat
.unescape();
1404 // Pattern is only good for pass 1
1405 UBool isPatternValid
= (pass
==1);
1407 UnicodeSet
set(pat
, ec
);
1408 if (U_SUCCESS(ec
) != isPatternValid
){
1409 errln((UnicodeString
)"FAIL: applyPattern(" +
1410 escape(pat
) + ") => " +
1414 if (U_FAILURE(ec
)) {
1417 if (set
.contains((UChar
)0x0644)){
1418 errln((UnicodeString
)"FAIL: " + escape(pat
) + " contains(U+0664)");
1421 UnicodeString newpat
;
1422 set
.toPattern(newpat
, TRUE
);
1423 if (newpat
== UnicodeString(exp
, -1, US_INV
)) {
1424 logln(escape(pat
) + " => " + newpat
);
1426 errln((UnicodeString
)"FAIL: " + escape(pat
) + " => " + newpat
);
1429 for (int32_t i
=0; i
<set
.getRangeCount(); ++i
) {
1430 UnicodeString
str("Range ");
1431 str
.append((UChar
)(0x30 + i
))
1433 .append((UChar32
)set
.getRangeStart(i
))
1435 .append((UChar32
)set
.getRangeEnd(i
));
1436 str
= str
+ " (" + set
.getRangeStart(i
) + " - " +
1437 set
.getRangeEnd(i
) + ")";
1438 if (set
.getRangeStart(i
) < 0) {
1439 errln((UnicodeString
)"FAIL: " + escape(str
));
1447 void UnicodeSetTest::expectRange(const UnicodeString
& label
,
1448 const UnicodeSet
& set
,
1449 UChar32 start
, UChar32 end
) {
1450 UnicodeSet
exp(start
, end
);
1453 logln(label
+ " => " + set
.toPattern(pat
, TRUE
));
1456 errln((UnicodeString
)"FAIL: " + label
+ " => " +
1457 set
.toPattern(pat
, TRUE
) +
1458 ", expected " + exp
.toPattern(xpat
, TRUE
));
// Exercises UnicodeSet APIs with code points outside the valid range
// (negative, or above 0x10FFFF) as range endpoints and as single arguments.
// NOTE(review): many continuation lines (expected-range arguments, some
// calls, the DATA2 values) are not visible in this excerpt.
1462 void UnicodeSetTest::TestInvalidCodePoint() {
// Rows of four values, consumed four at a time by the loop below.
1464 const UChar32 DATA
[] = {
1465 // Test range Expected range
1466 0, 0x10FFFF, 0, 0x10FFFF,
1467 (UChar32
)-1, 8, 0, 8,
1468 8, 0x110000, 8, 0x10FFFF
1470 const int32_t DATA_LENGTH
= UPRV_LENGTHOF(DATA
);
1475 for (i
=0; i
<DATA_LENGTH
; i
+=4) {
1476 UChar32 start
= DATA
[i
];
1477 UChar32 end
= DATA
[i
+1];
1478 UChar32 xstart
= DATA
[i
+2];
1479 UChar32 xend
= DATA
[i
+3];
1481 // Try various API using the test code points
1483 UnicodeSet
set(start
, end
);
1484 expectRange((UnicodeString
)"ct(" + start
+ "," + end
+ ")",
1488 set
.set(start
, end
);
1489 expectRange((UnicodeString
)"set(" + start
+ "," + end
+ ")",
// The query results are not checked; these calls are only exercised.
1492 UBool b
= set
.contains(start
);
1493 b
= set
.contains(start
, end
);
1494 b
= set
.containsNone(start
, end
);
1495 b
= set
.containsSome(start
, end
);
1496 (void)b
; // Suppress set but not used warning.
1498 /*int32_t index = set.indexOf(start);*/
1502 set
.add(start
, end
);
1503 expectRange((UnicodeString
)"add(" + start
+ "," + end
+ ")",
// For the remaining operations the set is first reset to the full range.
1506 set
.set(0, 0x10FFFF);
1507 set
.retain(start
, end
);
1508 expectRange((UnicodeString
)"retain(" + start
+ "," + end
+ ")",
1512 set
.set(0, 0x10FFFF);
1514 set
.remove(start
, end
);
1516 expectRange((UnicodeString
)"!remove(" + start
+ "," + end
+ ")",
1519 set
.set(0, 0x10FFFF);
1520 set
.complement(start
, end
);
1522 expectRange((UnicodeString
)"!complement(" + start
+ "," + end
+ ")",
1524 set
.complement(start
);
// Second part: single code points tested against the full-range set.
1527 const UChar32 DATA2
[] = {
1533 const int32_t DATA2_LENGTH
= UPRV_LENGTHOF(DATA2
);
1535 for (i
=0; i
<DATA2_LENGTH
; ++i
) {
1536 UChar32 c
= DATA2
[i
], end
= 0x10FFFF;
// valid is true exactly when c lies in 0..0x10FFFF.
1537 UBool valid
= (c
>= 0 && c
<= 0x10FFFF);
1539 UnicodeSet
set(0, 0x10FFFF);
1541 // For single-codepoint contains, invalid codepoints are NOT contained
1542 UBool b
= set
.contains(c
);
1544 logln((UnicodeString
)"[\\u0000-\\U0010FFFF].contains(" + c
+
1547 errln((UnicodeString
)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c
+
1551 // For codepoint range contains, containsNone, and containsSome,
1552 // invalid or empty (start > end) ranges have UNDEFINED behavior.
// The results of the three range queries below are only logged.
1553 b
= set
.contains(c
, end
);
1554 logln((UnicodeString
)"* [\\u0000-\\U0010FFFF].contains(" + c
+
1555 "," + end
+ ") = " + b
);
1557 b
= set
.containsNone(c
, end
);
1558 logln((UnicodeString
)"* [\\u0000-\\U0010FFFF].containsNone(" + c
+
1559 "," + end
+ ") = " + b
);
1561 b
= set
.containsSome(c
, end
);
1562 logln((UnicodeString
)"* [\\u0000-\\U0010FFFF].containsSome(" + c
+
1563 "," + end
+ ") = " + b
);
// indexOf() must yield a non-negative index exactly for valid code points.
1565 int32_t index
= set
.indexOf(c
);
1566 if ((index
>= 0) == valid
) {
1567 logln((UnicodeString
)"[\\u0000-\\U0010FFFF].indexOf(" + c
+
1570 errln((UnicodeString
)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c
+
1576 // Used by TestSymbolTable
1577 class TokenSymbolTable
: public SymbolTable
{
1581 TokenSymbolTable(UErrorCode
& ec
) : contents(FALSE
, ec
) {
1582 contents
.setValueDeleter(uprv_deleteUObject
);
1585 ~TokenSymbolTable() {}
1588 * (Non-SymbolTable API) Add the given variable and value to
1589 * the table. Variable should NOT contain leading '$'.
1591 void add(const UnicodeString
& var
, const UnicodeString
& value
,
1593 if (U_SUCCESS(ec
)) {
1594 contents
.put(var
, new UnicodeString(value
), ec
);
1601 virtual const UnicodeString
* lookup(const UnicodeString
& s
) const {
1602 return (const UnicodeString
*) contents
.get(s
);
1608 virtual const UnicodeFunctor
* lookupMatcher(UChar32
/*ch*/) const {
1615 virtual UnicodeString
parseReference(const UnicodeString
& text
,
1616 ParsePosition
& pos
, int32_t limit
) const {
1617 int32_t start
= pos
.getIndex();
1619 UnicodeString result
;
1621 UChar c
= text
.charAt(i
);
1622 if ((i
==start
&& !u_isIDStart(c
)) || !u_isIDPart(c
)) {
1627 if (i
== start
) { // No valid name chars
1628 return result
; // Indicate failure with empty string
1631 text
.extractBetween(start
, i
, result
);
// Tests UnicodeSet pattern parsing with variable references that are
// resolved through a TokenSymbolTable.
// NOTE(review): the lines that advance i past each consumed pair and the
// error-path statements are not visible in this excerpt.
1636 void UnicodeSetTest::TestSymbolTable() {
1637 // Multiple test cases can be set up here. Each test case
1638 // is terminated by null:
1639 // var, value, var, value,..., input pat., exp. output pat., null
1640 const char* DATA
[] = {
1641 "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL
,
1642 "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL
,
1643 "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL
,
1647 for (int32_t i
=0; DATA
[i
]!=NULL
; ++i
) {
1648 UErrorCode ec
= U_ZERO_ERROR
;
1649 TokenSymbolTable
sym(ec
);
1650 if (U_FAILURE(ec
)) {
1651 errln("FAIL: couldn't construct TokenSymbolTable");
// While the entry two positions ahead is not the terminator, the current
// two entries are a (variable, value) pair rather than the final two
// patterns of the test case.
1656 while (DATA
[i
+2] != NULL
) {
1657 sym
.add(UnicodeString(DATA
[i
], -1, US_INV
), UnicodeString(DATA
[i
+1], -1, US_INV
), ec
);
1658 if (U_FAILURE(ec
)) {
1659 errln("FAIL: couldn't add to TokenSymbolTable");
1665 // Input pattern and expected output pattern
1666 UnicodeString inpat
= UnicodeString(DATA
[i
], -1, US_INV
), exppat
= UnicodeString(DATA
[i
+1], -1, US_INV
);
// Parse the input pattern with the symbol table supplying variable values.
1669 ParsePosition
pos(0);
1670 UnicodeSet
us(inpat
, pos
, USET_IGNORE_SPACE
, &sym
, ec
);
1671 if (U_FAILURE(ec
)) {
1672 errln("FAIL: couldn't construct UnicodeSet");
// The parse must have consumed the whole input pattern.
1677 if (pos
.getIndex() != inpat
.length()) {
1678 errln((UnicodeString
)"Failed to read to end of string \""
1679 + inpat
+ "\": read to "
1680 + pos
.getIndex() + ", length is "
// The expected pattern is parsed without a symbol table.
1684 UnicodeSet
us2(exppat
, ec
);
1685 if (U_FAILURE(ec
)) {
1686 errln("FAIL: couldn't construct expected UnicodeSet");
1692 errln((UnicodeString
)"Failed, got " + us
.toPattern(a
, TRUE
) +
1693 ", expected " + us2
.toPattern(b
, TRUE
));
1695 logln((UnicodeString
)"Ok, got " + us
.toPattern(a
, TRUE
));
1700 void UnicodeSetTest::TestSurrogate() {
1701 const char* DATA
[] = {
1702 // These should all behave identically
1703 "[abc\\uD800\\uDC00]",
1704 // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1708 for (int i
=0; DATA
[i
] != 0; ++i
) {
1709 UErrorCode ec
= U_ZERO_ERROR
;
1710 logln((UnicodeString
)"Test pattern " + i
+ " :" + UnicodeString(DATA
[i
], -1, US_INV
));
1711 UnicodeString str
= UnicodeString(DATA
[i
], -1, US_INV
);
1712 UnicodeSet
set(str
, ec
);
1713 if (U_FAILURE(ec
)) {
1714 errln("FAIL: UnicodeSet constructor");
1717 expectContainment(set
,
1718 CharsToUnicodeString("abc\\U00010000"),
1719 CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1720 if (set
.size() != 4) {
1721 errln((UnicodeString
)"FAIL: " + UnicodeString(DATA
[i
], -1, US_INV
) + ".size() == " +
1722 set
.size() + ", expected 4");
1726 UErrorCode subErr
= U_ZERO_ERROR
;
1727 checkRoundTrip(set
);
1728 checkSerializeRoundTrip(set
, subErr
);
1733 void UnicodeSetTest::TestExhaustive() {
1734 // exhaustive tests. Simulate UnicodeSets with integers.
1735 // That gives us very solid tests (except for large memory tests).
1737 int32_t limit
= 128;
1739 UnicodeSet x
, y
, z
, aa
;
1741 for (int32_t i
= 0; i
< limit
; ++i
) {
1743 logln((UnicodeString
)"Testing " + i
+ ", " + x
);
1744 _testComplement(i
, x
, y
);
1746 UnicodeSet
&toTest
= bitsToSet(i
, aa
);
1748 // AS LONG AS WE ARE HERE, check roundtrip
1749 checkRoundTrip(toTest
);
1750 UErrorCode ec
= U_ZERO_ERROR
;
1751 checkSerializeRoundTrip(toTest
, ec
);
1753 for (int32_t j
= 0; j
< limit
; ++j
) {
1754 _testAdd(i
,j
, x
,y
,z
);
1755 _testXor(i
,j
, x
,y
,z
);
1756 _testRetain(i
,j
, x
,y
,z
);
1757 _testRemove(i
,j
, x
,y
,z
);
1762 void UnicodeSetTest::_testComplement(int32_t a
, UnicodeSet
& x
, UnicodeSet
& z
) {
1766 int32_t c
= setToBits(z
);
1768 errln((UnicodeString
)"FAILED: add: ~" + x
+ " != " + z
);
1769 errln((UnicodeString
)"FAILED: add: ~" + a
+ " != " + c
);
1771 checkCanonicalRep(z
, (UnicodeString
)"complement " + a
);
1774 void UnicodeSetTest::_testAdd(int32_t a
, int32_t b
, UnicodeSet
& x
, UnicodeSet
& y
, UnicodeSet
& z
) {
1779 int32_t c
= setToBits(z
);
1781 errln((UnicodeString
)"FAILED: add: " + x
+ " | " + y
+ " != " + z
);
1782 errln((UnicodeString
)"FAILED: add: " + a
+ " | " + b
+ " != " + c
);
1784 checkCanonicalRep(z
, (UnicodeString
)"add " + a
+ "," + b
);
1787 void UnicodeSetTest::_testRetain(int32_t a
, int32_t b
, UnicodeSet
& x
, UnicodeSet
& y
, UnicodeSet
& z
) {
1792 int32_t c
= setToBits(z
);
1794 errln((UnicodeString
)"FAILED: retain: " + x
+ " & " + y
+ " != " + z
);
1795 errln((UnicodeString
)"FAILED: retain: " + a
+ " & " + b
+ " != " + c
);
1797 checkCanonicalRep(z
, (UnicodeString
)"retain " + a
+ "," + b
);
1800 void UnicodeSetTest::_testRemove(int32_t a
, int32_t b
, UnicodeSet
& x
, UnicodeSet
& y
, UnicodeSet
& z
) {
1805 int32_t c
= setToBits(z
);
1806 if (c
!= (a
&~ b
)) {
1807 errln((UnicodeString
)"FAILED: remove: " + x
+ " &~ " + y
+ " != " + z
);
1808 errln((UnicodeString
)"FAILED: remove: " + a
+ " &~ " + b
+ " != " + c
);
1810 checkCanonicalRep(z
, (UnicodeString
)"remove " + a
+ "," + b
);
1813 void UnicodeSetTest::_testXor(int32_t a
, int32_t b
, UnicodeSet
& x
, UnicodeSet
& y
, UnicodeSet
& z
) {
1818 int32_t c
= setToBits(z
);
1820 errln((UnicodeString
)"FAILED: complement: " + x
+ " ^ " + y
+ " != " + z
);
1821 errln((UnicodeString
)"FAILED: complement: " + a
+ " ^ " + b
+ " != " + c
);
1823 checkCanonicalRep(z
, (UnicodeString
)"complement " + a
+ "," + b
);
1827 * Check that ranges are monotonically increasing and non-
1830 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet
& set
, const UnicodeString
& msg
) {
1831 int32_t n
= set
.getRangeCount();
1833 errln((UnicodeString
)"FAIL result of " + msg
+
1834 ": range count should be >= 0 but is " +
1835 n
/*+ " for " + set.toPattern())*/);
1839 for (int32_t i
=0; i
<n
; ++i
) {
1840 UChar32 start
= set
.getRangeStart(i
);
1841 UChar32 end
= set
.getRangeEnd(i
);
1843 errln((UnicodeString
)"FAIL result of " + msg
+
1844 ": range " + (i
+1) +
1845 " start > end: " + (int)start
+ ", " + (int)end
+
1848 if (i
> 0 && start
<= last
) {
1849 errln((UnicodeString
)"FAIL result of " + msg
+
1850 ": range " + (i
+1) +
1851 " overlaps previous range: " + (int)start
+ ", " + (int)end
+
1859 * Convert a bitmask to a UnicodeSet.
1861 UnicodeSet
& UnicodeSetTest::bitsToSet(int32_t a
, UnicodeSet
& result
) {
1863 for (UChar32 i
= 0; i
< 32; ++i
) {
1864 if ((a
& (1<<i
)) != 0) {
1872 * Convert a UnicodeSet to a bitmask. Only the characters
1873 * U+0000 to U+0020 are represented in the bitmask.
1875 int32_t UnicodeSetTest::setToBits(const UnicodeSet
& x
) {
1877 for (int32_t i
= 0; i
< 32; ++i
) {
1878 if (x
.contains((UChar32
)i
)) {
1886 * Return the representation of an inversion list based UnicodeSet
1887 * as a pairs list. Ranges are listed in ascending Unicode order.
1888 * For example, the set [a-zA-M3] is represented as "33AMaz".
1890 UnicodeString
UnicodeSetTest::getPairs(const UnicodeSet
& set
) {
1891 UnicodeString pairs
;
1892 for (int32_t i
=0; i
<set
.getRangeCount(); ++i
) {
1893 UChar32 start
= set
.getRangeStart(i
);
1894 UChar32 end
= set
.getRangeEnd(i
);
1897 i
= set
.getRangeCount(); // Should be unnecessary
1899 pairs
.append((UChar
)start
).append((UChar
)end
);
1905 * Basic consistency check for a few items.
1906 * That the iterator works, and that we can create a pattern and
1907 * get the same thing back
1909 void UnicodeSetTest::checkRoundTrip(const UnicodeSet
& s
) {
1912 checkEqual(s
, t
, "copy ct");
1916 UnicodeSet
t(0xabcd, 0xdef0); // dummy contents should be overwritten
1918 checkEqual(s
, t
, "operator=");
1923 copyWithIterator(t
, s
, FALSE
);
1924 checkEqual(s
, t
, "iterator roundtrip");
1929 copyWithIterator(t
, s
, TRUE
); // try range
1930 checkEqual(s
, t
, "iterator roundtrip");
1936 UErrorCode ec
= U_ZERO_ERROR
;
1937 s
.toPattern(pat
, FALSE
);
1938 t
.applyPattern(pat
, ec
);
1939 if (U_FAILURE(ec
)) {
1940 errln("FAIL: toPattern(escapeUnprintable=FALSE), applyPattern - %s", u_errorName(ec
));
1943 checkEqual(s
, t
, "toPattern(false)");
1950 UErrorCode ec
= U_ZERO_ERROR
;
1951 s
.toPattern(pat
, TRUE
);
1952 t
.applyPattern(pat
, ec
);
1953 if (U_FAILURE(ec
)) {
1954 errln("FAIL: toPattern(escapeUnprintable=TRUE), applyPattern - %s", u_errorName(ec
));
1957 checkEqual(s
, t
, "toPattern(true)");
1962 void UnicodeSetTest::checkSerializeRoundTrip(const UnicodeSet
& t
, UErrorCode
&status
) {
1963 if(U_FAILURE(status
)) return;
1964 int32_t len
= t
.serialize(serializeBuffer
.getAlias(), serializeBuffer
.getCapacity(), status
);
1965 if(status
== U_BUFFER_OVERFLOW_ERROR
) {
1966 status
= U_ZERO_ERROR
;
1967 serializeBuffer
.resize(len
);
1968 len
= t
.serialize(serializeBuffer
.getAlias(), serializeBuffer
.getCapacity(), status
);
1969 // let 2nd error stand
1971 if(U_FAILURE(status
)) {
1972 errln("checkSerializeRoundTrip: error %s serializing buffer\n", u_errorName(status
));
1975 UnicodeSet
deserialized(serializeBuffer
.getAlias(), len
, UnicodeSet::kSerialized
, status
);
1976 if(U_FAILURE(status
)) {
1977 errln("checkSerializeRoundTrip: error %s deserializing buffer: buf %p len %d, original %d\n", u_errorName(status
), serializeBuffer
.getAlias(), len
, t
.getRangeCount());
1981 checkEqual(t
, deserialized
, "Set was unequal when deserialized");
1984 void UnicodeSetTest::copyWithIterator(UnicodeSet
& t
, const UnicodeSet
& s
, UBool withRange
) {
1986 UnicodeSetIterator
it(s
);
1988 while (it
.nextRange()) {
1989 if (it
.isString()) {
1990 t
.add(it
.getString());
1992 t
.add(it
.getCodepoint(), it
.getCodepointEnd());
1997 if (it
.isString()) {
1998 t
.add(it
.getString());
2000 t
.add(it
.getCodepoint());
2006 UBool
UnicodeSetTest::checkEqual(const UnicodeSet
& s
, const UnicodeSet
& t
, const char* message
) {
2007 assertEquals(UnicodeString("RangeCount: ","") + message
, s
.getRangeCount(), t
.getRangeCount());
2008 assertEquals(UnicodeString("size: ","") + message
, s
.size(), t
.size());
2009 UnicodeString source
; s
.toPattern(source
, TRUE
);
2010 UnicodeString result
; t
.toPattern(result
, TRUE
);
2012 errln((UnicodeString
)"FAIL: " + message
2013 + "; source = " + source
2014 + "; result = " + result
2018 logln((UnicodeString
)"Ok: " + message
2019 + "; source = " + source
2020 + "; result = " + result
2027 UnicodeSetTest::expectContainment(const UnicodeString
& pat
,
2028 const UnicodeString
& charsIn
,
2029 const UnicodeString
& charsOut
) {
2030 UErrorCode ec
= U_ZERO_ERROR
;
2031 UnicodeSet
set(pat
, ec
);
2032 if (U_FAILURE(ec
)) {
2033 dataerrln((UnicodeString
)"FAIL: pattern \"" +
2034 pat
+ "\" => " + u_errorName(ec
));
2037 expectContainment(set
, pat
, charsIn
, charsOut
);
2041 UnicodeSetTest::expectContainment(const UnicodeSet
& set
,
2042 const UnicodeString
& charsIn
,
2043 const UnicodeString
& charsOut
) {
2046 expectContainment(set
, pat
, charsIn
, charsOut
);
2050 UnicodeSetTest::expectContainment(const UnicodeSet
& set
,
2051 const UnicodeString
& setName
,
2052 const UnicodeString
& charsIn
,
2053 const UnicodeString
& charsOut
) {
2058 for (i
=0; i
<charsIn
.length(); i
+=U16_LENGTH(c
)) {
2059 c
= charsIn
.char32At(i
);
2060 if (!set
.contains(c
)) {
2064 if (bad
.length() > 0) {
2065 errln((UnicodeString
)"Fail: set " + setName
+ " does not contain " + prettify(bad
) +
2066 ", expected containment of " + prettify(charsIn
));
2068 logln((UnicodeString
)"Ok: set " + setName
+ " contains " + prettify(charsIn
));
2072 for (i
=0; i
<charsOut
.length(); i
+=U16_LENGTH(c
)) {
2073 c
= charsOut
.char32At(i
);
2074 if (set
.contains(c
)) {
2078 if (bad
.length() > 0) {
2079 errln((UnicodeString
)"Fail: set " + setName
+ " contains " + prettify(bad
) +
2080 ", expected non-containment of " + prettify(charsOut
));
2082 logln((UnicodeString
)"Ok: set " + setName
+ " does not contain " + prettify(charsOut
));
2087 UnicodeSetTest::expectPattern(UnicodeSet
& set
,
2088 const UnicodeString
& pattern
,
2089 const UnicodeString
& expectedPairs
){
2090 UErrorCode status
= U_ZERO_ERROR
;
2091 set
.applyPattern(pattern
, status
);
2092 if (U_FAILURE(status
)) {
2093 errln(UnicodeString("FAIL: applyPattern(\"") + pattern
+
2097 if (getPairs(set
) != expectedPairs
) {
2098 errln(UnicodeString("FAIL: applyPattern(\"") + pattern
+
2100 escape(getPairs(set
)) + "\", expected \"" +
2101 escape(expectedPairs
) + "\"");
2103 logln(UnicodeString("Ok: applyPattern(\"") + pattern
+
2105 escape(getPairs(set
)) + "\"");
2108 // the result of calling set.toPattern(), which is the string representation of
2109 // this set(set), is passed to a UnicodeSet constructor, and tested that it
2110 // will produce another set that is equal to this one.
2111 UnicodeString temppattern
;
2112 set
.toPattern(temppattern
);
2113 UnicodeSet
*tempset
=new UnicodeSet(temppattern
, status
);
2114 if (U_FAILURE(status
)) {
2115 errln(UnicodeString("FAIL: applyPattern(\""+ pattern
+ "\").toPattern() => " + temppattern
+ " => invalid pattern"));
2118 if(*tempset
!= set
|| getPairs(*tempset
) != getPairs(set
)){
2119 errln(UnicodeString("FAIL: applyPattern(\""+ pattern
+ "\").toPattern() => " + temppattern
+ " => pairs \""+ escape(getPairs(*tempset
)) + "\", expected pairs \"" +
2120 escape(getPairs(set
)) + "\""));
2122 logln(UnicodeString("Ok: applyPattern(\""+ pattern
+ "\").toPattern() => " + temppattern
+ " => pairs \"" + escape(getPairs(*tempset
)) + "\""));
2130 UnicodeSetTest::expectPairs(const UnicodeSet
& set
, const UnicodeString
& expectedPairs
) {
2131 if (getPairs(set
) != expectedPairs
) {
2132 errln(UnicodeString("FAIL: Expected pair list \"") +
2133 escape(expectedPairs
) + "\", got \"" +
2134 escape(getPairs(set
)) + "\"");
2138 void UnicodeSetTest::expectToPattern(const UnicodeSet
& set
,
2139 const UnicodeString
& expPat
,
2140 const char** expStrings
) {
2142 set
.toPattern(pat
, TRUE
);
2143 if (pat
== expPat
) {
2144 logln((UnicodeString
)"Ok: toPattern() => \"" + pat
+ "\"");
2146 errln((UnicodeString
)"FAIL: toPattern() => \"" + pat
+ "\", expected \"" + expPat
+ "\"");
2149 if (expStrings
== NULL
) {
2153 for (int32_t i
=0; expStrings
[i
] != NULL
; ++i
) {
2154 if (expStrings
[i
] == NOT
) { // sic; pointer comparison
2158 UnicodeString s
= CharsToUnicodeString(expStrings
[i
]);
2159 UBool contained
= set
.contains(s
);
2160 if (contained
== in
) {
2161 logln((UnicodeString
)"Ok: " + expPat
+
2162 (contained
? " contains {" : " does not contain {") +
2163 escape(expStrings
[i
]) + "}");
2165 errln((UnicodeString
)"FAIL: " + expPat
+
2166 (contained
? " contains {" : " does not contain {") +
2167 escape(expStrings
[i
]) + "}");
2172 static UChar
toHexString(int32_t i
) { return (UChar
)(i
+ (i
< 10 ? 0x30 : (0x41 - 10))); }
2175 UnicodeSetTest::doAssert(UBool condition
, const char *message
)
2178 errln(UnicodeString("ERROR : ") + message
);
2183 UnicodeSetTest::escape(const UnicodeString
& s
) {
2185 for (int32_t i
=0; i
<s
.length(); )
2187 UChar32 c
= s
.char32At(i
);
2188 if (0x0020 <= c
&& c
<= 0x007F) {
2192 buf
+= (UChar
)0x5c; buf
+= (UChar
)0x75;
2194 buf
+= (UChar
)0x5c; buf
+= (UChar
)0x55;
2195 buf
+= toHexString((c
& 0xF0000000) >> 28);
2196 buf
+= toHexString((c
& 0x0F000000) >> 24);
2197 buf
+= toHexString((c
& 0x00F00000) >> 20);
2198 buf
+= toHexString((c
& 0x000F0000) >> 16);
2200 buf
+= toHexString((c
& 0xF000) >> 12);
2201 buf
+= toHexString((c
& 0x0F00) >> 8);
2202 buf
+= toHexString((c
& 0x00F0) >> 4);
2203 buf
+= toHexString(c
& 0x000F);
// Tests the frozen state of UnicodeSet: how it propagates through copying
// and cloning, and that mutating calls leave a frozen set equal to idSet.
// NOTE(review): many lines of this function (including the call that
// freezes `frozen` and parts of the chained mutator calls) are not visible
// in this excerpt.
2210 void UnicodeSetTest::TestFreezable() {
2211 UErrorCode errorCode
=U_ZERO_ERROR
;
2212 UnicodeString idPattern
=UNICODE_STRING("[:ID_Continue:]", 15);
2213 UnicodeSet
idSet(idPattern
, errorCode
);
2214 if(U_FAILURE(errorCode
)) {
2215 dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode
));
2219 UnicodeString wsPattern
=UNICODE_STRING("[:White_Space:]", 15);
2220 UnicodeSet
wsSet(wsPattern
, errorCode
);
2221 if(U_FAILURE(errorCode
)) {
2222 dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode
));
// The pattern text is added as a string element, so idSet holds a string in
// addition to its code points.
2226 idSet
.add(idPattern
);
2227 UnicodeSet
frozen(idSet
);
// Expected state: idSet not frozen, frozen is frozen, and the two are equal.
2230 if(idSet
.isFrozen() || !frozen
.isFrozen()) {
2231 errln("FAIL: isFrozen() is wrong");
2233 if(frozen
!=idSet
|| !(frozen
==idSet
)) {
2234 errln("FAIL: a copy-constructed frozen set differs from its original");
2238 if(frozen
!=idSet
|| !(frozen
==idSet
)) {
2239 errln("FAIL: a frozen set was modified by operator=");
// A copy of a frozen set is expected to be equal to it and frozen too.
2242 UnicodeSet
frozen2(frozen
);
2243 if(frozen2
!=frozen
|| frozen2
!=idSet
) {
2244 errln("FAIL: a copied frozen set differs from its frozen original");
2246 if(!frozen2
.isFrozen()) {
2247 errln("FAIL: copy-constructing a frozen set results in a thawed one");
2249 UnicodeSet
frozen3(5, 55); // Set to some values to really test assignment below, not copy construction.
2250 if(frozen3
.contains(0, 4) || !frozen3
.contains(5, 55) || frozen3
.contains(56, 0x10ffff)) {
2251 errln("FAIL: UnicodeSet(5, 55) failed");
2254 if(!frozen3
.isFrozen()) {
2255 errln("FAIL: copying a frozen set results in a thawed one");
// clone() is expected to yield a frozen, equal set.
2258 UnicodeSet
*cloned
=frozen
.clone();
2259 if(!cloned
->isFrozen() || *cloned
!=frozen
|| cloned
->containsSome(0xd802, 0xd805)) {
2260 errln("FAIL: clone() failed");
// NOTE(review): the error below fires when the add() DID take effect on the
// clone, which the check above requires to be frozen; the message text
// "unable to modify clone" reads inverted -- confirm the intended wording.
2262 cloned
->add(0xd802, 0xd805);
2263 if(cloned
->containsSome(0xd802, 0xd805)) {
2264 errln("FAIL: unable to modify clone");
// cloneAsThawed() is expected to yield an equal set that is not frozen and
// that can be modified.
2268 UnicodeSet
*thawed
=frozen
.cloneAsThawed();
2269 if(thawed
->isFrozen() || *thawed
!=frozen
|| thawed
->containsSome(0xd802, 0xd805)) {
2270 errln("FAIL: cloneAsThawed() failed");
2272 thawed
->add(0xd802, 0xd805);
2273 if(!thawed
->contains(0xd802, 0xd805)) {
2274 errln("FAIL: unable to modify thawed clone");
// From here on, each group of mutating calls is followed by a check that
// frozen still equals idSet.
2279 if(frozen
!=idSet
|| !(frozen
==idSet
)) {
2280 errln("FAIL: UnicodeSet::set() modified a frozen set");
2284 if(frozen
!=idSet
|| !(frozen
==idSet
)) {
2285 errln("FAIL: UnicodeSet::clear() modified a frozen set");
2288 frozen
.closeOver(USET_CASE_INSENSITIVE
);
2289 if(frozen
!=idSet
|| !(frozen
==idSet
)) {
2290 errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
2294 if(frozen
!=idSet
|| !(frozen
==idSet
)) {
2295 errln("FAIL: UnicodeSet::compact() modified a frozen set");
2300 applyPattern(wsPattern
, errorCode
).
2301 applyPattern(wsPattern
, USET_IGNORE_SPACE
, NULL
, errorCode
).
2302 applyPattern(wsPattern
, pos
, USET_IGNORE_SPACE
, NULL
, errorCode
).
2303 applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS
, 230, errorCode
).
2304 applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode
);
2305 if(frozen
!=idSet
|| !(frozen
==idSet
)) {
2306 errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
2311 add(0xd802, 0xd805).
2315 if(frozen
!=idSet
|| !(frozen
==idSet
)) {
2316 errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
2322 retainAll(wsPattern
).
2324 if(frozen
!=idSet
|| !(frozen
==idSet
)) {
2325 errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
2332 removeAll(idPattern
).
2334 if(frozen
!=idSet
|| !(frozen
==idSet
)) {
2335 errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
2341 complement(0x64, 0x69).
2342 complement(idPattern
).
2343 complementAll(idPattern
).
2344 complementAll(idSet
);
2345 if(frozen
!=idSet
|| !(frozen
==idSet
)) {
2346 errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
2350 // Test span() etc. -------------------------------------------------------- ***
2352 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2354 appendUTF8(const UChar
*s
, int32_t length
, char *t
, int32_t capacity
) {
2355 UErrorCode errorCode
=U_ZERO_ERROR
;
2357 u_strToUTF8(t
, capacity
, &length8
, s
, length
, &errorCode
);
2358 if(U_SUCCESS(errorCode
)) {
2361 // The string contains an unpaired surrogate.
2362 // Ignore this string.
2367 class UnicodeSetWithStringsIterator
;
2369 // Make the strings in a UnicodeSet easily accessible.
2370 class UnicodeSetWithStrings
{
2372 UnicodeSetWithStrings(const UnicodeSet
&normalSet
) :
2373 set(normalSet
), stringsLength(0), hasSurrogates(FALSE
) {
2374 int32_t size
=set
.size();
2375 if(size
>0 && set
.charAt(size
-1)<0) {
2376 // If a set's last element is not a code point, then it must contain strings.
2377 // Iterate over the set, skip all code point ranges, and cache the strings.
2378 // Convert them to UTF-8 for spanUTF8().
2379 UnicodeSetIterator
iter(set
);
2380 const UnicodeString
*s
;
2382 int32_t length8
, utf8Count
=0;
2383 while(iter
.nextRange() && stringsLength
<UPRV_LENGTHOF(strings
)) {
2384 if(iter
.isString()) {
2385 // Store the pointer to the set's string element
2386 // which we happen to know is a stable pointer.
2387 strings
[stringsLength
]=s
=&iter
.getString();
2389 utf8Lengths
[stringsLength
]=length8
=
2390 appendUTF8(s
->getBuffer(), s
->length(),
2391 s8
, (int32_t)(sizeof(utf8
)-utf8Count
));
2393 hasSurrogates
=TRUE
; // Contains unpaired surrogates.
2402 const UnicodeSet
&getSet() const {
2406 UBool
hasStrings() const {
2407 return (UBool
)(stringsLength
>0);
2410 UBool
hasStringsWithSurrogates() const {
2411 return hasSurrogates
;
2415 friend class UnicodeSetWithStringsIterator
;
2417 const UnicodeSet
&set
;
2419 const UnicodeString
*strings
[20];
2420 int32_t stringsLength
;
2421 UBool hasSurrogates
;
2424 int32_t utf8Lengths
[20];
2427 class UnicodeSetWithStringsIterator
{
2429 UnicodeSetWithStringsIterator(const UnicodeSetWithStrings
&set
) :
2430 fSet(set
), nextStringIndex(0), nextUTF8Start(0) {
2434 nextStringIndex
=nextUTF8Start
=0;
2437 const UnicodeString
*nextString() {
2438 if(nextStringIndex
<fSet
.stringsLength
) {
2439 return fSet
.strings
[nextStringIndex
++];
2445 // Do not mix with calls to nextString().
2446 const char *nextUTF8(int32_t &length
) {
2447 if(nextStringIndex
<fSet
.stringsLength
) {
2448 const char *s8
=fSet
.utf8
+nextUTF8Start
;
2449 nextUTF8Start
+=length
=fSet
.utf8Lengths
[nextStringIndex
++];
2458 const UnicodeSetWithStrings
&fSet
;
2459 int32_t nextStringIndex
;
2460 int32_t nextUTF8Start
;
2463 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
2464 // at code point boundaries.
2465 // That is, each edge of a match must not be in the middle of a surrogate pair.
2467 matches16CPB(const UChar
*s
, int32_t start
, int32_t limit
, const UnicodeString
&t
) {
2470 int32_t length
=t
.length();
2471 return 0==t
.compare(s
, length
) &&
2472 !(0<start
&& U16_IS_LEAD(s
[-1]) && U16_IS_TRAIL(s
[0])) &&
2473 !(length
<limit
&& U16_IS_LEAD(s
[length
-1]) && U16_IS_TRAIL(s
[length
]));
2476 // Implement span() with contains() for comparison.
2477 static int32_t containsSpanUTF16(const UnicodeSetWithStrings
&set
, const UChar
*s
, int32_t length
,
2478 USetSpanCondition spanCondition
) {
2479 const UnicodeSet
&realSet(set
.getSet());
2480 if(!set
.hasStrings()) {
2481 if(spanCondition
!=USET_SPAN_NOT_CONTAINED
) {
2482 spanCondition
=USET_SPAN_CONTAINED
; // Pin to 0/1 values.
2486 int32_t start
=0, prev
;
2487 while((prev
=start
)<length
) {
2488 U16_NEXT(s
, start
, length
, c
);
2489 if(realSet
.contains(c
)!=spanCondition
) {
2494 } else if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
2495 UnicodeSetWithStringsIterator
iter(set
);
2497 int32_t start
, next
;
2498 for(start
=next
=0; start
<length
;) {
2499 U16_NEXT(s
, next
, length
, c
);
2500 if(realSet
.contains(c
)) {
2503 const UnicodeString
*str
;
2505 while((str
=iter
.nextString())!=NULL
) {
2506 if(str
->length()<=(length
-start
) && matches16CPB(s
, start
, length
, *str
)) {
2507 // spanNeedsStrings=TRUE;
2514 } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2515 UnicodeSetWithStringsIterator
iter(set
);
2517 int32_t start
, next
, maxSpanLimit
=0;
2518 for(start
=next
=0; start
<length
;) {
2519 U16_NEXT(s
, next
, length
, c
);
2520 if(!realSet
.contains(c
)) {
2521 next
=start
; // Do not span this single, not-contained code point.
2523 const UnicodeString
*str
;
2525 while((str
=iter
.nextString())!=NULL
) {
2526 if(str
->length()<=(length
-start
) && matches16CPB(s
, start
, length
, *str
)) {
2527 // spanNeedsStrings=TRUE;
2528 int32_t matchLimit
=start
+str
->length();
2529 if(matchLimit
==length
) {
2532 if(spanCondition
==USET_SPAN_CONTAINED
) {
2533 // Iterate for the shortest match at each position.
2534 // Recurse for each but the shortest match.
2536 next
=matchLimit
; // First match from start.
2538 if(matchLimit
<next
) {
2539 // Remember shortest match from start for iteration.
2544 // Recurse for non-shortest match from start.
2545 int32_t spanLength
=containsSpanUTF16(set
, s
+matchLimit
, length
-matchLimit
,
2546 USET_SPAN_CONTAINED
);
2547 if((matchLimit
+spanLength
)>maxSpanLimit
) {
2548 maxSpanLimit
=matchLimit
+spanLength
;
2549 if(maxSpanLimit
==length
) {
2554 } else /* spanCondition==USET_SPAN_SIMPLE */ {
2555 if(matchLimit
>next
) {
2556 // Remember longest match from start.
2563 break; // No match from start.
2567 if(start
>maxSpanLimit
) {
2570 return maxSpanLimit
;
2575 static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings
&set
, const UChar
*s
, int32_t length
,
2576 USetSpanCondition spanCondition
) {
2580 const UnicodeSet
&realSet(set
.getSet());
2581 if(!set
.hasStrings()) {
2582 if(spanCondition
!=USET_SPAN_NOT_CONTAINED
) {
2583 spanCondition
=USET_SPAN_CONTAINED
; // Pin to 0/1 values.
2587 int32_t prev
=length
;
2589 U16_PREV(s
, 0, length
, c
);
2590 if(realSet
.contains(c
)!=spanCondition
) {
2593 } while((prev
=length
)>0);
2595 } else if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
2596 UnicodeSetWithStringsIterator
iter(set
);
2598 int32_t prev
=length
, length0
=length
;
2600 U16_PREV(s
, 0, length
, c
);
2601 if(realSet
.contains(c
)) {
2604 const UnicodeString
*str
;
2606 while((str
=iter
.nextString())!=NULL
) {
2607 if(str
->length()<=prev
&& matches16CPB(s
, prev
-str
->length(), length0
, *str
)) {
2608 // spanNeedsStrings=TRUE;
2612 } while((prev
=length
)>0);
2614 } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2615 UnicodeSetWithStringsIterator
iter(set
);
2617 int32_t prev
=length
, minSpanStart
=length
, length0
=length
;
2619 U16_PREV(s
, 0, length
, c
);
2620 if(!realSet
.contains(c
)) {
2621 length
=prev
; // Do not span this single, not-contained code point.
2623 const UnicodeString
*str
;
2625 while((str
=iter
.nextString())!=NULL
) {
2626 if(str
->length()<=prev
&& matches16CPB(s
, prev
-str
->length(), length0
, *str
)) {
2627 // spanNeedsStrings=TRUE;
2628 int32_t matchStart
=prev
-str
->length();
2632 if(spanCondition
==USET_SPAN_CONTAINED
) {
2633 // Iterate for the shortest match at each position.
2634 // Recurse for each but the shortest match.
2636 length
=matchStart
; // First match from prev.
2638 if(matchStart
>length
) {
2639 // Remember shortest match from prev for iteration.
2640 int32_t temp
=length
;
2644 // Recurse for non-shortest match from prev.
2645 int32_t spanStart
=containsSpanBackUTF16(set
, s
, matchStart
,
2646 USET_SPAN_CONTAINED
);
2647 if(spanStart
<minSpanStart
) {
2648 minSpanStart
=spanStart
;
2649 if(minSpanStart
==0) {
2654 } else /* spanCondition==USET_SPAN_SIMPLE */ {
2655 if(matchStart
<length
) {
2656 // Remember longest match from prev.
2663 break; // No match from prev.
2665 } while((prev
=length
)>0);
2666 if(prev
<minSpanStart
) {
2669 return minSpanStart
;
2674 static int32_t containsSpanUTF8(const UnicodeSetWithStrings
&set
, const char *s
, int32_t length
,
2675 USetSpanCondition spanCondition
) {
2676 const UnicodeSet
&realSet(set
.getSet());
2677 if(!set
.hasStrings()) {
2678 if(spanCondition
!=USET_SPAN_NOT_CONTAINED
) {
2679 spanCondition
=USET_SPAN_CONTAINED
; // Pin to 0/1 values.
2683 int32_t start
=0, prev
;
2684 while((prev
=start
)<length
) {
2685 U8_NEXT_OR_FFFD(s
, start
, length
, c
);
2686 if(realSet
.contains(c
)!=spanCondition
) {
2691 } else if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
2692 UnicodeSetWithStringsIterator
iter(set
);
2694 int32_t start
, next
;
2695 for(start
=next
=0; start
<length
;) {
2696 U8_NEXT_OR_FFFD(s
, next
, length
, c
);
2697 if(realSet
.contains(c
)) {
2703 while((s8
=iter
.nextUTF8(length8
))!=NULL
) {
2704 if(length8
!=0 && length8
<=(length
-start
) && 0==memcmp(s
+start
, s8
, length8
)) {
2705 // spanNeedsStrings=TRUE;
2712 } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2713 UnicodeSetWithStringsIterator
iter(set
);
2715 int32_t start
, next
, maxSpanLimit
=0;
2716 for(start
=next
=0; start
<length
;) {
2717 U8_NEXT_OR_FFFD(s
, next
, length
, c
);
2718 if(!realSet
.contains(c
)) {
2719 next
=start
; // Do not span this single, not-contained code point.
2724 while((s8
=iter
.nextUTF8(length8
))!=NULL
) {
2725 if(length8
!=0 && length8
<=(length
-start
) && 0==memcmp(s
+start
, s8
, length8
)) {
2726 // spanNeedsStrings=TRUE;
2727 int32_t matchLimit
=start
+length8
;
2728 if(matchLimit
==length
) {
2731 if(spanCondition
==USET_SPAN_CONTAINED
) {
2732 // Iterate for the shortest match at each position.
2733 // Recurse for each but the shortest match.
2735 next
=matchLimit
; // First match from start.
2737 if(matchLimit
<next
) {
2738 // Remember shortest match from start for iteration.
2743 // Recurse for non-shortest match from start.
2744 int32_t spanLength
=containsSpanUTF8(set
, s
+matchLimit
, length
-matchLimit
,
2745 USET_SPAN_CONTAINED
);
2746 if((matchLimit
+spanLength
)>maxSpanLimit
) {
2747 maxSpanLimit
=matchLimit
+spanLength
;
2748 if(maxSpanLimit
==length
) {
2753 } else /* spanCondition==USET_SPAN_SIMPLE */ {
2754 if(matchLimit
>next
) {
2755 // Remember longest match from start.
2762 break; // No match from start.
2766 if(start
>maxSpanLimit
) {
2769 return maxSpanLimit
;
2774 static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings
&set
, const char *s
, int32_t length
,
2775 USetSpanCondition spanCondition
) {
2779 const UnicodeSet
&realSet(set
.getSet());
2780 if(!set
.hasStrings()) {
2781 if(spanCondition
!=USET_SPAN_NOT_CONTAINED
) {
2782 spanCondition
=USET_SPAN_CONTAINED
; // Pin to 0/1 values.
2786 int32_t prev
=length
;
2788 U8_PREV_OR_FFFD(s
, 0, length
, c
);
2789 if(realSet
.contains(c
)!=spanCondition
) {
2792 } while((prev
=length
)>0);
2794 } else if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
2795 UnicodeSetWithStringsIterator
iter(set
);
2797 int32_t prev
=length
;
2799 U8_PREV_OR_FFFD(s
, 0, length
, c
);
2800 if(realSet
.contains(c
)) {
2806 while((s8
=iter
.nextUTF8(length8
))!=NULL
) {
2807 if(length8
!=0 && length8
<=prev
&& 0==memcmp(s
+prev
-length8
, s8
, length8
)) {
2808 // spanNeedsStrings=TRUE;
2812 } while((prev
=length
)>0);
2814 } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2815 UnicodeSetWithStringsIterator
iter(set
);
2817 int32_t prev
=length
, minSpanStart
=length
;
2819 U8_PREV_OR_FFFD(s
, 0, length
, c
);
2820 if(!realSet
.contains(c
)) {
2821 length
=prev
; // Do not span this single, not-contained code point.
2826 while((s8
=iter
.nextUTF8(length8
))!=NULL
) {
2827 if(length8
!=0 && length8
<=prev
&& 0==memcmp(s
+prev
-length8
, s8
, length8
)) {
2828 // spanNeedsStrings=TRUE;
2829 int32_t matchStart
=prev
-length8
;
2833 if(spanCondition
==USET_SPAN_CONTAINED
) {
2834 // Iterate for the shortest match at each position.
2835 // Recurse for each but the shortest match.
2837 length
=matchStart
; // First match from prev.
2839 if(matchStart
>length
) {
2840 // Remember shortest match from prev for iteration.
2841 int32_t temp
=length
;
2845 // Recurse for non-shortest match from prev.
2846 int32_t spanStart
=containsSpanBackUTF8(set
, s
, matchStart
,
2847 USET_SPAN_CONTAINED
);
2848 if(spanStart
<minSpanStart
) {
2849 minSpanStart
=spanStart
;
2850 if(minSpanStart
==0) {
2855 } else /* spanCondition==USET_SPAN_SIMPLE */ {
2856 if(matchStart
<length
) {
2857 // Remember longest match from prev.
2864 break; // No match from prev.
2866 } while((prev
=length
)>0);
2867 if(prev
<minSpanStart
) {
2870 return minSpanStart
;
2875 // spans to be performed and compared
2889 SPAN_CONTAINED
=0x100,
2891 SPAN_CONDITION
=0x300,
2896 static inline USetSpanCondition
invertSpanCondition(USetSpanCondition spanCondition
, USetSpanCondition contained
) {
2897 return spanCondition
== USET_SPAN_NOT_CONTAINED
? contained
: USET_SPAN_NOT_CONTAINED
;
2900 static inline int32_t slen(const void *s
, UBool isUTF16
) {
2901 return isUTF16
? u_strlen((const UChar
*)s
) : static_cast<int32_t>(strlen((const char *)s
));
2905 * Count spans on a string with the method according to type and set the span limits.
2906 * The set may be the complement of the original.
2907 * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
2908 * according to the expected number of spans.
2909 * Sets typeName to an empty string if there is no such type.
2910 * Returns -1 if the span option is filtered out.
2912 static int32_t getSpans(const UnicodeSetWithStrings
&set
, UBool isComplement
,
2913 const void *s
, int32_t length
, UBool isUTF16
,
2914 uint32_t whichSpans
,
2915 int type
, const char *&typeName
,
2916 int32_t limits
[], int32_t limitsCapacity
,
2917 int32_t expectCount
) {
2918 const UnicodeSet
&realSet(set
.getSet());
2919 int32_t start
, count
;
2920 USetSpanCondition spanCondition
, firstSpanCondition
, contained
;
2923 if(type
<0 || 7<type
) {
2928 static const char *const typeNames16
[]={
2929 "contains", "contains(LM)",
2931 "containsBack", "containsBack(LM)",
2932 "spanBack", "spanBack(LM)"
2935 static const char *const typeNames8
[]={
2936 "containsUTF8", "containsUTF8(LM)",
2937 "spanUTF8", "spanUTF8(LM)",
2938 "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
2939 "spanBackUTF8", "spanBackUTF8(LM)"
2942 typeName
= isUTF16
? typeNames16
[type
] : typeNames8
[type
];
2944 // filter span options
2947 if((whichSpans
&SPAN_FWD
)==0) {
2953 if((whichSpans
&SPAN_BACK
)==0) {
2959 // use USET_SPAN_CONTAINED
2960 if((whichSpans
&SPAN_CONTAINED
)==0) {
2963 contained
=USET_SPAN_CONTAINED
;
2965 // use USET_SPAN_SIMPLE
2966 if((whichSpans
&SPAN_SIMPLE
)==0) {
2969 contained
=USET_SPAN_SIMPLE
;
2972 // Default first span condition for going forward with an uncomplemented set.
2973 spanCondition
=USET_SPAN_NOT_CONTAINED
;
2975 spanCondition
=invertSpanCondition(spanCondition
, contained
);
2978 // First span condition for span(), used to terminate the spanBack() iteration.
2979 firstSpanCondition
=spanCondition
;
2981 // spanBack(): Its initial span condition is span()'s last span condition,
2982 // which is the opposite of span()'s first span condition
2983 // if we expect an even number of spans.
2984 // (The loop inverts spanCondition (expectCount-1) times
2985 // before the expectCount'th span() call.)
2986 // If we do not compare forward and backward directions, then we do not have an
2987 // expectCount and just start with firstSpanCondition.
2988 if(!isForward
&& (whichSpans
&SPAN_FWD
)!=0 && (expectCount
&1)==0) {
2989 spanCondition
=invertSpanCondition(spanCondition
, contained
);
2998 length
=slen(s
, isUTF16
);
3001 start
+= isUTF16
? containsSpanUTF16(set
, (const UChar
*)s
+start
, length
-start
, spanCondition
) :
3002 containsSpanUTF8(set
, (const char *)s
+start
, length
-start
, spanCondition
);
3003 if(count
<limitsCapacity
) {
3004 limits
[count
]=start
;
3010 spanCondition
=invertSpanCondition(spanCondition
, contained
);
3017 start
+= isUTF16
? realSet
.span((const UChar
*)s
+start
, length
>=0 ? length
-start
: length
, spanCondition
) :
3018 realSet
.spanUTF8((const char *)s
+start
, length
>=0 ? length
-start
: length
, spanCondition
);
3019 if(count
<limitsCapacity
) {
3020 limits
[count
]=start
;
3023 if(length
>=0 ? start
>=length
:
3024 isUTF16
? ((const UChar
*)s
)[start
]==0 :
3025 ((const char *)s
)[start
]==0
3029 spanCondition
=invertSpanCondition(spanCondition
, contained
);
3035 length
=slen(s
, isUTF16
);
3039 if(count
<=limitsCapacity
) {
3040 limits
[limitsCapacity
-count
]=length
;
3042 length
= isUTF16
? containsSpanBackUTF16(set
, (const UChar
*)s
, length
, spanCondition
) :
3043 containsSpanBackUTF8(set
, (const char *)s
, length
, spanCondition
);
3044 if(length
==0 && spanCondition
==firstSpanCondition
) {
3047 spanCondition
=invertSpanCondition(spanCondition
, contained
);
3049 if(count
<limitsCapacity
) {
3050 memmove(limits
, limits
+(limitsCapacity
-count
), count
*4);
3057 if(count
<=limitsCapacity
) {
3058 limits
[limitsCapacity
-count
]= length
>=0 ? length
: slen(s
, isUTF16
);
3060 // Note: Length<0 is tested only for the first spanBack().
3061 // If we wanted to keep length<0 for all spanBack()s, we would have to
3062 // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
3063 length
= isUTF16
? realSet
.spanBack((const UChar
*)s
, length
, spanCondition
) :
3064 realSet
.spanBackUTF8((const char *)s
, length
, spanCondition
);
3065 if(length
==0 && spanCondition
==firstSpanCondition
) {
3068 spanCondition
=invertSpanCondition(spanCondition
, contained
);
3070 if(count
<limitsCapacity
) {
3071 memmove(limits
, limits
+(limitsCapacity
-count
), count
*4);
3082 // sets to be tested; odd index=isComplement
3091 static const char *const setNames
[SET_COUNT
]={
3099 * Verify that we get the same results whether we look at text with contains(),
3100 * span() or spanBack(), using unfrozen or frozen versions of the set,
3101 * and using the set or its complement (switching the spanConditions accordingly).
3102 * The latter verifies that
3103 * set.span(spanCondition) == set.complement().span(!spanCondition).
3105 * The expectLimits[] are either provided by the caller (with expectCount>=0)
3106 * or returned to the caller (with an input expectCount<0).
3108 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings
*sets
[4],
3109 const void *s
, int32_t length
, UBool isUTF16
,
3110 uint32_t whichSpans
,
3111 int32_t expectLimits
[], int32_t &expectCount
,
3112 const char *testName
, int32_t index
) {
3113 int32_t limits
[500];
3114 int32_t limitsCount
;
3117 const char *typeName
;
3120 for(i
=0; i
<SET_COUNT
; ++i
) {
3122 // Even-numbered sets are original, uncomplemented sets.
3123 if((whichSpans
&SPAN_SET
)==0) {
3127 // Odd-numbered sets are complemented.
3128 if((whichSpans
&SPAN_COMPLEMENT
)==0) {
3132 for(type
=0;; ++type
) {
3133 limitsCount
=getSpans(*sets
[i
], (UBool
)(i
&1),
3137 limits
, UPRV_LENGTHOF(limits
), expectCount
);
3138 if(typeName
[0]==0) {
3139 break; // All types tried.
3142 continue; // Span option filtered out.
3145 expectCount
=limitsCount
;
3146 if(limitsCount
>UPRV_LENGTHOF(limits
)) {
3147 errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3148 testName
, (long)index
, setNames
[i
], typeName
, (long)limitsCount
, (long)UPRV_LENGTHOF(limits
));
3151 memcpy(expectLimits
, limits
, limitsCount
*4);
3152 } else if(limitsCount
!=expectCount
) {
3153 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3154 testName
, (long)index
, setNames
[i
], typeName
, (long)limitsCount
, (long)expectCount
);
3156 for(j
=0; j
<limitsCount
; ++j
) {
3157 if(limits
[j
]!=expectLimits
[j
]) {
3158 errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3159 testName
, (long)index
, setNames
[i
], typeName
, (long)limitsCount
,
3160 j
, (long)limits
[j
], (long)expectLimits
[j
]);
3168 // Compare span() with containsAll()/containsNone(),
3169 // but only if we have expectLimits[] from the uncomplemented set.
3170 if(isUTF16
&& (whichSpans
&SPAN_SET
)!=0) {
3171 const UChar
*s16
=(const UChar
*)s
;
3172 UnicodeString string
;
3173 int32_t prev
=0, limit
, length
;
3174 for(i
=0; i
<expectCount
; ++i
) {
3175 limit
=expectLimits
[i
];
3178 string
.setTo(FALSE
, s16
+prev
, length
); // read-only alias
3180 if(!sets
[SLOW
]->getSet().containsAll(string
)) {
3181 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3182 testName
, (long)index
, setNames
[SLOW
], (long)prev
, (long)limit
);
3185 if(!sets
[FAST
]->getSet().containsAll(string
)) {
3186 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3187 testName
, (long)index
, setNames
[FAST
], (long)prev
, (long)limit
);
3191 if(!sets
[SLOW
]->getSet().containsNone(string
)) {
3192 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3193 testName
, (long)index
, setNames
[SLOW
], (long)prev
, (long)limit
);
3196 if(!sets
[FAST
]->getSet().containsNone(string
)) {
3197 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3198 testName
, (long)index
, setNames
[FAST
], (long)prev
, (long)limit
);
3208 // Specifically test either UTF-16 or UTF-8.
3209 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings
*sets
[4],
3210 const void *s
, int32_t length
, UBool isUTF16
,
3211 uint32_t whichSpans
,
3212 const char *testName
, int32_t index
) {
3213 int32_t expectLimits
[500];
3214 int32_t expectCount
=-1;
3215 testSpan(sets
, s
, length
, isUTF16
, whichSpans
, expectLimits
, expectCount
, testName
, index
);
3218 UBool
stringContainsUnpairedSurrogate(const UChar
*s
, int32_t length
) {
3225 if(0xd800<=c
&& c
<0xe000) {
3226 if(c
>=0xdc00 || length
==0 || !U16_IS_TRAIL(c2
=*s
++)) {
3233 while((c
=*s
++)!=0) {
3234 if(0xd800<=c
&& c
<0xe000) {
3235 if(c
>=0xdc00 || !U16_IS_TRAIL(c2
=*s
++)) {
3244 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3245 // unless either UTF is turned off in whichSpans.
3246 // Testing UTF-16 and UTF-8 together requires that surrogate code points
3247 // have the same contains(c) value as U+FFFD.
3248 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings
*sets
[4],
3249 const UChar
*s16
, int32_t length16
,
3250 uint32_t whichSpans
,
3251 const char *testName
, int32_t index
) {
3252 int32_t expectLimits
[500];
3253 int32_t expectCount
;
3255 expectCount
=-1; // Get expectLimits[] from testSpan().
3257 if((whichSpans
&SPAN_UTF16
)!=0) {
3258 testSpan(sets
, s16
, length16
, TRUE
, whichSpans
, expectLimits
, expectCount
, testName
, index
);
3260 if((whichSpans
&SPAN_UTF8
)==0) {
3264 // Convert s16[] and expectLimits[] to UTF-8.
3266 int32_t offsets
[3000];
3268 const UChar
*s16Limit
=s16
+length16
;
3270 char *tLimit
=t
+sizeof(s8
);
3272 UErrorCode errorCode
=U_ZERO_ERROR
;
3274 // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3275 ucnv_fromUnicode(openUTF8Converter(), &t
, tLimit
, &s16
, s16Limit
, o
, TRUE
, &errorCode
);
3276 if(U_FAILURE(errorCode
)) {
3277 errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3278 testName
, (long)index
, u_errorName(errorCode
));
3279 ucnv_resetFromUnicode(utf8Cnv
);
3282 int32_t length8
=(int32_t)(t
-(char *)s8
);
3284 // Convert expectLimits[].
3285 int32_t i
, j
, expect
;
3286 for(i
=j
=0; i
<expectCount
; ++i
) {
3287 expect
=expectLimits
[i
];
3288 if(expect
==length16
) {
3289 expectLimits
[i
]=length8
;
3291 while(offsets
[j
]<expect
) {
3298 testSpan(sets
, s8
, length8
, FALSE
, whichSpans
, expectLimits
, expectCount
, testName
, index
);
3301 static UChar32
nextCodePoint(UChar32 c
) {
3302 // Skip some large and boring ranges.
3323 // Verify that all implementations represent the same set.
3324 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings
*sets
[4], uint32_t whichSpans
, const char *testName
) {
3325 // contains(U+FFFD) is inconsistent with contains(some surrogates),
3326 // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3327 // Skip the UTF-8 part of the test - if the string contains surrogates -
3328 // because it is likely to produce a different result.
3329 UBool inconsistentSurrogates
=
3330 (!(sets
[0]->getSet().contains(0xfffd) ?
3331 sets
[0]->getSet().contains(0xd800, 0xdfff) :
3332 sets
[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3333 sets
[0]->hasStringsWithSurrogates());
3337 uint32_t localWhichSpans
;
3340 for(first
=c
=0;; c
=nextCodePoint(c
)) {
3341 if(c
>0x10ffff || length
>(UPRV_LENGTHOF(s
)-U16_MAX_LENGTH
)) {
3342 localWhichSpans
=whichSpans
;
3343 if(stringContainsUnpairedSurrogate(s
, length
) && inconsistentSurrogates
) {
3344 localWhichSpans
&=~SPAN_UTF8
;
3346 testSpanBothUTFs(sets
, s
, length
, localWhichSpans
, testName
, first
);
3353 U16_APPEND_UNSAFE(s
, length
, c
);
3357 // Test with a particular, interesting string.
3358 // Specify length and try NUL-termination.
3359 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings
*sets
[4], uint32_t whichSpans
, const char *testName
) {
3360 static const UChar s
[]={
3361 0x61, 0x62, 0x20, // Latin, space
3362 0x3b1, 0x3b2, 0x3b3, // Greek
3363 0xd900, // lead surrogate
3364 0x3000, 0x30ab, 0x30ad, // wide space, Katakana
3365 0xdc05, // trail surrogate
3366 0xa0, 0xac00, 0xd7a3, // nbsp, Hangul
3367 0xd900, 0xdc05, // unassigned supplementary
3368 0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary
3369 0xd7a4, 0xdc05, 0xd900, 0x2028, // unassigned, surrogates in wrong order, LS
3373 if((whichSpans
&SPAN_UTF16
)==0) {
3376 testSpan(sets
, s
, -1, TRUE
, (whichSpans
&~SPAN_UTF8
), testName
, 0);
3377 testSpan(sets
, s
, UPRV_LENGTHOF(s
)-1, TRUE
, (whichSpans
&~SPAN_UTF8
), testName
, 1);
3380 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings
*sets
[4], uint32_t whichSpans
, const char *testName
) {
3381 static const char s
[]={
3384 /* trail byte in lead position */
3389 /* truncated multi-byte sequences */
3401 "\xCE\xB1\xCE\xB2\xCE\xB3" // Greek
3403 /* trail byte in lead position */
3420 "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD" // wide space, Katakana
3422 /* trail byte in lead position */
3433 "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3" // nbsp, Hangul
3435 /* trail byte in lead position */
3441 "\xF1\x90\x80\x85" // unassigned supplementary
3443 /* trail byte in lead position */
3446 "\xfc\x80\x80\x80\x80"
3448 "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE" // Han supplementary
3450 /* trail byte in lead position */
3453 /* complete sequences but non-shortest forms or out of range etc. */
3459 "\xf8\x80\x80\x80\x80"
3460 "\xfc\x80\x80\x80\x80\x80"
3464 /* trail byte in lead position */
3467 "\xED\x9E\xA4\xE2\x80\xA8" // unassigned, LS, NUL-terminated
3470 if((whichSpans
&SPAN_UTF8
)==0) {
3473 testSpan(sets
, s
, -1, FALSE
, (whichSpans
&~SPAN_UTF16
), testName
, 0);
3474 testSpan(sets
, s
, UPRV_LENGTHOF(s
)-1, FALSE
, (whichSpans
&~SPAN_UTF16
), testName
, 1);
3477 // Take a set of span options and multiply them so that
3478 // each portion only has one of the options a, b and c.
3479 // If b==0, then the set of options is just modified with mask and a.
3480 // If b!=0 and c==0, then the set of options is just modified with mask, a and b.
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    // How many copies ("portions") of the option list result:
    // one for a, a second one only if b!=0, a third one only if b!=0 and c!=0.
    // The caller must provide room for that many times whichSpansCount entries.
    const int32_t portions = (b == 0) ? 1 : (c == 0) ? 2 : 3;

    for (int32_t idx = 0; idx < whichSpansCount; ++idx) {
        // Strip the bits outside mask, then give each portion exactly one alternative.
        const uint32_t kept = whichSpans[idx] & mask;
        whichSpans[idx] = kept | a;
        if (portions > 1) {
            whichSpans[whichSpansCount + idx] = kept | b;
        }
        if (portions > 2) {
            whichSpans[2 * whichSpansCount + idx] = kept | c;
        }
    }
    return portions * whichSpansCount;
}
3500 #define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
3501 #define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
3502 #define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3503 #define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3505 void UnicodeSetTest::TestSpan() {
3506 // "[...]" is a UnicodeSet pattern.
3507 // "*" performs tests on all Unicode code points and on a selection of
3508 // malformed UTF-8/16 strings.
3509 // "-options" limits the scope of testing for the current set.
3510 // By default, the test verifies that equivalent boundaries are found
3511 // for UTF-16 and UTF-8, going forward and backward,
3512 // alternating USET_SPAN_NOT_CONTAINED with
3513 // either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3514 // Single-character options:
3515 // 8 -- UTF-16 and UTF-8 boundaries may differ.
3516 // Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3517 // or the set contains strings with unpaired surrogates
3518 // which do not translate to valid UTF-8.
3519 // c -- set.span() and set.complement().span() boundaries may differ.
3520 // Cause: Set strings are not complemented.
3521 // b -- span() and spanBack() boundaries may differ.
3522 // Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3523 // and spanBack(USET_SPAN_SIMPLE) are defined to
3524 // match with non-overlapping substrings.
3525 // For example, with a set containing "ab" and "ba",
3526 // span() of "aba" yields boundaries { 0, 2, 3 }
3527 // because the initial "ab" matches from 0 to 2,
3528 // while spanBack() yields boundaries { 0, 1, 3 }
3529 // because the final "ba" matches from 1 to 3.
3530 // l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3531 // Cause: Strings in the set overlap, and a longer match may
3532 // require a sequence including non-longest substrings.
3533 // For example, with a set containing "ab", "abc" and "cd",
3534 // span(contained) of "abcd" spans the entire string
3535 // but span(longest match) only spans the first 3 characters.
3536 // Each "-options" first resets all options and then applies the specified options.
3537 // A "-" without options resets the options.
3538 // The options are also reset for each new set.
3539 // Other strings will be spanned.
3540 static const char *const testdata
[]={
3547 "[\\u0000-\\U0010FFFF]",
3549 "[\\u0000\\u0080\\u0800\\U00010000]",
3551 "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3553 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3556 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3560 // Overlapping strings cause overlapping attempts to match.
3561 "[x{xy}{xya}{axy}{ax}]",
3564 // More repetitions of "xya" would take too long with the recursive
3565 // reference implementation.
3566 // containsAll()=FALSE
3569 "xyaxyaxyaxya" // set.complement().span(longest match) will stop here.
3570 "xx" // set.complement().span(contained) will stop between the two 'x'es.
3573 "xyaxyaxyaxya" // span() ends here.
3576 // containsAll()=TRUE
3587 "byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 }
3589 "byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 }
3590 "byayax", // span() -> { 4, 6 } complement.span() -> { 6 }
3592 "byaya", // span() -> { 5 }
3593 "byay", // span() -> { 4 }
3594 "bya", // span() -> { 3 }
3596 // span(longest match) will not span the whole string.
3606 // spanBack(longest match) will not span the whole string.
3615 // Test with non-ASCII set strings - test proper handling of surrogate pairs
3616 // and UTF-8 trail bytes.
3617 // Copies of above test sets and strings, but transliterated to have
3618 // different code points with similar trail units.
3619 // Previous: a b c d
3620 // Unicode: 042B 30AB 200AB 204AB
3621 // UTF-16: 042B 30AB D840 DCAB D841 DCAB
3622 // UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB
3623 "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3625 "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3627 "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3629 "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3631 // Stress bookkeeping and recursion.
3632 // The following strings are barely doable with the recursive
3633 // reference implementation.
3634 // The not-contained character at the end prevents an early exit from the span().
3638 "bbbbbbbbbbbbbbbbbbbbbbbb-",
3639 // On complement sets, span() and spanBack() get different results
3640 // because b is not in the complement set and there is an odd number of b's
3641 // in the test string.
3643 "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3645 // Test with set strings with an initial or final code point span
3647 "[a{" _64_a _64_a _64_a _64_a
"b}"
3648 "{a" _64_b _64_b _64_b _64_b
"}]",
3650 _64_a _64_a _64_a _63_a
"b",
3651 _64_a _64_a _64_a _64_a
"b",
3652 _64_a _64_a _64_a _64_a
"aaaabbbb",
3653 "a" _64_b _64_b _64_b _63_b
,
3654 "a" _64_b _64_b _64_b _64_b
,
3655 "aaaabbbb" _64_b _64_b _64_b _64_b
,
3657 // Test with strings containing unpaired surrogates.
3658 // They are not representable in UTF-8, and a leading trail surrogate
3659 // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3660 // U+20001 == \\uD840\\uDC01
3661 // U+20400 == \\uD841\\uDC00
3662 "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3664 "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3666 uint32_t whichSpans
[96]={ SPAN_ALL
};
3667 int32_t whichSpansCount
=1;
3669 UnicodeSet
*sets
[SET_COUNT
]={ NULL
};
3670 const UnicodeSetWithStrings
*sets_with_str
[SET_COUNT
]={ NULL
};
3672 char testName
[1024];
3673 char *testNameLimit
=testName
;
3676 for(i
=0; i
<UPRV_LENGTHOF(testdata
); ++i
) {
3677 const char *s
=testdata
[i
];
3679 // Create new test sets from this pattern.
3680 for(j
=0; j
<SET_COUNT
; ++j
) {
3681 delete sets_with_str
[j
];
3684 UErrorCode errorCode
=U_ZERO_ERROR
;
3685 sets
[SLOW
]=new UnicodeSet(UnicodeString(s
, -1, US_INV
).unescape(), errorCode
);
3686 if(U_FAILURE(errorCode
)) {
3687 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s
, u_errorName(errorCode
));
3690 sets
[SLOW_NOT
]=new UnicodeSet(*sets
[SLOW
]);
3691 sets
[SLOW_NOT
]->complement();
3692 // Intermediate set: Test cloning of a frozen set.
3693 UnicodeSet
*fast
=new UnicodeSet(*sets
[SLOW
]);
3695 sets
[FAST
]=fast
->clone();
3697 UnicodeSet
*fastNot
=new UnicodeSet(*sets
[SLOW_NOT
]);
3699 sets
[FAST_NOT
]=fastNot
->clone();
3702 for(j
=0; j
<SET_COUNT
; ++j
) {
3703 sets_with_str
[j
]=new UnicodeSetWithStrings(*sets
[j
]);
3706 strcpy(testName
, s
);
3707 testNameLimit
=strchr(testName
, 0);
3708 *testNameLimit
++=':';
3711 whichSpans
[0]=SPAN_ALL
;
3713 } else if(s
[0]=='-') {
3714 whichSpans
[0]=SPAN_ALL
;
3720 whichSpansCount
=addAlternative(whichSpans
, whichSpansCount
,
3727 whichSpansCount
=addAlternative(whichSpans
, whichSpansCount
,
3734 // test USET_SPAN_CONTAINED FWD & BACK, and separately
3735 // USET_SPAN_SIMPLE only FWD, and separately
3736 // USET_SPAN_SIMPLE only BACK
3737 whichSpansCount
=addAlternative(whichSpans
, whichSpansCount
,
3738 ~(SPAN_DIRS
|SPAN_CONDITION
),
3739 SPAN_DIRS
|SPAN_CONTAINED
,
3740 SPAN_FWD
|SPAN_SIMPLE
,
3741 SPAN_BACK
|SPAN_SIMPLE
);
3744 whichSpansCount
=addAlternative(whichSpans
, whichSpansCount
,
3751 errln("FAIL: unrecognized span set option in \"%s\"", testdata
[i
]);
3755 } else if(0==strcmp(s
, "*")) {
3756 strcpy(testNameLimit
, "bad_string");
3757 for(j
=0; j
<whichSpansCount
; ++j
) {
3758 if(whichSpansCount
>1) {
3759 sprintf(testNameLimit
+10 /* strlen("bad_string") */,
3763 testSpanUTF16String(sets_with_str
, whichSpans
[j
], testName
);
3764 testSpanUTF8String(sets_with_str
, whichSpans
[j
], testName
);
3767 strcpy(testNameLimit
, "contents");
3768 for(j
=0; j
<whichSpansCount
; ++j
) {
3769 if(whichSpansCount
>1) {
3770 sprintf(testNameLimit
+8 /* strlen("contents") */,
3774 testSpanContents(sets_with_str
, whichSpans
[j
], testName
);
3777 UnicodeString string
=UnicodeString(s
, -1, US_INV
).unescape();
3778 strcpy(testNameLimit
, "test_string");
3779 for(j
=0; j
<whichSpansCount
; ++j
) {
3780 if(whichSpansCount
>1) {
3781 sprintf(testNameLimit
+11 /* strlen("test_string") */,
3785 testSpanBothUTFs(sets_with_str
, string
.getBuffer(), string
.length(), whichSpans
[j
], testName
, i
);
3789 for(j
=0; j
<SET_COUNT
; ++j
) {
3790 delete sets_with_str
[j
];
3795 // Test select patterns and strings, and test USET_SPAN_SIMPLE.
// Spot-checks UnicodeSet::containsAll(), span() and spanBack() on three
// hard-coded patterns whose sets contain multi-character strings.
// Every failed expectation is reported through errln().
// NOTE(review): this listing does not show every line of the function
// (closing braces, statements after some errln() calls, and possibly parts of
// the `string` literal) - confirm against the complete file before editing code.
// NOTE(review): `pattern` is a function-local static pointer that is reassigned
// further down, so a second call would start with the last pattern assigned
// rather than the initializer - confirm the test is only run once per process.
3796 void UnicodeSetTest::TestStringSpan() {
3797 static const char *pattern
="[x{xy}{xya}{axy}{ax}]";
3798 static const char *const string
=
3800 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3802 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3804 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
// Build the first set from `pattern`; a construction failure is reported below.
3807 UErrorCode errorCode
=U_ZERO_ERROR
;
3808 UnicodeString pattern16
=UnicodeString(pattern
, -1, US_INV
);
3809 UnicodeSet
set(pattern16
, errorCode
);
3810 if(U_FAILURE(errorCode
)) {
3811 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern
, u_errorName(errorCode
));
// containsAll() is expected to be FALSE for the complete test string and
// TRUE once its last four code units have been truncated away.
3815 UnicodeString string16
=UnicodeString(string
, -1, US_INV
).unescape();
3817 if(set
.containsAll(string16
)) {
3818 errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern
, string
);
3821 // Remove trailing "aaaa".
3822 string16
.truncate(string16
.length()-4);
3823 if(!set
.containsAll(string16
)) {
3824 errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern
, string
);
// span(USET_SPAN_NOT_CONTAINED) on successively shorter prefixes of "byayaxya":
// the explicit length argument (8 down to 3) limits how much of s16 is examined.
3827 string16
=UNICODE_STRING_SIMPLE("byayaxya");
3828 const UChar
*s16
=string16
.getBuffer();
3829 int32_t length16
=string16
.length();
3830 (void)length16
; // Suppress set but not used warning.
3831 if( set
.span(s16
, 8, USET_SPAN_NOT_CONTAINED
)!=4 ||
3832 set
.span(s16
, 7, USET_SPAN_NOT_CONTAINED
)!=4 ||
3833 set
.span(s16
, 6, USET_SPAN_NOT_CONTAINED
)!=4 ||
3834 set
.span(s16
, 5, USET_SPAN_NOT_CONTAINED
)!=5 ||
3835 set
.span(s16
, 4, USET_SPAN_NOT_CONTAINED
)!=4 ||
3836 set
.span(s16
, 3, USET_SPAN_NOT_CONTAINED
)!=3
3838 errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern
);
// Second pattern: re-use `set` via applyPattern() and compare forward span()
// results for USET_SPAN_CONTAINED and USET_SPAN_SIMPLE.
3841 pattern
="[a{ab}{abc}{cd}]";
3842 pattern16
=UnicodeString(pattern
, -1, US_INV
);
3843 set
.applyPattern(pattern16
, errorCode
);
3844 if(U_FAILURE(errorCode
)) {
3845 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern
, u_errorName(errorCode
));
// The third span() call starts at offset 7 of the 12-unit string and covers
// the remaining 5 code units.
3848 string16
=UNICODE_STRING_SIMPLE("acdabcdabccd");
3849 s16
=string16
.getBuffer();
3850 length16
=string16
.length();
3851 if( set
.span(s16
, 12, USET_SPAN_CONTAINED
)!=12 ||
3852 set
.span(s16
, 12, USET_SPAN_SIMPLE
)!=6 ||
3853 set
.span(s16
+7, 5, USET_SPAN_SIMPLE
)!=5
3855 errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern
);
// Third pattern: the set is frozen right after applyPattern() (the earlier
// applyPattern() call did not freeze), then spanBack() is checked.
3858 pattern
="[d{cd}{bcd}{ab}]";
3859 pattern16
=UnicodeString(pattern
, -1, US_INV
);
3860 set
.applyPattern(pattern16
, errorCode
).freeze();
3861 if(U_FAILURE(errorCode
)) {
3862 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern
, u_errorName(errorCode
));
// spanBack() from the end of the full 12 units (CONTAINED and SIMPLE), and
// from the end of the first 5 units (SIMPLE).
3865 string16
=UNICODE_STRING_SIMPLE("abbcdabcdabd");
3866 s16
=string16
.getBuffer();
3867 length16
=string16
.length();
3868 if( set
.spanBack(s16
, 12, USET_SPAN_CONTAINED
)!=0 ||
3869 set
.spanBack(s16
, 12, USET_SPAN_SIMPLE
)!=6 ||
3870 set
.spanBack(s16
, 5, USET_SPAN_SIMPLE
)!=0
3872 errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern
);
3877 * Including collationroot.h fails here with
3878 1>c:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\include\driverspecs.h(142): error C2008: '$' : unexpected in macro definition
3879 * .. so, we skip this test on Windows.
3881 * the cause is that intltest builds with /Za which disables language extensions - which means
3882 * windows header files can't be used.
3884 #if !UCONFIG_NO_COLLATION && !U_PLATFORM_HAS_WIN32_API
3885 #include "collationroot.h"
3886 #include "collationtailoring.h"
3889 void UnicodeSetTest::TestUCAUnsafeBackwards() {
3890 #if U_PLATFORM_HAS_WIN32_API
3891 infoln("Skipping TestUCAUnsafeBackwards() - can't include collationroot.h on Windows without language extensions!");
3892 #elif !UCONFIG_NO_COLLATION
3893 UErrorCode errorCode
= U_ZERO_ERROR
;
3895 // Get the unsafeBackwardsSet
3896 const CollationCacheEntry
*rootEntry
= CollationRoot::getRootCacheEntry(errorCode
);
3897 if(U_FAILURE(errorCode
)) {
3898 dataerrln("FAIL: %s getting root cache entry", u_errorName(errorCode
));
3901 //const UVersionInfo &version = rootEntry->tailoring->version;
3902 const UnicodeSet
*unsafeBackwardSet
= rootEntry
->tailoring
->unsafeBackwardSet
;
3904 checkSerializeRoundTrip(*unsafeBackwardSet
, errorCode
);
3906 if(!logKnownIssue("11891","UnicodeSet fails to round trip on CollationRoot...unsafeBackwards set")) {
3908 // TODO(ticket #11891): Simplify this test function to this simple case. Rename it appropriately.
3909 // TODO(ticket #11891): Port test to Java. Is this a bug there, too?
3910 UnicodeSet surrogates
;
3911 surrogates
.add(0xd83a); // a lead surrogate
3912 surrogates
.add(0xdc00, 0xdfff); // a range of trail surrogates
3914 surrogates
.toPattern(pat
, FALSE
); // bad: [ 0xd83a, 0xdc00, 0x2d, 0xdfff ]
3915 // TODO: Probably fix either UnicodeSet::_generatePattern() or _appendToPat()
3916 // so that at least one type of surrogate code points are escaped,
3917 // or (minimally) so that adjacent lead+trail surrogate code points are escaped.
3918 errorCode
= U_ZERO_ERROR
;
3920 s2
.applyPattern(pat
, errorCode
); // looks like invalid range [ 0x1e800, 0x2d, 0xdfff ]
3921 if(U_FAILURE(errorCode
)) {
3922 errln("FAIL: surrogates to/from pattern - %s", u_errorName(errorCode
));
3924 checkEqual(surrogates
, s2
, "surrogates to/from pattern");
3926 // This occurs in the UCA unsafe-backwards set.
3927 checkRoundTrip(*unsafeBackwardSet
);
3932 void UnicodeSetTest::TestIntOverflow() {
3933 // This test triggers undefined double->int conversion behavior
3934 // if the implementation is not careful.
3935 IcuTestErrorCode
errorCode(*this, "TestIntOverflow");
3936 UnicodeSet
set(u
"[:ccc=2222222222222222222:]", errorCode
);
3937 assertTrue("[:ccc=int_overflow:] -> empty set", set
.isEmpty());
3938 assertEquals("[:ccc=int_overflow:] -> illegal argument",
3939 U_ILLEGAL_ARGUMENT_ERROR
, errorCode
.reset());
3942 void UnicodeSetTest::TestUnusedCcc() {
3943 #if !UCONFIG_NO_NORMALIZATION
3944 // All numeric ccc values 0..255 are valid, but many are unused.
3945 IcuTestErrorCode
errorCode(*this, "TestUnusedCcc");
3946 UnicodeSet
ccc2(u
"[:ccc=2:]", errorCode
);
3947 assertSuccess("[:ccc=2:]", errorCode
);
3948 assertTrue("[:ccc=2:] -> empty set", ccc2
.isEmpty());
3950 UnicodeSet
ccc255(u
"[:ccc=255:]", errorCode
);
3951 assertSuccess("[:ccc=255:]", errorCode
);
3952 assertTrue("[:ccc=255:] -> empty set", ccc255
.isEmpty());
3954 // Non-integer values and values outside 0..255 are invalid.
3955 UnicodeSet
ccc_1(u
"[:ccc=-1:]", errorCode
);
3956 assertEquals("[:ccc=-1:] -> illegal argument",
3957 U_ILLEGAL_ARGUMENT_ERROR
, errorCode
.reset());
3958 assertTrue("[:ccc=-1:] -> empty set", ccc_1
.isEmpty());
3960 UnicodeSet
ccc256(u
"[:ccc=256:]", errorCode
);
3961 assertEquals("[:ccc=256:] -> illegal argument",
3962 U_ILLEGAL_ARGUMENT_ERROR
, errorCode
.reset());
3963 assertTrue("[:ccc=256:] -> empty set", ccc256
.isEmpty());
3965 UnicodeSet
ccc1_1(u
"[:ccc=1.1:]", errorCode
);
3966 assertEquals("[:ccc=1.1:] -> illegal argument",
3967 U_ILLEGAL_ARGUMENT_ERROR
, errorCode
.reset());
3968 assertTrue("[:ccc=1.1:] -> empty set", ccc1_1
.isEmpty());
3972 void UnicodeSetTest::TestDeepPattern() {
3973 IcuTestErrorCode
errorCode(*this, "TestDeepPattern");
3974 // Nested ranges are parsed via recursion which can use a lot of stack space.
3975 // After a reasonable limit, we should get an error.
3976 constexpr int32_t DEPTH
= 20000;
3977 UnicodeString pattern
, suffix
;
3978 for (int32_t i
= 0; i
< DEPTH
; ++i
) {
3979 pattern
.append(u
"[a", 2);
3982 pattern
.append(suffix
);
3983 UnicodeSet
set(pattern
, errorCode
);
3984 assertTrue("[a[a[a...1000s...]]] -> error", errorCode
.isFailure());