]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/usettest.cpp
ICU-491.11.1.tar.gz
[apple/icu.git] / icuSources / test / intltest / usettest.cpp
1 /*
2 ********************************************************************************
3 * Copyright (C) 1999-2011 International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************************
6 * Date Name Description
7 * 10/20/99 alan Creation.
8 * 03/22/2000 Madhu Added additional tests
9 ********************************************************************************
10 */
11
12 #include <stdio.h>
13
14 #include <string.h>
15 #include "unicode/utypes.h"
16 #include "usettest.h"
17 #include "unicode/ucnv.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/usetiter.h"
21 #include "unicode/ustring.h"
22 #include "unicode/parsepos.h"
23 #include "unicode/symtable.h"
24 #include "unicode/uversion.h"
25 #include "hash.h"
26
/**
 * Number of elements in a statically sized array, as an int32_t.
 * The whole expansion is parenthesized so that it behaves as a single
 * operand wherever it is used.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))

/**
 * Report a failure (via dataerrln) with file, line and ICU error name
 * when status holds a failure code. Execution continues afterwards.
 */
#define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
    dataerrln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \
              u_errorName(status));}}

/**
 * Report a failure (via dataerrln) with file and line when expr is false.
 * Execution continues afterwards.
 */
#define TEST_ASSERT(expr) {if (!(expr)) { \
    dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); }}
35
36 UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
37 UnicodeString pat;
38 set.toPattern(pat);
39 return left + UnicodeSetTest::escape(pat);
40 }
41
/**
 * Expands to one switch case inside runIndexedTest(): publishes the test
 * method's name through 'name' and, when 'exec' is set, logs a banner line
 * followed by an empty line and then runs the test method.
 * The trailing 'break' deliberately has no semicolon; the caller supplies it.
 */
#define CASE(id,test) \
    case id: \
        name = #test; \
        if (exec) { \
            logln(#test "---"); \
            logln(); \
            test(); \
        } \
        break
50
51 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
52 }
53
54 UConverter *UnicodeSetTest::openUTF8Converter() {
55 if(utf8Cnv==NULL) {
56 UErrorCode errorCode=U_ZERO_ERROR;
57 utf8Cnv=ucnv_open("UTF-8", &errorCode);
58 }
59 return utf8Cnv;
60 }
61
62 UnicodeSetTest::~UnicodeSetTest() {
63 ucnv_close(utf8Cnv);
64 }
65
66 void
67 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
68 const char* &name, char* /*par*/) {
69 // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
70 switch (index) {
71 CASE(0,TestPatterns);
72 CASE(1,TestAddRemove);
73 CASE(2,TestCategories);
74 CASE(3,TestCloneEqualHash);
75 CASE(4,TestMinimalRep);
76 CASE(5,TestAPI);
77 CASE(6,TestScriptSet);
78 CASE(7,TestPropertySet);
79 CASE(8,TestClone);
80 CASE(9,TestExhaustive);
81 CASE(10,TestToPattern);
82 CASE(11,TestIndexOf);
83 CASE(12,TestStrings);
84 CASE(13,Testj2268);
85 CASE(14,TestCloseOver);
86 CASE(15,TestEscapePattern);
87 CASE(16,TestInvalidCodePoint);
88 CASE(17,TestSymbolTable);
89 CASE(18,TestSurrogate);
90 CASE(19,TestPosixClasses);
91 CASE(20,TestIteration);
92 CASE(21,TestFreezable);
93 CASE(22,TestSpan);
94 CASE(23,TestStringSpan);
95 default: name = ""; break;
96 }
97 }
98
// Marker placed inside the expected-strings arrays passed to expectToPattern().
// NOTE(review): presumably separates strings that must be in the set from
// strings that must not be - confirm against expectToPattern().
static const char NOT[] = "%%%%";
100
101 /**
102 * UVector was improperly copying contents
103 * This code will crash this is still true
104 */
105 void UnicodeSetTest::Testj2268() {
106 UnicodeSet t;
107 t.add(UnicodeString("abc"));
108 UnicodeSet test(t);
109 UnicodeString ustrPat;
110 test.toPattern(ustrPat, TRUE);
111 }
112
113 /**
114 * Test toPattern().
115 */
116 void UnicodeSetTest::TestToPattern() {
117 UErrorCode ec = U_ZERO_ERROR;
118
119 // Test that toPattern() round trips with syntax characters and
120 // whitespace.
121 {
122 static const char* OTHER_TOPATTERN_TESTS[] = {
123 "[[:latin:]&[:greek:]]",
124 "[[:latin:]-[:greek:]]",
125 "[:nonspacing mark:]",
126 NULL
127 };
128
129 for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
130 ec = U_ZERO_ERROR;
131 UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
132 if (U_FAILURE(ec)) {
133 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
134 continue;
135 }
136 checkPat(OTHER_TOPATTERN_TESTS[j], s);
137 }
138
139 for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
140 if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
141
142 // check various combinations to make sure they all work.
143 if (i != 0 && !toPatternAux(i, i)){
144 continue;
145 }
146 if (!toPatternAux(0, i)){
147 continue;
148 }
149 if (!toPatternAux(i, 0xFFFF)){
150 continue;
151 }
152 }
153 }
154 }
155
156 // Test pattern behavior of multicharacter strings.
157 {
158 ec = U_ZERO_ERROR;
159 UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
160
161 // This loop isn't a loop. It's here to make the compiler happy.
162 // If you're curious, try removing it and changing the 'break'
163 // statements (except for the last) to goto's.
164 for (;;) {
165 if (U_FAILURE(ec)) break;
166 const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
167 expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
168
169 s->add("ac");
170 const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
171 expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
172
173 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
174 if (U_FAILURE(ec)) break;
175 const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
176 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
177
178 s->add("[]");
179 const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
180 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
181
182 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
183 if (U_FAILURE(ec)) break;
184 const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
185 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
186
187 // j2189
188 s->clear();
189 s->add(UnicodeString("abc", ""));
190 s->add(UnicodeString("abc", ""));
191 const char* exp6[] = {"abc", NOT, "ab", NULL};
192 expectToPattern(*s, "[{abc}]", exp6);
193
194 break;
195 }
196
197 if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
198 delete s;
199 }
200
201 // JB#3400: For 2 character ranges prefer [ab] to [a-b]
202 UnicodeSet s;
203 s.add((UChar)97, (UChar)98); // 'a', 'b'
204 expectToPattern(s, "[ab]", NULL);
205 }
206
207 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
208
209 // use Integer.toString because Utility.hex doesn't handle ints
210 UnicodeString pat = "";
211 // TODO do these in hex
212 //String source = "0x" + Integer.toString(start,16).toUpperCase();
213 //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
214 UnicodeString source;
215 source = source + (uint32_t)start;
216 if (start != end)
217 source = source + ".." + (uint32_t)end;
218 UnicodeSet testSet;
219 testSet.add(start, end);
220 return checkPat(source, testSet);
221 }
222
223 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
224 const UnicodeSet& testSet) {
225 // What we want to make sure of is that a pattern generated
226 // by toPattern(), with or without escaped unprintables, can
227 // be passed back into the UnicodeSet constructor.
228 UnicodeString pat0;
229
230 testSet.toPattern(pat0, TRUE);
231
232 if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
233
234 //String pat1 = unescapeLeniently(pat0);
235 //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
236
237 UnicodeString pat2;
238 testSet.toPattern(pat2, FALSE);
239 if (!checkPat(source, testSet, pat2)) return FALSE;
240
241 //String pat3 = unescapeLeniently(pat2);
242 // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
243
244 //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
245 logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
246 return TRUE;
247 }
248
249 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
250 const UnicodeSet& testSet,
251 const UnicodeString& pat) {
252 UErrorCode ec = U_ZERO_ERROR;
253 UnicodeSet testSet2(pat, ec);
254 if (testSet2 != testSet) {
255 errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
256 return FALSE;
257 }
258 return TRUE;
259 }
260
261 void
262 UnicodeSetTest::TestPatterns(void) {
263 UnicodeSet set;
264 expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km");
265 expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz");
266 expectPattern(set, UnicodeString("[a\\-z]", ""), "--aazz");
267 expectPattern(set, UnicodeString("[-az]", ""), "--aazz");
268 expectPattern(set, UnicodeString("[az-]", ""), "--aazz");
269 expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
270
271 // Throw in a test of complement
272 set.complement();
273 UnicodeString exp;
274 exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
275 expectPairs(set, exp);
276 }
277
278 void
279 UnicodeSetTest::TestCategories(void) {
280 UErrorCode status = U_ZERO_ERROR;
281 const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
282 UnicodeSet set(pat, status);
283 if (U_FAILURE(status)) {
284 dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
285 return;
286 } else {
287 expectContainment(set, pat, "ABC", "abc");
288 }
289
290 UChar32 i;
291 int32_t failures = 0;
292 // Make sure generation of L doesn't pollute cached Lu set
293 // First generate L, then Lu
294 set.applyPattern("[:L:]", status);
295 if (U_FAILURE(status)) { errln("FAIL"); return; }
296 for (i=0; i<0x200; ++i) {
297 UBool l = u_isalpha((UChar)i);
298 if (l != set.contains(i)) {
299 errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
300 set.contains(i));
301 if (++failures == 10) break;
302 }
303 }
304
305 set.applyPattern("[:Lu:]", status);
306 if (U_FAILURE(status)) { errln("FAIL"); return; }
307 for (i=0; i<0x200; ++i) {
308 UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
309 if (lu != set.contains(i)) {
310 errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
311 set.contains(i));
312 if (++failures == 20) break;
313 }
314 }
315 }
316 void
317 UnicodeSetTest::TestCloneEqualHash(void) {
318 UErrorCode status = U_ZERO_ERROR;
319 // set1 and set2 used to be built with the obsolete constructor taking
320 // UCharCategory values; replaced with pattern constructors
321 // markus 20030502
322 UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); // :Ll: Letter, lowercase
323 UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); // Letter, lowercase
324 if (U_FAILURE(status)){
325 dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
326 return;
327 }
328 UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status); //Number, Decimal digit
329 UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status); //Number, Decimal digit
330 if (U_FAILURE(status)){
331 errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
332 return;
333 }
334
335 if (*set1 != *set1a) {
336 errln("FAIL: category constructor for Ll broken");
337 }
338 if (*set2 != *set2a) {
339 errln("FAIL: category constructor for Nd broken");
340 }
341 delete set1a;
342 delete set2a;
343
344 logln("Testing copy construction");
345 UnicodeSet *set1copy=new UnicodeSet(*set1);
346 if(*set1 != *set1copy || *set1 == *set2 ||
347 getPairs(*set1) != getPairs(*set1copy) ||
348 set1->hashCode() != set1copy->hashCode()){
349 errln("FAIL : Error in copy construction");
350 return;
351 }
352
353 logln("Testing =operator");
354 UnicodeSet set1equal=*set1;
355 UnicodeSet set2equal=*set2;
356 if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
357 set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
358 errln("FAIL: Error in =operator");
359 }
360
361 logln("Testing clone()");
362 UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
363 UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
364 if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
365 *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
366 *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
367 errln("FAIL: Error in clone");
368 }
369
370 logln("Testing hashcode");
371 if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
372 set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
373 set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
374 set1->hashCode() == set2->hashCode() || set1copy->hashCode() == set2->hashCode() ||
375 set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
376 errln("FAIL: Error in hashCode()");
377 }
378
379 delete set1;
380 delete set1copy;
381 delete set2;
382 delete set1clone;
383 delete set2clone;
384
385
386 }
387 void
388 UnicodeSetTest::TestAddRemove(void) {
389 UnicodeSet set; // Construct empty set
390 doAssert(set.isEmpty() == TRUE, "set should be empty");
391 doAssert(set.size() == 0, "size should be 0");
392 set.complement();
393 doAssert(set.size() == 0x110000, "size should be 0x110000");
394 set.clear();
395 set.add(0x0061, 0x007a);
396 expectPairs(set, "az");
397 doAssert(set.isEmpty() == FALSE, "set should not be empty");
398 doAssert(set.size() != 0, "size should not be equal to 0");
399 doAssert(set.size() == 26, "size should be equal to 26");
400 set.remove(0x006d, 0x0070);
401 expectPairs(set, "alqz");
402 doAssert(set.size() == 22, "size should be equal to 22");
403 set.remove(0x0065, 0x0067);
404 expectPairs(set, "adhlqz");
405 doAssert(set.size() == 19, "size should be equal to 19");
406 set.remove(0x0064, 0x0069);
407 expectPairs(set, "acjlqz");
408 doAssert(set.size() == 16, "size should be equal to 16");
409 set.remove(0x0063, 0x0072);
410 expectPairs(set, "absz");
411 doAssert(set.size() == 10, "size should be equal to 10");
412 set.add(0x0066, 0x0071);
413 expectPairs(set, "abfqsz");
414 doAssert(set.size() == 22, "size should be equal to 22");
415 set.remove(0x0061, 0x0067);
416 expectPairs(set, "hqsz");
417 set.remove(0x0061, 0x007a);
418 expectPairs(set, "");
419 doAssert(set.isEmpty() == TRUE, "set should be empty");
420 doAssert(set.size() == 0, "size should be 0");
421 set.add(0x0061);
422 doAssert(set.isEmpty() == FALSE, "set should not be empty");
423 doAssert(set.size() == 1, "size should not be equal to 1");
424 set.add(0x0062);
425 set.add(0x0063);
426 expectPairs(set, "ac");
427 doAssert(set.size() == 3, "size should not be equal to 3");
428 set.add(0x0070);
429 set.add(0x0071);
430 expectPairs(set, "acpq");
431 doAssert(set.size() == 5, "size should not be equal to 5");
432 set.clear();
433 expectPairs(set, "");
434 doAssert(set.isEmpty() == TRUE, "set should be empty");
435 doAssert(set.size() == 0, "size should be 0");
436
437 // Try removing an entire set from another set
438 expectPattern(set, "[c-x]", "cx");
439 UnicodeSet set2;
440 expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
441 set.removeAll(set2);
442 expectPairs(set, "deluxx");
443
444 // Try adding an entire set to another set
445 expectPattern(set, "[jackiemclean]", "aacceein");
446 expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
447 set.addAll(set2);
448 expectPairs(set, "aacehort");
449 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
450
451 // Try retaining an set of elements contained in another set (intersection)
452 UnicodeSet set3;
453 expectPattern(set3, "[a-c]", "ac");
454 doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
455 set3.remove(0x0062);
456 expectPairs(set3, "aacc");
457 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
458 set.retainAll(set3);
459 expectPairs(set, "aacc");
460 doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
461 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
462 set.clear();
463 doAssert(set.size() != set3.size(), "set.size() != set3.size()");
464
465 // Test commutativity
466 expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
467 expectPattern(set2, "[jackiemclean]", "aacceein");
468 set.addAll(set2);
469 expectPairs(set, "aacehort");
470 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
471
472
473
474
475 }
476
477 /**
478 * Make sure minimal representation is maintained.
479 */
480 void UnicodeSetTest::TestMinimalRep() {
481 UErrorCode status = U_ZERO_ERROR;
482 // This is pretty thoroughly tested by checkCanonicalRep()
483 // run against the exhaustive operation results. Use the code
484 // here for debugging specific spot problems.
485
486 // 1 overlap against 2
487 UnicodeSet set("[h-km-q]", status);
488 if (U_FAILURE(status)) { errln("FAIL"); return; }
489 UnicodeSet set2("[i-o]", status);
490 if (U_FAILURE(status)) { errln("FAIL"); return; }
491 set.addAll(set2);
492 expectPairs(set, "hq");
493 // right
494 set.applyPattern("[a-m]", status);
495 if (U_FAILURE(status)) { errln("FAIL"); return; }
496 set2.applyPattern("[e-o]", status);
497 if (U_FAILURE(status)) { errln("FAIL"); return; }
498 set.addAll(set2);
499 expectPairs(set, "ao");
500 // left
501 set.applyPattern("[e-o]", status);
502 if (U_FAILURE(status)) { errln("FAIL"); return; }
503 set2.applyPattern("[a-m]", status);
504 if (U_FAILURE(status)) { errln("FAIL"); return; }
505 set.addAll(set2);
506 expectPairs(set, "ao");
507 // 1 overlap against 3
508 set.applyPattern("[a-eg-mo-w]", status);
509 if (U_FAILURE(status)) { errln("FAIL"); return; }
510 set2.applyPattern("[d-q]", status);
511 if (U_FAILURE(status)) { errln("FAIL"); return; }
512 set.addAll(set2);
513 expectPairs(set, "aw");
514 }
515
516 void UnicodeSetTest::TestAPI() {
517 UErrorCode status = U_ZERO_ERROR;
518 // default ct
519 UnicodeSet set;
520 if (!set.isEmpty() || set.getRangeCount() != 0) {
521 errln((UnicodeString)"FAIL, set should be empty but isn't: " +
522 set);
523 }
524
525 // clear(), isEmpty()
526 set.add(0x0061);
527 if (set.isEmpty()) {
528 errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
529 set);
530 }
531 set.clear();
532 if (!set.isEmpty()) {
533 errln((UnicodeString)"FAIL, set should be empty but isn't: " +
534 set);
535 }
536
537 // size()
538 set.clear();
539 if (set.size() != 0) {
540 errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
541 ": " + set);
542 }
543 set.add(0x0061);
544 if (set.size() != 1) {
545 errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
546 ": " + set);
547 }
548 set.add(0x0031, 0x0039);
549 if (set.size() != 10) {
550 errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
551 ": " + set);
552 }
553
554 // contains(first, last)
555 set.clear();
556 set.applyPattern("[A-Y 1-8 b-d l-y]", status);
557 if (U_FAILURE(status)) { errln("FAIL"); return; }
558 for (int32_t i = 0; i<set.getRangeCount(); ++i) {
559 UChar32 a = set.getRangeStart(i);
560 UChar32 b = set.getRangeEnd(i);
561 if (!set.contains(a, b)) {
562 errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
563 " but doesn't: " + set);
564 }
565 if (set.contains((UChar32)(a-1), b)) {
566 errln((UnicodeString)"FAIL, shouldn't contain " +
567 (unsigned short)(a-1) + '-' + (unsigned short)b +
568 " but does: " + set);
569 }
570 if (set.contains(a, (UChar32)(b+1))) {
571 errln((UnicodeString)"FAIL, shouldn't contain " +
572 (unsigned short)a + '-' + (unsigned short)(b+1) +
573 " but does: " + set);
574 }
575 }
576
577 // Ported InversionList test.
578 UnicodeSet a((UChar32)3,(UChar32)10);
579 UnicodeSet b((UChar32)7,(UChar32)15);
580 UnicodeSet c;
581
582 logln((UnicodeString)"a [3-10]: " + a);
583 logln((UnicodeString)"b [7-15]: " + b);
584 c = a;
585 c.addAll(b);
586 UnicodeSet exp((UChar32)3,(UChar32)15);
587 if (c == exp) {
588 logln((UnicodeString)"c.set(a).add(b): " + c);
589 } else {
590 errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
591 }
592 c.complement();
593 exp.set((UChar32)0, (UChar32)2);
594 exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
595 if (c == exp) {
596 logln((UnicodeString)"c.complement(): " + c);
597 } else {
598 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
599 }
600 c.complement();
601 exp.set((UChar32)3, (UChar32)15);
602 if (c == exp) {
603 logln((UnicodeString)"c.complement(): " + c);
604 } else {
605 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
606 }
607 c = a;
608 c.complementAll(b);
609 exp.set((UChar32)3,(UChar32)6);
610 exp.add((UChar32)11,(UChar32) 15);
611 if (c == exp) {
612 logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
613 } else {
614 errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
615 }
616
617 exp = c;
618 bitsToSet(setToBits(c), c);
619 if (c == exp) {
620 logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
621 } else {
622 errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
623 }
624
625 // Additional tests for coverage JB#2118
626 //UnicodeSet::complement(class UnicodeString const &)
627 //UnicodeSet::complementAll(class UnicodeString const &)
628 //UnicodeSet::containsNone(class UnicodeSet const &)
629 //UnicodeSet::containsNone(long,long)
630 //UnicodeSet::containsSome(class UnicodeSet const &)
631 //UnicodeSet::containsSome(long,long)
632 //UnicodeSet::removeAll(class UnicodeString const &)
633 //UnicodeSet::retain(long)
634 //UnicodeSet::retainAll(class UnicodeString const &)
635 //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
636 //UnicodeSetIterator::getString(void)
637 set.clear();
638 set.complement("ab");
639 exp.applyPattern("[{ab}]", status);
640 if (U_FAILURE(status)) { errln("FAIL"); return; }
641 if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
642
643 UnicodeSetIterator iset(set);
644 if (!iset.next() || !iset.isString()) {
645 errln("FAIL: UnicodeSetIterator::next/isString");
646 } else if (iset.getString() != "ab") {
647 errln("FAIL: UnicodeSetIterator::getString");
648 }
649
650 set.add((UChar32)0x61, (UChar32)0x7A);
651 set.complementAll("alan");
652 exp.applyPattern("[{ab}b-kmo-z]", status);
653 if (U_FAILURE(status)) { errln("FAIL"); return; }
654 if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
655
656 exp.applyPattern("[a-z]", status);
657 if (U_FAILURE(status)) { errln("FAIL"); return; }
658 if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
659 if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
660 exp.applyPattern("[aln]", status);
661 if (U_FAILURE(status)) { errln("FAIL"); return; }
662 if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
663 if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
664
665 if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
666 errln("FAIL: containsNone(UChar32, UChar32)");
667 }
668 if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
669 errln("FAIL: containsSome(UChar32, UChar32)");
670 }
671 if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
672 errln("FAIL: containsNone(UChar32, UChar32)");
673 }
674 if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
675 errln("FAIL: containsSome(UChar32, UChar32)");
676 }
677
678 set.removeAll("liu");
679 exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
680 if (U_FAILURE(status)) { errln("FAIL"); return; }
681 if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
682
683 set.retainAll("star");
684 exp.applyPattern("[rst]", status);
685 if (U_FAILURE(status)) { errln("FAIL"); return; }
686 if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
687
688 set.retain((UChar32)0x73);
689 exp.applyPattern("[s]", status);
690 if (U_FAILURE(status)) { errln("FAIL"); return; }
691 if (set != exp) { errln("FAIL: retain('s')"); return; }
692
693 uint16_t buf[32];
694 int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status);
695 if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
696 if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
697 errln("FAIL: serialize");
698 return;
699 }
700
701 // Conversions to and from USet
702 UnicodeSet *uniset = &set;
703 USet *uset = uniset->toUSet();
704 TEST_ASSERT((void *)uset == (void *)uniset);
705 UnicodeSet *setx = UnicodeSet::fromUSet(uset);
706 TEST_ASSERT((void *)setx == (void *)uset);
707 const UnicodeSet *constSet = uniset;
708 const USet *constUSet = constSet->toUSet();
709 TEST_ASSERT((void *)constUSet == (void *)constSet);
710 const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
711 TEST_ASSERT((void *)constSetx == (void *)constUSet);
712
713 // span(UnicodeString) and spanBack(UnicodeString) convenience methods
714 UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
715 UnicodeSet ac(0x61, 0x63);
716 ac.remove(0x62).freeze();
717 if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
718 ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
719 ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
720 ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
721 ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
722 ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
723 ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
724 ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
725 ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
726 ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
727 ) {
728 errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
729 }
730 if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
731 ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
732 ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
733 ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
734 ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
735 ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
736 ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
737 ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
738 ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
739 ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
740 ) {
741 errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
742 }
743 }
744
745 void UnicodeSetTest::TestIteration() {
746 UErrorCode ec = U_ZERO_ERROR;
747 int i = 0;
748 int outerLoop;
749
750 // 6 code points, 3 ranges, 2 strings, 8 total elements
751 // Iteration will access them in sorted order - a, b, c, y, z, U0001abcd, "str1", "str2"
752 UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
753 TEST_ASSERT_SUCCESS(ec);
754 UnicodeSetIterator it(set);
755
756 for (outerLoop=0; outerLoop<3; outerLoop++) {
757 // Run the test multiple times, to check that iterator.reset() is working.
758 for (i=0; i<10; i++) {
759 UBool nextv = it.next();
760 UBool isString = it.isString();
761 int32_t codePoint = it.getCodepoint();
762 //int32_t codePointEnd = it.getCodepointEnd();
763 UnicodeString s = it.getString();
764 switch (i) {
765 case 0:
766 TEST_ASSERT(nextv == TRUE);
767 TEST_ASSERT(isString == FALSE);
768 TEST_ASSERT(codePoint==0x61);
769 TEST_ASSERT(s == "a");
770 break;
771 case 1:
772 TEST_ASSERT(nextv == TRUE);
773 TEST_ASSERT(isString == FALSE);
774 TEST_ASSERT(codePoint==0x62);
775 TEST_ASSERT(s == "b");
776 break;
777 case 2:
778 TEST_ASSERT(nextv == TRUE);
779 TEST_ASSERT(isString == FALSE);
780 TEST_ASSERT(codePoint==0x63);
781 TEST_ASSERT(s == "c");
782 break;
783 case 3:
784 TEST_ASSERT(nextv == TRUE);
785 TEST_ASSERT(isString == FALSE);
786 TEST_ASSERT(codePoint==0x79);
787 TEST_ASSERT(s == "y");
788 break;
789 case 4:
790 TEST_ASSERT(nextv == TRUE);
791 TEST_ASSERT(isString == FALSE);
792 TEST_ASSERT(codePoint==0x7a);
793 TEST_ASSERT(s == "z");
794 break;
795 case 5:
796 TEST_ASSERT(nextv == TRUE);
797 TEST_ASSERT(isString == FALSE);
798 TEST_ASSERT(codePoint==0x1abcd);
799 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
800 break;
801 case 6:
802 TEST_ASSERT(nextv == TRUE);
803 TEST_ASSERT(isString == TRUE);
804 TEST_ASSERT(s == "str1");
805 break;
806 case 7:
807 TEST_ASSERT(nextv == TRUE);
808 TEST_ASSERT(isString == TRUE);
809 TEST_ASSERT(s == "str2");
810 break;
811 case 8:
812 TEST_ASSERT(nextv == FALSE);
813 break;
814 case 9:
815 TEST_ASSERT(nextv == FALSE);
816 break;
817 }
818 }
819 it.reset(); // prepare to run the iteration again.
820 }
821 }
822
823
824
825
826 void UnicodeSetTest::TestStrings() {
827 UErrorCode ec = U_ZERO_ERROR;
828
829 UnicodeSet* testList[] = {
830 UnicodeSet::createFromAll("abc"),
831 new UnicodeSet("[a-c]", ec),
832
833 &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
834 new UnicodeSet("[{ll}{ch}a-z]", ec),
835
836 UnicodeSet::createFrom("ab}c"),
837 new UnicodeSet("[{ab\\}c}]", ec),
838
839 &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
840 new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
841
842 NULL
843 };
844
845 if (U_FAILURE(ec)) {
846 errln("FAIL: couldn't construct test sets");
847 }
848
849 for (int32_t i = 0; testList[i] != NULL; i+=2) {
850 if (U_SUCCESS(ec)) {
851 UnicodeString pat0, pat1;
852 testList[i]->toPattern(pat0, TRUE);
853 testList[i+1]->toPattern(pat1, TRUE);
854 if (*testList[i] == *testList[i+1]) {
855 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
856 } else {
857 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
858 }
859 }
860 delete testList[i];
861 delete testList[i+1];
862 }
863 }
864
865 /**
866 * Test the [:Latin:] syntax.
867 */
868 void UnicodeSetTest::TestScriptSet() {
869 expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
870
871 expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
872
873 /* Jitterbug 1423 */
874 expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
875
876 }
877
878 /**
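 * Test property-based set patterns ([:...:], \p{...}, \N{...} and others):
 * each DATA triple is a pattern, characters expected in the set, and
 * characters expected not to be in it.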
880 */
881 void UnicodeSetTest::TestPropertySet() {
882 static const char* const DATA[] = {
883 // Pattern, Chars IN, Chars NOT in
884
885 "[:Latin:]",
886 "aA",
887 "\\u0391\\u03B1",
888
889 "[\\p{Greek}]",
890 "\\u0391\\u03B1",
891 "aA",
892
893 "\\P{ GENERAL Category = upper case letter }",
894 "abc",
895 "ABC",
896
897 #if !UCONFIG_NO_NORMALIZATION
898 // Combining class: @since ICU 2.2
899 // Check both symbolic and numeric
900 "\\p{ccc=Nukta}",
901 "\\u0ABC",
902 "abc",
903
904 "\\p{Canonical Combining Class = 11}",
905 "\\u05B1",
906 "\\u05B2",
907
908 "[:c c c = iota subscript :]",
909 "\\u0345",
910 "xyz",
911 #endif
912
913 // Bidi class: @since ICU 2.2
914 "\\p{bidiclass=lefttoright}",
915 "abc",
916 "\\u0671\\u0672",
917
918 // Binary properties: @since ICU 2.2
919 "\\p{ideographic}",
920 "\\u4E0A",
921 "x",
922
923 "[:math=false:]",
924 "q)*(",
925 // weiv: )(and * were removed from math in Unicode 4.0.1
926 //"(*+)",
927 "+<>^",
928
929 // JB#1767 \N{}, \p{ASCII}
930 "[:Ascii:]",
931 "abc\\u0000\\u007F",
932 "\\u0080\\u4E00",
933
934 "[\\N{ latin small letter a }[:name= latin small letter z:]]",
935 "az",
936 "qrs",
937
938 // JB#2015
939 "[:any:]",
940 "a\\U0010FFFF",
941 "",
942
943 "[:nv=0.5:]",
944 "\\u00BD\\u0F2A",
945 "\\u00BC",
946
947 // JB#2653: Age
948 "[:Age=1.1:]",
949 "\\u03D6", // 1.1
950 "\\u03D8\\u03D9", // 3.2
951
952 "[:Age=3.1:]",
953 "\\u1800\\u3400\\U0002f800",
954 "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
955
956 // JB#2350: Case_Sensitive
957 "[:Case Sensitive:]",
958 "A\\u1FFC\\U00010410",
959 ";\\u00B4\\U00010500",
960
961 // JB#2832: C99-compatibility props
962 "[:blank:]",
963 " \\u0009",
964 "1-9A-Z",
965
966 "[:graph:]",
967 "19AZ",
968 " \\u0003\\u0007\\u0009\\u000A\\u000D",
969
970 "[:punct:]",
971 "!@#%&*()[]{}-_\\/;:,.?'\"",
972 "09azAZ",
973
974 "[:xdigit:]",
975 "09afAF",
976 "gG!",
977
978 // Regex compatibility test
979 "[-b]", // leading '-' is literal
980 "-b",
981 "ac",
982
983 "[^-b]", // leading '-' is literal
984 "ac",
985 "-b",
986
987 "[b-]", // trailing '-' is literal
988 "-b",
989 "ac",
990
991 "[^b-]", // trailing '-' is literal
992 "ac",
993 "-b",
994
995 "[a-b-]", // trailing '-' is literal
996 "ab-",
997 "c=",
998
999 "[[a-q]&[p-z]-]", // trailing '-' is literal
1000 "pq-",
1001 "or=",
1002
1003 "[\\s|\\)|:|$|\\>]", // from regex tests
1004 "s|):$>",
1005 "abc",
1006
1007 "[\\uDC00cd]", // JB#2906: isolated trail at start
1008 "cd\\uDC00",
1009 "ab\\uD800\\U00010000",
1010
1011 "[ab\\uD800]", // JB#2906: isolated trail at start
1012 "ab\\uD800",
1013 "cd\\uDC00\\U00010000",
1014
1015 "[ab\\uD800cd]", // JB#2906: isolated lead in middle
1016 "abcd\\uD800",
1017 "ef\\uDC00\\U00010000",
1018
1019 "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
1020 "abcd\\uDC00",
1021 "ef\\uD800\\U00010000",
1022
1023 #if !UCONFIG_NO_NORMALIZATION
1024 "[:^lccc=0:]", // Lead canonical class
1025 "\\u0300\\u0301",
1026 "abcd\\u00c0\\u00c5",
1027
1028 "[:^tccc=0:]", // Trail canonical class
1029 "\\u0300\\u0301\\u00c0\\u00c5",
1030 "abcd",
1031
1032 "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
1033 "\\u0300\\u0301\\u00c0\\u00c5",
1034 "abcd",
1035
1036 "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
1037 "",
1038 "abcd\\u0300\\u0301\\u00c0\\u00c5",
1039
1040 "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
1041 "\\u0F73\\u0F75\\u0F81",
1042 "abcd\\u0300\\u0301\\u00c0\\u00c5",
1043 #endif /* !UCONFIG_NO_NORMALIZATION */
1044
1045 "[:Assigned:]",
1046 "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
1047 "\\u0888\\uFDD3\\uFFFE\\U00050005",
1048
1049 // Script_Extensions, new in Unicode 6.0
1050 "[:scx=Arab:]",
1051 "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
1052 "\\u061D\\u065F\\uFDEF\\uFDFE",
1053
1054 // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
1055 // so scx-sc is missing U+FDF2.
1056 "[[:Script_Extensions=Arabic:]-[:Arab:]]",
1057 "\\u0640\\u064B\\u0650\\u0655\\uFDFD",
1058 "\\uFDF2"
1059 };
1060
1061 static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);
1062
1063 for (int32_t i=0; i<DATA_LEN; i+=3) {
1064 expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
1065 CharsToUnicodeString(DATA[i+2]));
1066 }
1067 }
1068
1069 /**
1070 * Test that Posix style character classes [:digit:], etc.
1071 * have the Unicode definitions from TR 18.
1072 */
1073 void UnicodeSetTest::TestPosixClasses() {
1074 {
1075 UErrorCode status = U_ZERO_ERROR;
1076 UnicodeSet s1("[:alpha:]", status);
1077 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
1078 TEST_ASSERT_SUCCESS(status);
1079 TEST_ASSERT(s1==s2);
1080 }
1081 {
1082 UErrorCode status = U_ZERO_ERROR;
1083 UnicodeSet s1("[:lower:]", status);
1084 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
1085 TEST_ASSERT_SUCCESS(status);
1086 TEST_ASSERT(s1==s2);
1087 }
1088 {
1089 UErrorCode status = U_ZERO_ERROR;
1090 UnicodeSet s1("[:upper:]", status);
1091 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
1092 TEST_ASSERT_SUCCESS(status);
1093 TEST_ASSERT(s1==s2);
1094 }
1095 {
1096 UErrorCode status = U_ZERO_ERROR;
1097 UnicodeSet s1("[:punct:]", status);
1098 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
1099 TEST_ASSERT_SUCCESS(status);
1100 TEST_ASSERT(s1==s2);
1101 }
1102 {
1103 UErrorCode status = U_ZERO_ERROR;
1104 UnicodeSet s1("[:digit:]", status);
1105 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
1106 TEST_ASSERT_SUCCESS(status);
1107 TEST_ASSERT(s1==s2);
1108 }
1109 {
1110 UErrorCode status = U_ZERO_ERROR;
1111 UnicodeSet s1("[:xdigit:]", status);
1112 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
1113 TEST_ASSERT_SUCCESS(status);
1114 TEST_ASSERT(s1==s2);
1115 }
1116 {
1117 UErrorCode status = U_ZERO_ERROR;
1118 UnicodeSet s1("[:alnum:]", status);
1119 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
1120 TEST_ASSERT_SUCCESS(status);
1121 TEST_ASSERT(s1==s2);
1122 }
1123 {
1124 UErrorCode status = U_ZERO_ERROR;
1125 UnicodeSet s1("[:space:]", status);
1126 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
1127 TEST_ASSERT_SUCCESS(status);
1128 TEST_ASSERT(s1==s2);
1129 }
1130 {
1131 UErrorCode status = U_ZERO_ERROR;
1132 UnicodeSet s1("[:blank:]", status);
1133 TEST_ASSERT_SUCCESS(status);
1134 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
1135 status);
1136 TEST_ASSERT_SUCCESS(status);
1137 TEST_ASSERT(s1==s2);
1138 }
1139 {
1140 UErrorCode status = U_ZERO_ERROR;
1141 UnicodeSet s1("[:cntrl:]", status);
1142 TEST_ASSERT_SUCCESS(status);
1143 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
1144 TEST_ASSERT_SUCCESS(status);
1145 TEST_ASSERT(s1==s2);
1146 }
1147 {
1148 UErrorCode status = U_ZERO_ERROR;
1149 UnicodeSet s1("[:graph:]", status);
1150 TEST_ASSERT_SUCCESS(status);
1151 UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
1152 TEST_ASSERT_SUCCESS(status);
1153 TEST_ASSERT(s1==s2);
1154 }
1155 {
1156 UErrorCode status = U_ZERO_ERROR;
1157 UnicodeSet s1("[:print:]", status);
1158 TEST_ASSERT_SUCCESS(status);
1159 UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
1160 TEST_ASSERT_SUCCESS(status);
1161 TEST_ASSERT(s1==s2);
1162 }
1163 }
1164 /**
1165 * Test cloning of UnicodeSet. For C++, we test the copy constructor.
1166 */
1167 void UnicodeSetTest::TestClone() {
1168 UErrorCode ec = U_ZERO_ERROR;
1169 UnicodeSet s("[abcxyz]", ec);
1170 UnicodeSet t(s);
1171 expectContainment(t, "abc", "def");
1172 }
1173
1174 /**
1175 * Test the indexOf() and charAt() methods.
1176 */
1177 void UnicodeSetTest::TestIndexOf() {
1178 UErrorCode ec = U_ZERO_ERROR;
1179 UnicodeSet set("[a-cx-y3578]", ec);
1180 if (U_FAILURE(ec)) {
1181 errln("FAIL: UnicodeSet constructor");
1182 return;
1183 }
1184 for (int32_t i=0; i<set.size(); ++i) {
1185 UChar32 c = set.charAt(i);
1186 if (set.indexOf(c) != i) {
1187 errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1188 i, c, set.indexOf(c));
1189 }
1190 }
1191 UChar32 c = set.charAt(set.size());
1192 if (c != -1) {
1193 errln("FAIL: charAt(<out of range>) = %X", c);
1194 }
1195 int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
1196 if (j != -1) {
1197 errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1198 }
1199 }
1200
1201 /**
1202 * Test closure API.
1203 */
1204 void UnicodeSetTest::TestCloseOver() {
1205 UErrorCode ec = U_ZERO_ERROR;
1206
1207 char CASE[] = {(char)USET_CASE_INSENSITIVE};
1208 char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1209 const char* DATA[] = {
1210 // selector, input, output
1211 CASE,
1212 "[aq\\u00DF{Bc}{bC}{Fi}]",
1213 "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
1214
1215 CASE,
1216 "[\\u01F1]", // 'DZ'
1217 "[\\u01F1\\u01F2\\u01F3]",
1218
1219 CASE,
1220 "[\\u1FB4]",
1221 "[\\u1FB4{\\u03AC\\u03B9}]",
1222
1223 CASE,
1224 "[{F\\uFB01}]",
1225 "[\\uFB03{ffi}]",
1226
1227 CASE, // make sure binary search finds limits
1228 "[a\\uFF3A]",
1229 "[aA\\uFF3A\\uFF5A]",
1230
1231 CASE,
1232 "[a-z]","[A-Za-z\\u017F\\u212A]",
1233 CASE,
1234 "[abc]","[A-Ca-c]",
1235 CASE,
1236 "[ABC]","[A-Ca-c]",
1237
1238 CASE, "[i]", "[iI]",
1239
1240 CASE, "[\\u0130]", "[\\u0130{i\\u0307}]", // dotted I
1241 CASE, "[{i\\u0307}]", "[\\u0130{i\\u0307}]", // i with dot
1242
1243 CASE, "[\\u0131]", "[\\u0131]", // dotless i
1244
1245 CASE, "[\\u0390]", "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1246
1247 CASE, "[\\u03c2]", "[\\u03a3\\u03c2\\u03c3]", // sigmas
1248
1249 CASE, "[\\u03f2]", "[\\u03f2\\u03f9]", // lunate sigmas
1250
1251 CASE, "[\\u03f7]", "[\\u03f7\\u03f8]",
1252
1253 CASE, "[\\u1fe3]", "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1254
1255 CASE, "[\\ufb05]", "[\\ufb05\\ufb06{st}]",
1256 CASE, "[{st}]", "[\\ufb05\\ufb06{st}]",
1257
1258 CASE, "[\\U0001044F]", "[\\U00010427\\U0001044F]",
1259
1260 CASE, "[{a\\u02BE}]", "[\\u1E9A{a\\u02BE}]", // first in sorted table
1261
1262 CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1263
1264 #if !UCONFIG_NO_FILE_IO
1265 CASE_MAPPINGS,
1266 "[aq\\u00DF{Bc}{bC}{Fi}]",
1267 "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1268 #endif
1269
1270 CASE_MAPPINGS,
1271 "[\\u01F1]", // 'DZ'
1272 "[\\u01F1\\u01F2\\u01F3]",
1273
1274 CASE_MAPPINGS,
1275 "[a-z]",
1276 "[A-Za-z]",
1277
1278 NULL
1279 };
1280
1281 UnicodeSet s;
1282 UnicodeSet t;
1283 UnicodeString buf;
1284 for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1285 int32_t selector = DATA[i][0];
1286 UnicodeString pat(DATA[i+1], -1, US_INV);
1287 UnicodeString exp(DATA[i+2], -1, US_INV);
1288 s.applyPattern(pat, ec);
1289 s.closeOver(selector);
1290 t.applyPattern(exp, ec);
1291 if (U_FAILURE(ec)) {
1292 errln("FAIL: applyPattern failed");
1293 continue;
1294 }
1295 if (s == t) {
1296 logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1297 } else {
1298 dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1299 s.toPattern(buf, TRUE) + ", expected " + exp);
1300 }
1301 }
1302
1303 #if 0
1304 /*
1305 * Unused test code.
1306 * This was used to compare the old implementation (using USET_CASE)
1307 * with the new one (using 0x100 temporarily)
1308 * while transitioning from hardcoded case closure tables in uniset.cpp
1309 * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1310 * and using ucase.c functions for closure.
1311 * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1312 *
1313 * Note: The old and new implementation never fully matched because
1314 * the old implementation turned out to not map U+0130 and U+0131 correctly
1315 * (dotted I and dotless i) and because the old implementation's data tables
1316 * were outdated compared to Unicode 4.0.1 at the time of the change to the
1317 * new implementation. (So sigmas and some other characters were not handled
1318 * according to the newer Unicode version.)
1319 */
1320 UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1321 UnicodeSetIterator si(sens);
1322 UnicodeString str, buf2;
1323 const UnicodeString *pStr;
1324 UChar32 c;
1325 while(si.next()) {
1326 if(!si.isString()) {
1327 c=si.getCodepoint();
1328 s.clear();
1329 s.add(c);
1330
1331 str.setTo(c);
1332 str.foldCase();
1333 sens2.add(str);
1334
1335 t=s;
1336 s.closeOver(USET_CASE);
1337 t.closeOver(0x100);
1338 if(s!=t) {
1339 errln("FAIL: closeOver(U+%04x) differs: ", c);
1340 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1341 }
1342 }
1343 }
1344 // remove all code points
1345 // should contain all full case folding mapping strings
1346 sens2.remove(0, 0x10ffff);
1347 si.reset(sens2);
1348 while(si.next()) {
1349 if(si.isString()) {
1350 pStr=&si.getString();
1351 s.clear();
1352 s.add(*pStr);
1353 t=s2=s;
1354 s.closeOver(USET_CASE);
1355 t.closeOver(0x100);
1356 if(s!=t) {
1357 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1358 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1359 }
1360 }
1361 }
1362 #endif
1363
1364 // Test the pattern API
1365 s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1366 if (U_FAILURE(ec)) {
1367 errln("FAIL: applyPattern failed");
1368 } else {
1369 expectContainment(s, "abcABC", "defDEF");
1370 }
1371 UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1372 if (U_FAILURE(ec)) {
1373 errln("FAIL: constructor failed");
1374 } else {
1375 expectContainment(v, "defDEF", "abcABC");
1376 }
1377 UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1378 if (U_FAILURE(ec)) {
1379 errln("FAIL: construct w/case mappings failed");
1380 } else {
1381 expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1382 }
1383 }
1384
1385 void UnicodeSetTest::TestEscapePattern() {
1386 const char pattern[] =
1387 "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1388 const char exp[] =
1389 "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1390 // We test this with two passes; in the second pass we
1391 // pre-unescape the pattern. Since U+200E is Pattern_White_Space,
1392 // this fails -- which is what we expect.
1393 for (int32_t pass=1; pass<=2; ++pass) {
1394 UErrorCode ec = U_ZERO_ERROR;
1395 UnicodeString pat(pattern, -1, US_INV);
1396 if (pass==2) {
1397 pat = pat.unescape();
1398 }
1399 // Pattern is only good for pass 1
1400 UBool isPatternValid = (pass==1);
1401
1402 UnicodeSet set(pat, ec);
1403 if (U_SUCCESS(ec) != isPatternValid){
1404 errln((UnicodeString)"FAIL: applyPattern(" +
1405 escape(pat) + ") => " +
1406 u_errorName(ec));
1407 continue;
1408 }
1409 if (U_FAILURE(ec)) {
1410 continue;
1411 }
1412 if (set.contains((UChar)0x0644)){
1413 errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1414 }
1415
1416 UnicodeString newpat;
1417 set.toPattern(newpat, TRUE);
1418 if (newpat == UnicodeString(exp, -1, US_INV)) {
1419 logln(escape(pat) + " => " + newpat);
1420 } else {
1421 errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1422 }
1423
1424 for (int32_t i=0; i<set.getRangeCount(); ++i) {
1425 UnicodeString str("Range ");
1426 str.append((UChar)(0x30 + i))
1427 .append(": ")
1428 .append((UChar32)set.getRangeStart(i))
1429 .append(" - ")
1430 .append((UChar32)set.getRangeEnd(i));
1431 str = str + " (" + set.getRangeStart(i) + " - " +
1432 set.getRangeEnd(i) + ")";
1433 if (set.getRangeStart(i) < 0) {
1434 errln((UnicodeString)"FAIL: " + escape(str));
1435 } else {
1436 logln(escape(str));
1437 }
1438 }
1439 }
1440 }
1441
1442 void UnicodeSetTest::expectRange(const UnicodeString& label,
1443 const UnicodeSet& set,
1444 UChar32 start, UChar32 end) {
1445 UnicodeSet exp(start, end);
1446 UnicodeString pat;
1447 if (set == exp) {
1448 logln(label + " => " + set.toPattern(pat, TRUE));
1449 } else {
1450 UnicodeString xpat;
1451 errln((UnicodeString)"FAIL: " + label + " => " +
1452 set.toPattern(pat, TRUE) +
1453 ", expected " + exp.toPattern(xpat, TRUE));
1454 }
1455 }
1456
1457 void UnicodeSetTest::TestInvalidCodePoint() {
1458
1459 const UChar32 DATA[] = {
1460 // Test range Expected range
1461 0, 0x10FFFF, 0, 0x10FFFF,
1462 (UChar32)-1, 8, 0, 8,
1463 8, 0x110000, 8, 0x10FFFF
1464 };
1465 const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]);
1466
1467 UnicodeString pat;
1468 int32_t i;
1469
1470 for (i=0; i<DATA_LENGTH; i+=4) {
1471 UChar32 start = DATA[i];
1472 UChar32 end = DATA[i+1];
1473 UChar32 xstart = DATA[i+2];
1474 UChar32 xend = DATA[i+3];
1475
1476 // Try various API using the test code points
1477
1478 UnicodeSet set(start, end);
1479 expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1480 set, xstart, xend);
1481
1482 set.clear();
1483 set.set(start, end);
1484 expectRange((UnicodeString)"set(" + start + "," + end + ")",
1485 set, xstart, xend);
1486
1487 UBool b = set.contains(start);
1488 b = set.contains(start, end);
1489 b = set.containsNone(start, end);
1490 b = set.containsSome(start, end);
1491
1492 /*int32_t index = set.indexOf(start);*/
1493
1494 set.clear();
1495 set.add(start);
1496 set.add(start, end);
1497 expectRange((UnicodeString)"add(" + start + "," + end + ")",
1498 set, xstart, xend);
1499
1500 set.set(0, 0x10FFFF);
1501 set.retain(start, end);
1502 expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1503 set, xstart, xend);
1504 set.retain(start);
1505
1506 set.set(0, 0x10FFFF);
1507 set.remove(start);
1508 set.remove(start, end);
1509 set.complement();
1510 expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1511 set, xstart, xend);
1512
1513 set.set(0, 0x10FFFF);
1514 set.complement(start, end);
1515 set.complement();
1516 expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1517 set, xstart, xend);
1518 set.complement(start);
1519 }
1520
1521 const UChar32 DATA2[] = {
1522 0,
1523 0x10FFFF,
1524 (UChar32)-1,
1525 0x110000
1526 };
1527 const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]);
1528
1529 for (i=0; i<DATA2_LENGTH; ++i) {
1530 UChar32 c = DATA2[i], end = 0x10FFFF;
1531 UBool valid = (c >= 0 && c <= 0x10FFFF);
1532
1533 UnicodeSet set(0, 0x10FFFF);
1534
1535 // For single-codepoint contains, invalid codepoints are NOT contained
1536 UBool b = set.contains(c);
1537 if (b == valid) {
1538 logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1539 ") = " + b);
1540 } else {
1541 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1542 ") = " + b);
1543 }
1544
1545 // For codepoint range contains, containsNone, and containsSome,
1546 // invalid or empty (start > end) ranges have UNDEFINED behavior.
1547 b = set.contains(c, end);
1548 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1549 "," + end + ") = " + b);
1550
1551 b = set.containsNone(c, end);
1552 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1553 "," + end + ") = " + b);
1554
1555 b = set.containsSome(c, end);
1556 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1557 "," + end + ") = " + b);
1558
1559 int32_t index = set.indexOf(c);
1560 if ((index >= 0) == valid) {
1561 logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1562 ") = " + index);
1563 } else {
1564 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1565 ") = " + index);
1566 }
1567 }
1568 }
1569
1570 // Used by TestSymbolTable
1571 class TokenSymbolTable : public SymbolTable {
1572 public:
1573 Hashtable contents;
1574
1575 TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1576 contents.setValueDeleter(uprv_deleteUObject);
1577 }
1578
1579 ~TokenSymbolTable() {}
1580
1581 /**
1582 * (Non-SymbolTable API) Add the given variable and value to
1583 * the table. Variable should NOT contain leading '$'.
1584 */
1585 void add(const UnicodeString& var, const UnicodeString& value,
1586 UErrorCode& ec) {
1587 if (U_SUCCESS(ec)) {
1588 contents.put(var, new UnicodeString(value), ec);
1589 }
1590 }
1591
1592 /**
1593 * SymbolTable API
1594 */
1595 virtual const UnicodeString* lookup(const UnicodeString& s) const {
1596 return (const UnicodeString*) contents.get(s);
1597 }
1598
1599 /**
1600 * SymbolTable API
1601 */
1602 virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1603 return NULL;
1604 }
1605
1606 /**
1607 * SymbolTable API
1608 */
1609 virtual UnicodeString parseReference(const UnicodeString& text,
1610 ParsePosition& pos, int32_t limit) const {
1611 int32_t start = pos.getIndex();
1612 int32_t i = start;
1613 UnicodeString result;
1614 while (i < limit) {
1615 UChar c = text.charAt(i);
1616 if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1617 break;
1618 }
1619 ++i;
1620 }
1621 if (i == start) { // No valid name chars
1622 return result; // Indicate failure with empty string
1623 }
1624 pos.setIndex(i);
1625 text.extractBetween(start, i, result);
1626 return result;
1627 }
1628 };
1629
1630 void UnicodeSetTest::TestSymbolTable() {
1631 // Multiple test cases can be set up here. Each test case
1632 // is terminated by null:
1633 // var, value, var, value,..., input pat., exp. output pat., null
1634 const char* DATA[] = {
1635 "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1636 "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1637 "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1638 NULL
1639 };
1640
1641 for (int32_t i=0; DATA[i]!=NULL; ++i) {
1642 UErrorCode ec = U_ZERO_ERROR;
1643 TokenSymbolTable sym(ec);
1644 if (U_FAILURE(ec)) {
1645 errln("FAIL: couldn't construct TokenSymbolTable");
1646 continue;
1647 }
1648
1649 // Set up variables
1650 while (DATA[i+2] != NULL) {
1651 sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
1652 if (U_FAILURE(ec)) {
1653 errln("FAIL: couldn't add to TokenSymbolTable");
1654 continue;
1655 }
1656 i += 2;
1657 }
1658
1659 // Input pattern and expected output pattern
1660 UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
1661 i += 2;
1662
1663 ParsePosition pos(0);
1664 UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1665 if (U_FAILURE(ec)) {
1666 errln("FAIL: couldn't construct UnicodeSet");
1667 continue;
1668 }
1669
1670 // results
1671 if (pos.getIndex() != inpat.length()) {
1672 errln((UnicodeString)"Failed to read to end of string \""
1673 + inpat + "\": read to "
1674 + pos.getIndex() + ", length is "
1675 + inpat.length());
1676 }
1677
1678 UnicodeSet us2(exppat, ec);
1679 if (U_FAILURE(ec)) {
1680 errln("FAIL: couldn't construct expected UnicodeSet");
1681 continue;
1682 }
1683
1684 UnicodeString a, b;
1685 if (us != us2) {
1686 errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1687 ", expected " + us2.toPattern(b, TRUE));
1688 } else {
1689 logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1690 }
1691 }
1692 }
1693
1694 void UnicodeSetTest::TestSurrogate() {
1695 const char* DATA[] = {
1696 // These should all behave identically
1697 "[abc\\uD800\\uDC00]",
1698 // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1699 "[abc\\U00010000]",
1700 0
1701 };
1702 for (int i=0; DATA[i] != 0; ++i) {
1703 UErrorCode ec = U_ZERO_ERROR;
1704 logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
1705 UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
1706 UnicodeSet set(str, ec);
1707 if (U_FAILURE(ec)) {
1708 errln("FAIL: UnicodeSet constructor");
1709 continue;
1710 }
1711 expectContainment(set,
1712 CharsToUnicodeString("abc\\U00010000"),
1713 CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1714 if (set.size() != 4) {
1715 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
1716 set.size() + ", expected 4");
1717 }
1718 }
1719 }
1720
1721 void UnicodeSetTest::TestExhaustive() {
1722 // exhaustive tests. Simulate UnicodeSets with integers.
1723 // That gives us very solid tests (except for large memory tests).
1724
1725 int32_t limit = 128;
1726
1727 UnicodeSet x, y, z, aa;
1728
1729 for (int32_t i = 0; i < limit; ++i) {
1730 bitsToSet(i, x);
1731 logln((UnicodeString)"Testing " + i + ", " + x);
1732 _testComplement(i, x, y);
1733
1734 // AS LONG AS WE ARE HERE, check roundtrip
1735 checkRoundTrip(bitsToSet(i, aa));
1736
1737 for (int32_t j = 0; j < limit; ++j) {
1738 _testAdd(i,j, x,y,z);
1739 _testXor(i,j, x,y,z);
1740 _testRetain(i,j, x,y,z);
1741 _testRemove(i,j, x,y,z);
1742 }
1743 }
1744 }
1745
1746 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1747 bitsToSet(a, x);
1748 z = x;
1749 z.complement();
1750 int32_t c = setToBits(z);
1751 if (c != (~a)) {
1752 errln((UnicodeString)"FAILED: add: ~" + x + " != " + z);
1753 errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1754 }
1755 checkCanonicalRep(z, (UnicodeString)"complement " + a);
1756 }
1757
1758 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1759 bitsToSet(a, x);
1760 bitsToSet(b, y);
1761 z = x;
1762 z.addAll(y);
1763 int32_t c = setToBits(z);
1764 if (c != (a | b)) {
1765 errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1766 errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1767 }
1768 checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1769 }
1770
1771 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1772 bitsToSet(a, x);
1773 bitsToSet(b, y);
1774 z = x;
1775 z.retainAll(y);
1776 int32_t c = setToBits(z);
1777 if (c != (a & b)) {
1778 errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1779 errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1780 }
1781 checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1782 }
1783
1784 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1785 bitsToSet(a, x);
1786 bitsToSet(b, y);
1787 z = x;
1788 z.removeAll(y);
1789 int32_t c = setToBits(z);
1790 if (c != (a &~ b)) {
1791 errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1792 errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1793 }
1794 checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1795 }
1796
1797 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1798 bitsToSet(a, x);
1799 bitsToSet(b, y);
1800 z = x;
1801 z.complementAll(y);
1802 int32_t c = setToBits(z);
1803 if (c != (a ^ b)) {
1804 errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1805 errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1806 }
1807 checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1808 }
1809
1810 /**
1811 * Check that ranges are monotonically increasing and non-
1812 * overlapping.
1813 */
1814 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1815 int32_t n = set.getRangeCount();
1816 if (n < 0) {
1817 errln((UnicodeString)"FAIL result of " + msg +
1818 ": range count should be >= 0 but is " +
1819 n /*+ " for " + set.toPattern())*/);
1820 return;
1821 }
1822 UChar32 last = 0;
1823 for (int32_t i=0; i<n; ++i) {
1824 UChar32 start = set.getRangeStart(i);
1825 UChar32 end = set.getRangeEnd(i);
1826 if (start > end) {
1827 errln((UnicodeString)"FAIL result of " + msg +
1828 ": range " + (i+1) +
1829 " start > end: " + (int)start + ", " + (int)end +
1830 " for " + set);
1831 }
1832 if (i > 0 && start <= last) {
1833 errln((UnicodeString)"FAIL result of " + msg +
1834 ": range " + (i+1) +
1835 " overlaps previous range: " + (int)start + ", " + (int)end +
1836 " for " + set);
1837 }
1838 last = end;
1839 }
1840 }
1841
1842 /**
1843 * Convert a bitmask to a UnicodeSet.
1844 */
1845 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1846 result.clear();
1847 for (UChar32 i = 0; i < 32; ++i) {
1848 if ((a & (1<<i)) != 0) {
1849 result.add(i);
1850 }
1851 }
1852 return result;
1853 }
1854
1855 /**
1856 * Convert a UnicodeSet to a bitmask. Only the characters
1857 * U+0000 to U+0020 are represented in the bitmask.
1858 */
1859 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1860 int32_t result = 0;
1861 for (int32_t i = 0; i < 32; ++i) {
1862 if (x.contains((UChar32)i)) {
1863 result |= (1<<i);
1864 }
1865 }
1866 return result;
1867 }
1868
1869 /**
1870 * Return the representation of an inversion list based UnicodeSet
1871 * as a pairs list. Ranges are listed in ascending Unicode order.
1872 * For example, the set [a-zA-M3] is represented as "33AMaz".
1873 */
1874 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1875 UnicodeString pairs;
1876 for (int32_t i=0; i<set.getRangeCount(); ++i) {
1877 UChar32 start = set.getRangeStart(i);
1878 UChar32 end = set.getRangeEnd(i);
1879 if (end > 0xFFFF) {
1880 end = 0xFFFF;
1881 i = set.getRangeCount(); // Should be unnecessary
1882 }
1883 pairs.append((UChar)start).append((UChar)end);
1884 }
1885 return pairs;
1886 }
1887
1888 /**
1889 * Basic consistency check for a few items.
1890 * That the iterator works, and that we can create a pattern and
1891 * get the same thing back
1892 */
1893 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1894 UErrorCode ec = U_ZERO_ERROR;
1895
1896 UnicodeSet t(s);
1897 checkEqual(s, t, "copy ct");
1898
1899 t = s;
1900 checkEqual(s, t, "operator=");
1901
1902 copyWithIterator(t, s, FALSE);
1903 checkEqual(s, t, "iterator roundtrip");
1904
1905 copyWithIterator(t, s, TRUE); // try range
1906 checkEqual(s, t, "iterator roundtrip");
1907
1908 UnicodeString pat; s.toPattern(pat, FALSE);
1909 t.applyPattern(pat, ec);
1910 if (U_FAILURE(ec)) {
1911 errln("FAIL: applyPattern");
1912 return;
1913 } else {
1914 checkEqual(s, t, "toPattern(false)");
1915 }
1916
1917 s.toPattern(pat, TRUE);
1918 t.applyPattern(pat, ec);
1919 if (U_FAILURE(ec)) {
1920 errln("FAIL: applyPattern");
1921 return;
1922 } else {
1923 checkEqual(s, t, "toPattern(true)");
1924 }
1925 }
1926
1927 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
1928 t.clear();
1929 UnicodeSetIterator it(s);
1930 if (withRange) {
1931 while (it.nextRange()) {
1932 if (it.isString()) {
1933 t.add(it.getString());
1934 } else {
1935 t.add(it.getCodepoint(), it.getCodepointEnd());
1936 }
1937 }
1938 } else {
1939 while (it.next()) {
1940 if (it.isString()) {
1941 t.add(it.getString());
1942 } else {
1943 t.add(it.getCodepoint());
1944 }
1945 }
1946 }
1947 }
1948
1949 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
1950 UnicodeString source; s.toPattern(source, TRUE);
1951 UnicodeString result; t.toPattern(result, TRUE);
1952 if (s != t) {
1953 errln((UnicodeString)"FAIL: " + message
1954 + "; source = " + source
1955 + "; result = " + result
1956 );
1957 return FALSE;
1958 } else {
1959 logln((UnicodeString)"Ok: " + message
1960 + "; source = " + source
1961 + "; result = " + result
1962 );
1963 }
1964 return TRUE;
1965 }
1966
1967 void
1968 UnicodeSetTest::expectContainment(const UnicodeString& pat,
1969 const UnicodeString& charsIn,
1970 const UnicodeString& charsOut) {
1971 UErrorCode ec = U_ZERO_ERROR;
1972 UnicodeSet set(pat, ec);
1973 if (U_FAILURE(ec)) {
1974 dataerrln((UnicodeString)"FAIL: pattern \"" +
1975 pat + "\" => " + u_errorName(ec));
1976 return;
1977 }
1978 expectContainment(set, pat, charsIn, charsOut);
1979 }
1980
1981 void
1982 UnicodeSetTest::expectContainment(const UnicodeSet& set,
1983 const UnicodeString& charsIn,
1984 const UnicodeString& charsOut) {
1985 UnicodeString pat;
1986 set.toPattern(pat);
1987 expectContainment(set, pat, charsIn, charsOut);
1988 }
1989
1990 void
1991 UnicodeSetTest::expectContainment(const UnicodeSet& set,
1992 const UnicodeString& setName,
1993 const UnicodeString& charsIn,
1994 const UnicodeString& charsOut) {
1995 UnicodeString bad;
1996 UChar32 c;
1997 int32_t i;
1998
1999 for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
2000 c = charsIn.char32At(i);
2001 if (!set.contains(c)) {
2002 bad.append(c);
2003 }
2004 }
2005 if (bad.length() > 0) {
2006 errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
2007 ", expected containment of " + prettify(charsIn));
2008 } else {
2009 logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
2010 }
2011
2012 bad.truncate(0);
2013 for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
2014 c = charsOut.char32At(i);
2015 if (set.contains(c)) {
2016 bad.append(c);
2017 }
2018 }
2019 if (bad.length() > 0) {
2020 errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
2021 ", expected non-containment of " + prettify(charsOut));
2022 } else {
2023 logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
2024 }
2025 }
2026
2027 void
2028 UnicodeSetTest::expectPattern(UnicodeSet& set,
2029 const UnicodeString& pattern,
2030 const UnicodeString& expectedPairs){
2031 UErrorCode status = U_ZERO_ERROR;
2032 set.applyPattern(pattern, status);
2033 if (U_FAILURE(status)) {
2034 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2035 "\") failed");
2036 return;
2037 } else {
2038 if (getPairs(set) != expectedPairs ) {
2039 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2040 "\") => pairs \"" +
2041 escape(getPairs(set)) + "\", expected \"" +
2042 escape(expectedPairs) + "\"");
2043 } else {
2044 logln(UnicodeString("Ok: applyPattern(\"") + pattern +
2045 "\") => pairs \"" +
2046 escape(getPairs(set)) + "\"");
2047 }
2048 }
2049 // the result of calling set.toPattern(), which is the string representation of
2050 // this set(set), is passed to a UnicodeSet constructor, and tested that it
2051 // will produce another set that is equal to this one.
2052 UnicodeString temppattern;
2053 set.toPattern(temppattern);
2054 UnicodeSet *tempset=new UnicodeSet(temppattern, status);
2055 if (U_FAILURE(status)) {
2056 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
2057 return;
2058 }
2059 if(*tempset != set || getPairs(*tempset) != getPairs(set)){
2060 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
2061 escape(getPairs(set)) + "\""));
2062 } else{
2063 logln(UnicodeString("Ok: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2064 }
2065
2066 delete tempset;
2067
2068 }
2069
2070 void
2071 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2072 if (getPairs(set) != expectedPairs) {
2073 errln(UnicodeString("FAIL: Expected pair list \"") +
2074 escape(expectedPairs) + "\", got \"" +
2075 escape(getPairs(set)) + "\"");
2076 }
2077 }
2078
2079 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2080 const UnicodeString& expPat,
2081 const char** expStrings) {
2082 UnicodeString pat;
2083 set.toPattern(pat, TRUE);
2084 if (pat == expPat) {
2085 logln((UnicodeString)"Ok: toPattern() => \"" + pat + "\"");
2086 } else {
2087 errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2088 return;
2089 }
2090 if (expStrings == NULL) {
2091 return;
2092 }
2093 UBool in = TRUE;
2094 for (int32_t i=0; expStrings[i] != NULL; ++i) {
2095 if (expStrings[i] == NOT) { // sic; pointer comparison
2096 in = FALSE;
2097 continue;
2098 }
2099 UnicodeString s = CharsToUnicodeString(expStrings[i]);
2100 UBool contained = set.contains(s);
2101 if (contained == in) {
2102 logln((UnicodeString)"Ok: " + expPat +
2103 (contained ? " contains {" : " does not contain {") +
2104 escape(expStrings[i]) + "}");
2105 } else {
2106 errln((UnicodeString)"FAIL: " + expPat +
2107 (contained ? " contains {" : " does not contain {") +
2108 escape(expStrings[i]) + "}");
2109 }
2110 }
2111 }
2112
2113 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
2114
2115 void
2116 UnicodeSetTest::doAssert(UBool condition, const char *message)
2117 {
2118 if (!condition) {
2119 errln(UnicodeString("ERROR : ") + message);
2120 }
2121 }
2122
2123 UnicodeString
2124 UnicodeSetTest::escape(const UnicodeString& s) {
2125 UnicodeString buf;
2126 for (int32_t i=0; i<s.length(); )
2127 {
2128 UChar32 c = s.char32At(i);
2129 if (0x0020 <= c && c <= 0x007F) {
2130 buf += c;
2131 } else {
2132 if (c <= 0xFFFF) {
2133 buf += (UChar)0x5c; buf += (UChar)0x75;
2134 } else {
2135 buf += (UChar)0x5c; buf += (UChar)0x55;
2136 buf += toHexString((c & 0xF0000000) >> 28);
2137 buf += toHexString((c & 0x0F000000) >> 24);
2138 buf += toHexString((c & 0x00F00000) >> 20);
2139 buf += toHexString((c & 0x000F0000) >> 16);
2140 }
2141 buf += toHexString((c & 0xF000) >> 12);
2142 buf += toHexString((c & 0x0F00) >> 8);
2143 buf += toHexString((c & 0x00F0) >> 4);
2144 buf += toHexString(c & 0x000F);
2145 }
2146 i += U16_LENGTH(c);
2147 }
2148 return buf;
2149 }
2150
// Tests freezing of UnicodeSet objects: a frozen copy of a set must compare
// equal to the original after copy construction, assignment, clone() and after
// every mutating call attempted below, while cloneAsThawed() must yield a
// copy that can be modified. Each violated expectation is reported via errln().
2151 void UnicodeSetTest::TestFreezable() {
2152 UErrorCode errorCode=U_ZERO_ERROR;
2153 UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
2154 UnicodeSet idSet(idPattern, errorCode);
2155 if(U_FAILURE(errorCode)) {
2156 dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
2157 return;
2158 }
2159
2160 UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
2161 UnicodeSet wsSet(wsPattern, errorCode);
2162 if(U_FAILURE(errorCode)) {
2163 dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
2164 return;
2165 }
2166
// The pattern text itself is passed to add() as a UnicodeString before the
// frozen copy is taken. idSet stays the unfrozen reference for all comparisons.
2167 idSet.add(idPattern);
2168 UnicodeSet frozen(idSet);
2169 frozen.freeze();
2170
2171 if(idSet.isFrozen() || !frozen.isFrozen()) {
2172 errln("FAIL: isFrozen() is wrong");
2173 }
2174 if(frozen!=idSet || !(frozen==idSet)) {
2175 errln("FAIL: a copy-constructed frozen set differs from its original");
2176 }
2177
// Assignment into a frozen set is expected to leave it unchanged.
2178 frozen=wsSet;
2179 if(frozen!=idSet || !(frozen==idSet)) {
2180 errln("FAIL: a frozen set was modified by operator=");
2181 }
2182
// Copy construction and assignment FROM a frozen set are expected to
// produce sets that are frozen as well.
2183 UnicodeSet frozen2(frozen);
2184 if(frozen2!=frozen || frozen2!=idSet) {
2185 errln("FAIL: a copied frozen set differs from its frozen original");
2186 }
2187 if(!frozen2.isFrozen()) {
2188 errln("FAIL: copy-constructing a frozen set results in a thawed one");
2189 }
2190 UnicodeSet frozen3(5, 55); // Set to some values to really test assignment below, not copy construction.
2191 if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
2192 errln("FAIL: UnicodeSet(5, 55) failed");
2193 }
2194 frozen3=frozen;
2195 if(!frozen3.isFrozen()) {
2196 errln("FAIL: copying a frozen set results in a thawed one");
2197 }
2198
// clone() of a frozen set is expected to be frozen too, so the add() below
// must not take effect.
2199 UnicodeSet *cloned=(UnicodeSet *)frozen.clone();
2200 if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
2201 errln("FAIL: clone() failed");
2202 }
2203 cloned->add(0xd802, 0xd805);
// NOTE(review): this branch fires when the add() above DID change the frozen
// clone; the message wording reads as the opposite - confirm intended text.
2204 if(cloned->containsSome(0xd802, 0xd805)) {
2205 errln("FAIL: unable to modify clone");
2206 }
2207 delete cloned;
2208
// cloneAsThawed() is expected to give an equal but modifiable copy.
2209 UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed();
2210 if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
2211 errln("FAIL: cloneAsThawed() failed");
2212 }
2213 thawed->add(0xd802, 0xd805);
2214 if(!thawed->contains(0xd802, 0xd805)) {
2215 errln("FAIL: unable to modify thawed clone");
2216 }
2217 delete thawed;
2218
// From here on, each group of mutating calls is applied to the frozen set and
// the set is then compared with idSet to confirm that nothing changed.
2219 frozen.set(5, 55);
2220 if(frozen!=idSet || !(frozen==idSet)) {
2221 errln("FAIL: UnicodeSet::set() modified a frozen set");
2222 }
2223
2224 frozen.clear();
2225 if(frozen!=idSet || !(frozen==idSet)) {
2226 errln("FAIL: UnicodeSet::clear() modified a frozen set");
2227 }
2228
2229 frozen.closeOver(USET_CASE_INSENSITIVE);
2230 if(frozen!=idSet || !(frozen==idSet)) {
2231 errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
2232 }
2233
2234 frozen.compact();
2235 if(frozen!=idSet || !(frozen==idSet)) {
2236 errln("FAIL: UnicodeSet::compact() modified a frozen set");
2237 }
2238
// errorCode is not inspected after the chained apply calls; only the set's
// contents are compared afterwards.
2239 ParsePosition pos;
2240 frozen.
2241 applyPattern(wsPattern, errorCode).
2242 applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
2243 applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
2244 applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
2245 applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
2246 if(frozen!=idSet || !(frozen==idSet)) {
2247 errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
2248 }
2249
2250 frozen.
2251 add(0xd800).
2252 add(0xd802, 0xd805).
2253 add(wsPattern).
2254 addAll(idPattern).
2255 addAll(wsSet);
2256 if(frozen!=idSet || !(frozen==idSet)) {
2257 errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
2258 }
2259
2260 frozen.
2261 retain(0x62).
2262 retain(0x64, 0x69).
2263 retainAll(wsPattern).
2264 retainAll(wsSet);
2265 if(frozen!=idSet || !(frozen==idSet)) {
2266 errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
2267 }
2268
2269 frozen.
2270 remove(0x62).
2271 remove(0x64, 0x69).
2272 remove(idPattern).
2273 removeAll(idPattern).
2274 removeAll(idSet);
2275 if(frozen!=idSet || !(frozen==idSet)) {
2276 errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
2277 }
2278
2279 frozen.
2280 complement().
2281 complement(0x62).
2282 complement(0x64, 0x69).
2283 complement(idPattern).
2284 complementAll(idPattern).
2285 complementAll(idSet);
2286 if(frozen!=idSet || !(frozen==idSet)) {
2287 errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
2288 }
2289 }
2290
2291 // Test span() etc. -------------------------------------------------------- ***
2292
2293 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2294 static int32_t
2295 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
2296 UErrorCode errorCode=U_ZERO_ERROR;
2297 int32_t length8=0;
2298 u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2299 if(U_SUCCESS(errorCode)) {
2300 return length8;
2301 } else {
2302 // The string contains an unpaired surrogate.
2303 // Ignore this string.
2304 return 0;
2305 }
2306 }
2307
2308 class UnicodeSetWithStringsIterator;
2309
2310 // Make the strings in a UnicodeSet easily accessible.
2311 class UnicodeSetWithStrings {
2312 public:
2313 UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2314 set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
2315 int32_t size=set.size();
2316 if(size>0 && set.charAt(size-1)<0) {
2317 // If a set's last element is not a code point, then it must contain strings.
2318 // Iterate over the set, skip all code point ranges, and cache the strings.
2319 // Convert them to UTF-8 for spanUTF8().
2320 UnicodeSetIterator iter(set);
2321 const UnicodeString *s;
2322 char *s8=utf8;
2323 int32_t length8, utf8Count=0;
2324 while(iter.nextRange() && stringsLength<LENGTHOF(strings)) {
2325 if(iter.isString()) {
2326 // Store the pointer to the set's string element
2327 // which we happen to know is a stable pointer.
2328 strings[stringsLength]=s=&iter.getString();
2329 utf8Count+=
2330 utf8Lengths[stringsLength]=length8=
2331 appendUTF8(s->getBuffer(), s->length(),
2332 s8, (int32_t)(sizeof(utf8)-utf8Count));
2333 if(length8==0) {
2334 hasSurrogates=TRUE; // Contains unpaired surrogates.
2335 }
2336 s8+=length8;
2337 ++stringsLength;
2338 }
2339 }
2340 }
2341 }
2342
2343 const UnicodeSet &getSet() const {
2344 return set;
2345 }
2346
2347 UBool hasStrings() const {
2348 return (UBool)(stringsLength>0);
2349 }
2350
2351 UBool hasStringsWithSurrogates() const {
2352 return hasSurrogates;
2353 }
2354
2355 private:
2356 friend class UnicodeSetWithStringsIterator;
2357
2358 const UnicodeSet &set;
2359
2360 const UnicodeString *strings[20];
2361 int32_t stringsLength;
2362 UBool hasSurrogates;
2363
2364 char utf8[1024];
2365 int32_t utf8Lengths[20];
2366
2367 int32_t nextStringIndex;
2368 int32_t nextUTF8Start;
2369 };
2370
2371 class UnicodeSetWithStringsIterator {
2372 public:
2373 UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2374 fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2375 }
2376
2377 void reset() {
2378 nextStringIndex=nextUTF8Start=0;
2379 }
2380
2381 const UnicodeString *nextString() {
2382 if(nextStringIndex<fSet.stringsLength) {
2383 return fSet.strings[nextStringIndex++];
2384 } else {
2385 return NULL;
2386 }
2387 }
2388
2389 // Do not mix with calls to nextString().
2390 const char *nextUTF8(int32_t &length) {
2391 if(nextStringIndex<fSet.stringsLength) {
2392 const char *s8=fSet.utf8+nextUTF8Start;
2393 nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2394 return s8;
2395 } else {
2396 length=0;
2397 return NULL;
2398 }
2399 }
2400
2401 private:
2402 const UnicodeSetWithStrings &fSet;
2403 int32_t nextStringIndex;
2404 int32_t nextUTF8Start;
2405 };
2406
2407 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
2408 // at code point boundaries.
2409 // That is, each edge of a match must not be in the middle of a surrogate pair.
2410 static inline UBool
2411 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
2412 s+=start;
2413 limit-=start;
2414 int32_t length=t.length();
2415 return 0==t.compare(s, length) &&
2416 !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2417 !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2418 }
2419
2420 // Implement span() with contains() for comparison.
// Returns the length of the initial part of s[0..length) that satisfies
// spanCondition, computed with contains() on single code points and with
// direct comparison against the strings cached in set.
// Three cases are handled separately: a set without strings,
// USET_SPAN_NOT_CONTAINED with strings, and
// USET_SPAN_CONTAINED / USET_SPAN_SIMPLE with strings.
2421 static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2422 USetSpanCondition spanCondition) {
2423 const UnicodeSet &realSet(set.getSet());
2424 if(!set.hasStrings()) {
// spanCondition is compared directly with the result of contains() below,
// so any "contained" condition is reduced to USET_SPAN_CONTAINED first.
2425 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2426 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2427 }
2428
2429 UChar32 c;
2430 int32_t start=0, prev;
// prev is the index of the code point being tested; it is returned as the
// span length when that code point fails the condition or the text ends.
2431 while((prev=start)<length) {
2432 U16_NEXT(s, start, length, c);
2433 if(realSet.contains(c)!=spanCondition) {
2434 break;
2435 }
2436 }
2437 return prev;
2438 } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
// Stop at the first index where a contained code point or any set string starts.
2439 UnicodeSetWithStringsIterator iter(set);
2440 UChar32 c;
2441 int32_t start, next;
2442 for(start=next=0; start<length;) {
2443 U16_NEXT(s, next, length, c);
2444 if(realSet.contains(c)) {
2445 break;
2446 }
2447 const UnicodeString *str;
2448 iter.reset();
2449 while((str=iter.nextString())!=NULL) {
2450 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2451 // spanNeedsStrings=TRUE;
2452 return start;
2453 }
2454 }
2455 start=next;
2456 }
2457 return start;
2458 } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
// At each start index, next is the index from which the loop continues;
// next==start means nothing matched at start so far.
// maxSpanLimit is the farthest limit found through the recursive calls.
2459 UnicodeSetWithStringsIterator iter(set);
2460 UChar32 c;
2461 int32_t start, next, maxSpanLimit=0;
2462 for(start=next=0; start<length;) {
2463 U16_NEXT(s, next, length, c);
2464 if(!realSet.contains(c)) {
2465 next=start; // Do not span this single, not-contained code point.
2466 }
2467 const UnicodeString *str;
2468 iter.reset();
2469 while((str=iter.nextString())!=NULL) {
2470 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2471 // spanNeedsStrings=TRUE;
2472 int32_t matchLimit=start+str->length();
2473 if(matchLimit==length) {
2474 return length;
2475 }
2476 if(spanCondition==USET_SPAN_CONTAINED) {
2477 // Iterate for the shortest match at each position.
2478 // Recurse for each but the shortest match.
2479 if(next==start) {
2480 next=matchLimit; // First match from start.
2481 } else {
2482 if(matchLimit<next) {
2483 // Remember shortest match from start for iteration.
2484 int32_t temp=next;
2485 next=matchLimit;
2486 matchLimit=temp;
2487 }
2488 // Recurse for non-shortest match from start.
2489 int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
2490 USET_SPAN_CONTAINED);
2491 if((matchLimit+spanLength)>maxSpanLimit) {
2492 maxSpanLimit=matchLimit+spanLength;
2493 if(maxSpanLimit==length) {
2494 return length;
2495 }
2496 }
2497 }
2498 } else /* spanCondition==USET_SPAN_SIMPLE */ {
2499 if(matchLimit>next) {
2500 // Remember longest match from start.
2501 next=matchLimit;
2502 }
2503 }
2504 }
2505 }
2506 if(next==start) {
2507 break; // No match from start.
2508 }
2509 start=next;
2510 }
// The result is the larger of the iterated limit and the best recursive limit.
2511 if(start>maxSpanLimit) {
2512 return start;
2513 } else {
2514 return maxSpanLimit;
2515 }
2516 }
2517 }
2518
// Backward counterpart of containsSpanUTF16(): starting at the end of
// s[0..length), returns the start index of the trailing part that satisfies
// spanCondition, computed with contains() on single code points and with
// direct comparison against the strings cached in set.
// Returns 0 right away for an empty text.
2519 static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2520 USetSpanCondition spanCondition) {
2521 if(length==0) {
2522 return 0;
2523 }
2524 const UnicodeSet &realSet(set.getSet());
2525 if(!set.hasStrings()) {
// spanCondition is compared directly with the result of contains() below,
// so any "contained" condition is reduced to USET_SPAN_CONTAINED first.
2526 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2527 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2528 }
2529
2530 UChar32 c;
2531 int32_t prev=length;
// U16_PREV() moves length back over the code point it reads into c;
// prev keeps the limit of that code point and is returned when it fails.
2532 do {
2533 U16_PREV(s, 0, length, c);
2534 if(realSet.contains(c)!=spanCondition) {
2535 break;
2536 }
2537 } while((prev=length)>0);
2538 return prev;
2539 } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
// Stop at the first limit (going backward) where a contained code point
// or any set string ends. length0 keeps the original text length.
2540 UnicodeSetWithStringsIterator iter(set);
2541 UChar32 c;
2542 int32_t prev=length, length0=length;
2543 do {
2544 U16_PREV(s, 0, length, c);
2545 if(realSet.contains(c)) {
2546 break;
2547 }
2548 const UnicodeString *str;
2549 iter.reset();
2550 while((str=iter.nextString())!=NULL) {
2551 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2552 // spanNeedsStrings=TRUE;
2553 return prev;
2554 }
2555 }
2556 } while((prev=length)>0);
2557 return prev;
2558 } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
// prev is the current limit; length is the index from which the loop
// continues, and length==prev means nothing matched before prev so far.
// minSpanStart is the smallest start found through the recursive calls.
2559 UnicodeSetWithStringsIterator iter(set);
2560 UChar32 c;
2561 int32_t prev=length, minSpanStart=length, length0=length;
2562 do {
2563 U16_PREV(s, 0, length, c);
2564 if(!realSet.contains(c)) {
2565 length=prev; // Do not span this single, not-contained code point.
2566 }
2567 const UnicodeString *str;
2568 iter.reset();
2569 while((str=iter.nextString())!=NULL) {
2570 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2571 // spanNeedsStrings=TRUE;
2572 int32_t matchStart=prev-str->length();
2573 if(matchStart==0) {
2574 return 0;
2575 }
2576 if(spanCondition==USET_SPAN_CONTAINED) {
2577 // Iterate for the shortest match at each position.
2578 // Recurse for each but the shortest match.
2579 if(length==prev) {
2580 length=matchStart; // First match from prev.
2581 } else {
2582 if(matchStart>length) {
2583 // Remember shortest match from prev for iteration.
2584 int32_t temp=length;
2585 length=matchStart;
2586 matchStart=temp;
2587 }
2588 // Recurse for non-shortest match from prev.
2589 int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
2590 USET_SPAN_CONTAINED);
2591 if(spanStart<minSpanStart) {
2592 minSpanStart=spanStart;
2593 if(minSpanStart==0) {
2594 return 0;
2595 }
2596 }
2597 }
2598 } else /* spanCondition==USET_SPAN_SIMPLE */ {
2599 if(matchStart<length) {
2600 // Remember longest match from prev.
2601 length=matchStart;
2602 }
2603 }
2604 }
2605 }
2606 if(length==prev) {
2607 break; // No match from prev.
2608 }
2609 } while((prev=length)>0);
// The result is the smaller of the iterated start and the best recursive start.
2610 if(prev<minSpanStart) {
2611 return prev;
2612 } else {
2613 return minSpanStart;
2614 }
2615 }
2616 }
2617
// UTF-8 counterpart of containsSpanUTF16(): returns the length in bytes of the
// initial part of s[0..length) that satisfies spanCondition, computed with
// contains() on single code points and with memcmp() against the UTF-8 forms
// of the strings cached in set.
// A negative code point from U8_NEXT() is looked up as U+FFFD.
2618 static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2619 USetSpanCondition spanCondition) {
2620 const UnicodeSet &realSet(set.getSet());
2621 if(!set.hasStrings()) {
// spanCondition is compared directly with the result of contains() below,
// so any "contained" condition is reduced to USET_SPAN_CONTAINED first.
2622 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2623 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2624 }
2625
2626 UChar32 c;
2627 int32_t start=0, prev;
// prev is the index of the code point being tested; it is returned as the
// span length when that code point fails the condition or the text ends.
2628 while((prev=start)<length) {
2629 U8_NEXT(s, start, length, c);
2630 if(c<0) {
2631 c=0xfffd;
2632 }
2633 if(realSet.contains(c)!=spanCondition) {
2634 break;
2635 }
2636 }
2637 return prev;
2638 } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
// Stop at the first index where a contained code point or any set string starts.
2639 UnicodeSetWithStringsIterator iter(set);
2640 UChar32 c;
2641 int32_t start, next;
2642 for(start=next=0; start<length;) {
2643 U8_NEXT(s, next, length, c);
2644 if(c<0) {
2645 c=0xfffd;
2646 }
2647 if(realSet.contains(c)) {
2648 break;
2649 }
2650 const char *s8;
2651 int32_t length8;
2652 iter.reset();
// Cached strings with a UTF-8 length of 0 are skipped.
2653 while((s8=iter.nextUTF8(length8))!=NULL) {
2654 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2655 // spanNeedsStrings=TRUE;
2656 return start;
2657 }
2658 }
2659 start=next;
2660 }
2661 return start;
2662 } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
// At each start index, next is the index from which the loop continues;
// next==start means nothing matched at start so far.
// maxSpanLimit is the farthest limit found through the recursive calls.
2663 UnicodeSetWithStringsIterator iter(set);
2664 UChar32 c;
2665 int32_t start, next, maxSpanLimit=0;
2666 for(start=next=0; start<length;) {
2667 U8_NEXT(s, next, length, c);
2668 if(c<0) {
2669 c=0xfffd;
2670 }
2671 if(!realSet.contains(c)) {
2672 next=start; // Do not span this single, not-contained code point.
2673 }
2674 const char *s8;
2675 int32_t length8;
2676 iter.reset();
2677 while((s8=iter.nextUTF8(length8))!=NULL) {
2678 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2679 // spanNeedsStrings=TRUE;
2680 int32_t matchLimit=start+length8;
2681 if(matchLimit==length) {
2682 return length;
2683 }
2684 if(spanCondition==USET_SPAN_CONTAINED) {
2685 // Iterate for the shortest match at each position.
2686 // Recurse for each but the shortest match.
2687 if(next==start) {
2688 next=matchLimit; // First match from start.
2689 } else {
2690 if(matchLimit<next) {
2691 // Remember shortest match from start for iteration.
2692 int32_t temp=next;
2693 next=matchLimit;
2694 matchLimit=temp;
2695 }
2696 // Recurse for non-shortest match from start.
2697 int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
2698 USET_SPAN_CONTAINED);
2699 if((matchLimit+spanLength)>maxSpanLimit) {
2700 maxSpanLimit=matchLimit+spanLength;
2701 if(maxSpanLimit==length) {
2702 return length;
2703 }
2704 }
2705 }
2706 } else /* spanCondition==USET_SPAN_SIMPLE */ {
2707 if(matchLimit>next) {
2708 // Remember longest match from start.
2709 next=matchLimit;
2710 }
2711 }
2712 }
2713 }
2714 if(next==start) {
2715 break; // No match from start.
2716 }
2717 start=next;
2718 }
// The result is the larger of the iterated limit and the best recursive limit.
2719 if(start>maxSpanLimit) {
2720 return start;
2721 } else {
2722 return maxSpanLimit;
2723 }
2724 }
2725 }
2726
// Backward UTF-8 counterpart of containsSpanUTF8(): starting at the end of
// s[0..length), returns the start index (in bytes) of the trailing part that
// satisfies spanCondition, computed with contains() on single code points and
// with memcmp() against the UTF-8 forms of the strings cached in set.
// A negative code point from U8_PREV() is looked up as U+FFFD.
// Returns 0 right away for an empty text.
2727 static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2728 USetSpanCondition spanCondition) {
2729 if(length==0) {
2730 return 0;
2731 }
2732 const UnicodeSet &realSet(set.getSet());
2733 if(!set.hasStrings()) {
// spanCondition is compared directly with the result of contains() below,
// so any "contained" condition is reduced to USET_SPAN_CONTAINED first.
2734 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2735 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2736 }
2737
2738 UChar32 c;
2739 int32_t prev=length;
// U8_PREV() moves length back over the code point it reads into c;
// prev keeps the limit of that code point and is returned when it fails.
2740 do {
2741 U8_PREV(s, 0, length, c);
2742 if(c<0) {
2743 c=0xfffd;
2744 }
2745 if(realSet.contains(c)!=spanCondition) {
2746 break;
2747 }
2748 } while((prev=length)>0);
2749 return prev;
2750 } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
// Stop at the first limit (going backward) where a contained code point
// or any set string ends.
2751 UnicodeSetWithStringsIterator iter(set);
2752 UChar32 c;
2753 int32_t prev=length;
2754 do {
2755 U8_PREV(s, 0, length, c);
2756 if(c<0) {
2757 c=0xfffd;
2758 }
2759 if(realSet.contains(c)) {
2760 break;
2761 }
2762 const char *s8;
2763 int32_t length8;
2764 iter.reset();
// Cached strings with a UTF-8 length of 0 are skipped.
2765 while((s8=iter.nextUTF8(length8))!=NULL) {
2766 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2767 // spanNeedsStrings=TRUE;
2768 return prev;
2769 }
2770 }
2771 } while((prev=length)>0);
2772 return prev;
2773 } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
// prev is the current limit; length is the index from which the loop
// continues, and length==prev means nothing matched before prev so far.
// minSpanStart is the smallest start found through the recursive calls.
2774 UnicodeSetWithStringsIterator iter(set);
2775 UChar32 c;
2776 int32_t prev=length, minSpanStart=length;
2777 do {
2778 U8_PREV(s, 0, length, c);
2779 if(c<0) {
2780 c=0xfffd;
2781 }
2782 if(!realSet.contains(c)) {
2783 length=prev; // Do not span this single, not-contained code point.
2784 }
2785 const char *s8;
2786 int32_t length8;
2787 iter.reset();
2788 while((s8=iter.nextUTF8(length8))!=NULL) {
2789 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2790 // spanNeedsStrings=TRUE;
2791 int32_t matchStart=prev-length8;
2792 if(matchStart==0) {
2793 return 0;
2794 }
2795 if(spanCondition==USET_SPAN_CONTAINED) {
2796 // Iterate for the shortest match at each position.
2797 // Recurse for each but the shortest match.
2798 if(length==prev) {
2799 length=matchStart; // First match from prev.
2800 } else {
2801 if(matchStart>length) {
2802 // Remember shortest match from prev for iteration.
2803 int32_t temp=length;
2804 length=matchStart;
2805 matchStart=temp;
2806 }
2807 // Recurse for non-shortest match from prev.
2808 int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
2809 USET_SPAN_CONTAINED);
2810 if(spanStart<minSpanStart) {
2811 minSpanStart=spanStart;
2812 if(minSpanStart==0) {
2813 return 0;
2814 }
2815 }
2816 }
2817 } else /* spanCondition==USET_SPAN_SIMPLE */ {
2818 if(matchStart<length) {
2819 // Remember longest match from prev.
2820 length=matchStart;
2821 }
2822 }
2823 }
2824 }
2825 if(length==prev) {
2826 break; // No match from prev.
2827 }
2828 } while((prev=length)>0);
// The result is the smaller of the iterated start and the best recursive start.
2829 if(prev<minSpanStart) {
2830 return prev;
2831 } else {
2832 return minSpanStart;
2833 }
2834 }
2835 }
2836
2837 // spans to be performed and compared
// Bit flags, combined into a whichSpans mask. The flags come in four groups
// of two; the third constant of each group is the OR of the two flags before
// it, and SPAN_ALL is the OR of all four groups.
2838 enum {
// Text encoding.
2839 SPAN_UTF16 =1,
2840 SPAN_UTF8 =2,
2841 SPAN_UTFS =3,
2842
// Original set and/or its complement.
2843 SPAN_SET =4,
2844 SPAN_COMPLEMENT =8,
2845 SPAN_POLARITY =0xc,
2846
// Span direction.
2847 SPAN_FWD =0x10,
2848 SPAN_BACK =0x20,
2849 SPAN_DIRS =0x30,
2850
// Which "contained" span condition to use.
2851 SPAN_CONTAINED =0x100,
2852 SPAN_SIMPLE =0x200,
2853 SPAN_CONDITION =0x300,
2854
2855 SPAN_ALL =0x33f
2856 };
2857
2858 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
2859 return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
2860 }
2861
2862 static inline int32_t slen(const void *s, UBool isUTF16) {
2863 return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
2864 }
2865
2866 /*
2867 * Count spans on a string with the method according to type and set the span limits.
2868 * The set may be the complement of the original.
2869 * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
2870 * according to the expected number of spans.
2871 * Sets typeName to an empty string if there is no such type.
2872 * Returns -1 if the span option is filtered out.
2873 */
// type selects the method: 0/1 use the containsSpan...() functions forward,
// 2/3 use UnicodeSet::span()/spanUTF8(), 4/5 use the containsSpanBack...()
// functions, 6/7 use UnicodeSet::spanBack()/spanBackUTF8(). Even types use
// USET_SPAN_CONTAINED, odd types use USET_SPAN_SIMPLE as the "contained"
// condition. Any other type sets typeName to "" and returns 0.
// Otherwise returns the number of spans found. At most limitsCapacity span
// limits are stored in limits[]; the count may be larger than that.
2874 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
2875 const void *s, int32_t length, UBool isUTF16,
2876 uint32_t whichSpans,
2877 int type, const char *&typeName,
2878 int32_t limits[], int32_t limitsCapacity,
2879 int32_t expectCount) {
2880 const UnicodeSet &realSet(set.getSet());
2881 int32_t start, count;
2882 USetSpanCondition spanCondition, firstSpanCondition, contained;
2883 UBool isForward;
2884
2885 if(type<0 || 7<type) {
2886 typeName="";
2887 return 0;
2888 }
2889
2890 static const char *const typeNames16[]={
2891 "contains", "contains(LM)",
2892 "span", "span(LM)",
2893 "containsBack", "containsBack(LM)",
2894 "spanBack", "spanBack(LM)"
2895 };
2896
2897 static const char *const typeNames8[]={
2898 "containsUTF8", "containsUTF8(LM)",
2899 "spanUTF8", "spanUTF8(LM)",
2900 "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
2901 "spanBackUTF8", "spanBackUTF8(LM)"
2902 };
2903
2904 typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
2905
2906 // filter span options
2907 if(type<=3) {
2908 // span forward
2909 if((whichSpans&SPAN_FWD)==0) {
2910 return -1;
2911 }
2912 isForward=TRUE;
2913 } else {
2914 // span backward
2915 if((whichSpans&SPAN_BACK)==0) {
2916 return -1;
2917 }
2918 isForward=FALSE;
2919 }
2920 if((type&1)==0) {
2921 // use USET_SPAN_CONTAINED
2922 if((whichSpans&SPAN_CONTAINED)==0) {
2923 return -1;
2924 }
2925 contained=USET_SPAN_CONTAINED;
2926 } else {
2927 // use USET_SPAN_SIMPLE
2928 if((whichSpans&SPAN_SIMPLE)==0) {
2929 return -1;
2930 }
2931 contained=USET_SPAN_SIMPLE;
2932 }
2933
2934 // Default first span condition for going forward with an uncomplemented set.
2935 spanCondition=USET_SPAN_NOT_CONTAINED;
2936 if(isComplement) {
2937 spanCondition=invertSpanCondition(spanCondition, contained);
2938 }
2939
2940 // First span condition for span(), used to terminate the spanBack() iteration.
2941 firstSpanCondition=spanCondition;
2942
2943 // spanBack(): Its initial span condition is span()'s last span condition,
2944 // which is the opposite of span()'s first span condition
2945 // if we expect an even number of spans.
2946 // (The loop inverts spanCondition (expectCount-1) times
2947 // before the expectCount'th span() call.)
2948 // If we do not compare forward and backward directions, then we do not have an
2949 // expectCount and just start with firstSpanCondition.
2950 if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
2951 spanCondition=invertSpanCondition(spanCondition, contained);
2952 }
2953
2954 count=0;
2955 switch(type) {
2956 case 0:
2957 case 1:
// Forward with the contains()-based reference functions.
// These need an explicit length, so a negative length is resolved first.
2958 start=0;
2959 if(length<0) {
2960 length=slen(s, isUTF16);
2961 }
2962 for(;;) {
2963 start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
2964 containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
2965 if(count<limitsCapacity) {
2966 limits[count]=start;
2967 }
2968 ++count;
2969 if(start>=length) {
2970 break;
2971 }
2972 spanCondition=invertSpanCondition(spanCondition, contained);
2973 }
2974 break;
2975 case 2:
2976 case 3:
// Forward with UnicodeSet::span()/spanUTF8(). A negative length is passed
// through unchanged, and the end of the text is then detected by a 0 unit.
2977 start=0;
2978 for(;;) {
2979 start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
2980 realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
2981 if(count<limitsCapacity) {
2982 limits[count]=start;
2983 }
2984 ++count;
2985 if(length>=0 ? start>=length :
2986 isUTF16 ? ((const UChar *)s)[start]==0 :
2987 ((const char *)s)[start]==0
2988 ) {
2989 break;
2990 }
2991 spanCondition=invertSpanCondition(spanCondition, contained);
2992 }
2993 break;
2994 case 4:
2995 case 5:
// Backward with the contains()-based reference functions.
// Limits are written from the end of limits[] toward its start.
2996 if(length<0) {
2997 length=slen(s, isUTF16);
2998 }
2999 for(;;) {
3000 ++count;
3001 if(count<=limitsCapacity) {
3002 limits[limitsCapacity-count]=length;
3003 }
3004 length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
3005 containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
// Finished once the start of the text is reached with the condition
// that a forward span() would begin with.
3006 if(length==0 && spanCondition==firstSpanCondition) {
3007 break;
3008 }
3009 spanCondition=invertSpanCondition(spanCondition, contained);
3010 }
// Move the limits to the front of the array; the byte count uses a
// hard-coded 4 per limits[] element.
3011 if(count<limitsCapacity) {
3012 memmove(limits, limits+(limitsCapacity-count), count*4);
3013 }
3014 break;
3015 case 6:
3016 case 7:
// Backward with UnicodeSet::spanBack()/spanBackUTF8().
// Limits are written from the end of limits[] toward its start.
3017 for(;;) {
3018 ++count;
3019 if(count<=limitsCapacity) {
3020 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
3021 }
3022 // Note: Length<0 is tested only for the first spanBack().
3023 // If we wanted to keep length<0 for all spanBack()s, we would have to
3024 // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
3025 length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
3026 realSet.spanBackUTF8((const char *)s, length, spanCondition);
3027 if(length==0 && spanCondition==firstSpanCondition) {
3028 break;
3029 }
3030 spanCondition=invertSpanCondition(spanCondition, contained);
3031 }
// Move the limits to the front of the array; the byte count uses a
// hard-coded 4 per limits[] element.
3032 if(count<limitsCapacity) {
3033 memmove(limits, limits+(limitsCapacity-count), count*4);
3034 }
3035 break;
3036 default:
// Not reached for type 0..7; other values already returned above.
3037 typeName="";
3038 return -1;
3039 }
3040
3041 return count;
3042 }
3043
3044 // sets to be tested; odd index=isComplement
// Indexes into arrays of SET_COUNT entries, such as setNames[].
3045 enum {
3046 SLOW,
3047 SLOW_NOT,
3048 FAST,
3049 FAST_NOT,
3050 SET_COUNT
3051 };
3052
// Display names for the SLOW..FAST_NOT set indexes, in index order.
3053 static const char *const setNames[SET_COUNT]={
3054 "slow",
3055 "slow.not",
3056 "fast",
3057 "fast.not"
3058 };
3059
3060 /*
3061 * Verify that we get the same results whether we look at text with contains(),
3062 * span() or spanBack(), using unfrozen or frozen versions of the set,
3063 * and using the set or its complement (switching the spanConditions accordingly).
3064 * The latter verifies that
3065 * set.span(spanCondition) == set.complement().span(!spanCondition).
3066 *
3067 * The expectLimits[] are either provided by the caller (with expectCount>=0)
3068 * or returned to the caller (with an input expectCount<0).
3069 */
// sets[] is indexed by SLOW, SLOW_NOT, FAST, FAST_NOT. whichSpans is a mask of
// SPAN_* flags that filters which sets and span types are run. testName and
// index are only used to label failure messages.
3070 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3071 const void *s, int32_t length, UBool isUTF16,
3072 uint32_t whichSpans,
3073 int32_t expectLimits[], int32_t &expectCount,
3074 const char *testName, int32_t index) {
3075 int32_t limits[500];
3076 int32_t limitsCount;
3077 int i, j;
3078
3079 const char *typeName;
3080 int type;
3081
3082 for(i=0; i<SET_COUNT; ++i) {
3083 if((i&1)==0) {
3084 // Even-numbered sets are original, uncomplemented sets.
3085 if((whichSpans&SPAN_SET)==0) {
3086 continue;
3087 }
3088 } else {
3089 // Odd-numbered sets are complemented.
3090 if((whichSpans&SPAN_COMPLEMENT)==0) {
3091 continue;
3092 }
3093 }
// Try every span type in turn; getSpans() signals the end of the list
// by setting typeName to an empty string.
3094 for(type=0;; ++type) {
3095 limitsCount=getSpans(*sets[i], (UBool)(i&1),
3096 s, length, isUTF16,
3097 whichSpans,
3098 type, typeName,
3099 limits, LENGTHOF(limits), expectCount);
3100 if(typeName[0]==0) {
3101 break; // All types tried.
3102 }
3103 if(limitsCount<0) {
3104 continue; // Span option filtered out.
3105 }
3106 if(expectCount<0) {
// No expectation yet: the first result becomes the expectation
// for all later set/type combinations and for the caller.
3107 expectCount=limitsCount;
3108 if(limitsCount>LENGTHOF(limits)) {
3109 errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3110 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)LENGTHOF(limits));
3111 return;
3112 }
// The byte count uses a hard-coded 4 per limits[] element.
3113 memcpy(expectLimits, limits, limitsCount*4);
3114 } else if(limitsCount!=expectCount) {
3115 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3116 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
3117 } else {
// Same count: report only the first differing limit.
3118 for(j=0; j<limitsCount; ++j) {
3119 if(limits[j]!=expectLimits[j]) {
3120 errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3121 testName, (long)index, setNames[i], typeName, (long)limitsCount,
3122 j, (long)limits[j], (long)expectLimits[j]);
3123 break;
3124 }
3125 }
3126 }
3127 }
3128 }
3129
3130 // Compare span() with containsAll()/containsNone(),
3131 // but only if we have expectLimits[] from the uncomplemented set.
3132 if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
3133 const UChar *s16=(const UChar *)s;
3134 UnicodeString string;
// Note: this local length shadows the length parameter inside this block.
3135 int32_t prev=0, limit, length;
3136 for(i=0; i<expectCount; ++i) {
3137 limit=expectLimits[i];
3138 length=limit-prev;
3139 if(length>0) {
3140 string.setTo(FALSE, s16+prev, length); // read-only alias
// Odd-numbered spans are checked with containsAll(),
// even-numbered spans with containsNone().
3141 if(i&1) {
3142 if(!sets[SLOW]->getSet().containsAll(string)) {
3143 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3144 testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3145 return;
3146 }
3147 if(!sets[FAST]->getSet().containsAll(string)) {
3148 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3149 testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3150 return;
3151 }
3152 } else {
3153 if(!sets[SLOW]->getSet().containsNone(string)) {
3154 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3155 testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3156 return;
3157 }
3158 if(!sets[FAST]->getSet().containsNone(string)) {
3159 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3160 testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3161 return;
3162 }
3163 }
3164 }
3165 prev=limit;
3166 }
3167 }
3168 }
3169
3170 // Specifically test either UTF-16 or UTF-8.
3171 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3172 const void *s, int32_t length, UBool isUTF16,
3173 uint32_t whichSpans,
3174 const char *testName, int32_t index) {
3175 int32_t expectLimits[500];
3176 int32_t expectCount=-1;
3177 testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3178 }
3179
3180 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
3181 UChar c, c2;
3182
3183 if(length>=0) {
3184 while(length>0) {
3185 c=*s++;
3186 --length;
3187 if(0xd800<=c && c<0xe000) {
3188 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3189 return TRUE;
3190 }
3191 --length;
3192 }
3193 }
3194 } else {
3195 while((c=*s++)!=0) {
3196 if(0xd800<=c && c<0xe000) {
3197 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3198 return TRUE;
3199 }
3200 }
3201 }
3202 }
3203 return FALSE;
3204 }
3205
3206 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3207 // unless either UTF is turned off in whichSpans.
3208 // Testing UTF-16 and UTF-8 together requires that surrogate code points
3209 // have the same contains(c) value as U+FFFD.
3210 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3211 const UChar *s16, int32_t length16,
3212 uint32_t whichSpans,
3213 const char *testName, int32_t index) {
3214 int32_t expectLimits[500];
3215 int32_t expectCount;
3216
3217 expectCount=-1; // Get expectLimits[] from testSpan().
3218
3219 if((whichSpans&SPAN_UTF16)!=0) {
3220 testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
3221 }
3222 if((whichSpans&SPAN_UTF8)==0) {
3223 return;
3224 }
3225
3226 // Convert s16[] and expectLimits[] to UTF-8.
3227 uint8_t s8[3000];
3228 int32_t offsets[3000];
3229
3230 const UChar *s16Limit=s16+length16;
3231 char *t=(char *)s8;
3232 char *tLimit=t+sizeof(s8);
3233 int32_t *o=offsets;
3234 UErrorCode errorCode=U_ZERO_ERROR;
3235
3236 // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3237 ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
3238 if(U_FAILURE(errorCode)) {
3239 errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3240 testName, (long)index, u_errorName(errorCode));
3241 ucnv_resetFromUnicode(utf8Cnv);
3242 return;
3243 }
3244 int32_t length8=(int32_t)(t-(char *)s8);
3245
3246 // Convert expectLimits[].
3247 int32_t i, j, expect;
3248 for(i=j=0; i<expectCount; ++i) {
3249 expect=expectLimits[i];
3250 if(expect==length16) {
3251 expectLimits[i]=length8;
3252 } else {
3253 while(offsets[j]<expect) {
3254 ++j;
3255 }
3256 expectLimits[i]=j;
3257 }
3258 }
3259
3260 testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
3261 }
3262
3263 static UChar32 nextCodePoint(UChar32 c) {
3264 // Skip some large and boring ranges.
3265 switch(c) {
3266 case 0x3441:
3267 return 0x4d7f;
3268 case 0x5100:
3269 return 0x9f00;
3270 case 0xb040:
3271 return 0xd780;
3272 case 0xe041:
3273 return 0xf8fe;
3274 case 0x10100:
3275 return 0x20000;
3276 case 0x20041:
3277 return 0xe0000;
3278 case 0xe0101:
3279 return 0x10fffd;
3280 default:
3281 return c+1;
3282 }
3283 }
3284
3285 // Verify that all implementations represent the same set.
3286 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3287 // contains(U+FFFD) is inconsistent with contains(some surrogates),
3288 // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3289 // Skip the UTF-8 part of the test - if the string contains surrogates -
3290 // because it is likely to produce a different result.
3291 UBool inconsistentSurrogates=
3292 (!(sets[0]->getSet().contains(0xfffd) ?
3293 sets[0]->getSet().contains(0xd800, 0xdfff) :
3294 sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3295 sets[0]->hasStringsWithSurrogates());
3296
3297 UChar s[1000];
3298 int32_t length=0;
3299 uint32_t localWhichSpans;
3300
3301 UChar32 c, first;
3302 for(first=c=0;; c=nextCodePoint(c)) {
3303 if(c>0x10ffff || length>(LENGTHOF(s)-U16_MAX_LENGTH)) {
3304 localWhichSpans=whichSpans;
3305 if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3306 localWhichSpans&=~SPAN_UTF8;
3307 }
3308 testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3309 if(c>0x10ffff) {
3310 break;
3311 }
3312 length=0;
3313 first=c;
3314 }
3315 U16_APPEND_UNSAFE(s, length, c);
3316 }
3317 }
3318
3319 // Test with a particular, interesting string.
3320 // Specify length and try NUL-termination.
3321 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3322 static const UChar s[]={
3323 0x61, 0x62, 0x20, // Latin, space
3324 0x3b1, 0x3b2, 0x3b3, // Greek
3325 0xd900, // lead surrogate
3326 0x3000, 0x30ab, 0x30ad, // wide space, Katakana
3327 0xdc05, // trail surrogate
3328 0xa0, 0xac00, 0xd7a3, // nbsp, Hangul
3329 0xd900, 0xdc05, // unassigned supplementary
3330 0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary
3331 0xd7a4, 0xdc05, 0xd900, 0x2028, // unassigned, surrogates in wrong order, LS
3332 0 // NUL
3333 };
3334
3335 if((whichSpans&SPAN_UTF16)==0) {
3336 return;
3337 }
3338 testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
3339 testSpan(sets, s, LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
3340 }
3341
3342 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3343 static const char s[]={
3344 "abc" // Latin
3345
3346 /* trail byte in lead position */
3347 "\x80"
3348
3349 " " // space
3350
3351 /* truncated multi-byte sequences */
3352 "\xd0"
3353 "\xe0"
3354 "\xe1"
3355 "\xed"
3356 "\xee"
3357 "\xf0"
3358 "\xf1"
3359 "\xf4"
3360 "\xf8"
3361 "\xfc"
3362
3363 "\xCE\xB1\xCE\xB2\xCE\xB3" // Greek
3364
3365 /* trail byte in lead position */
3366 "\x80"
3367
3368 "\xe0\x80"
3369 "\xe0\xa0"
3370 "\xe1\x80"
3371 "\xed\x80"
3372 "\xed\xa0"
3373 "\xee\x80"
3374 "\xf0\x80"
3375 "\xf0\x90"
3376 "\xf1\x80"
3377 "\xf4\x80"
3378 "\xf4\x90"
3379 "\xf8\x80"
3380 "\xfc\x80"
3381
3382 "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD" // wide space, Katakana
3383
3384 /* trail byte in lead position */
3385 "\x80"
3386
3387 "\xf0\x80\x80"
3388 "\xf0\x90\x80"
3389 "\xf1\x80\x80"
3390 "\xf4\x80\x80"
3391 "\xf4\x90\x80"
3392 "\xf8\x80\x80"
3393 "\xfc\x80\x80"
3394
3395 "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3" // nbsp, Hangul
3396
3397 /* trail byte in lead position */
3398 "\x80"
3399
3400 "\xf8\x80\x80\x80"
3401 "\xfc\x80\x80\x80"
3402
3403 "\xF1\x90\x80\x85" // unassigned supplementary
3404
3405 /* trail byte in lead position */
3406 "\x80"
3407
3408 "\xfc\x80\x80\x80\x80"
3409
3410 "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE" // Han supplementary
3411
3412 /* trail byte in lead position */
3413 "\x80"
3414
3415 /* complete sequences but non-shortest forms or out of range etc. */
3416 "\xc0\x80"
3417 "\xe0\x80\x80"
3418 "\xed\xa0\x80"
3419 "\xf0\x80\x80\x80"
3420 "\xf4\x90\x80\x80"
3421 "\xf8\x80\x80\x80\x80"
3422 "\xfc\x80\x80\x80\x80\x80"
3423 "\xfe"
3424 "\xff"
3425
3426 /* trail byte in lead position */
3427 "\x80"
3428
3429 "\xED\x9E\xA4\xE2\x80\xA8" // unassigned, LS, NUL-terminated
3430 };
3431
3432 if((whichSpans&SPAN_UTF8)==0) {
3433 return;
3434 }
3435 testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
3436 testSpan(sets, s, LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
3437 }
3438
// Multiply a list of span option sets so that each resulting portion
// carries exactly one of the alternative options a, b and c.
// Every existing entry is first reduced with mask; then
//   - the entry itself gets a,
//   - if b!=0, a copy in a second portion gets b,
//   - if b!=0 and c!=0, a copy in a third portion gets c.
// (c is ignored when b==0.)
// The portions are stored back to back in whichSpans[], which must have room
// for the returned number of entries: 1x, 2x or 3x whichSpansCount.
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    // How many portions the list is multiplied into.
    int32_t portions;
    if(b==0) {
        portions=1;
    } else if(c==0) {
        portions=2;
    } else {
        portions=3;
    }

    for(int32_t k=0; k<whichSpansCount; ++k) {
        const uint32_t base=whichSpans[k]&mask;
        whichSpans[k]=base|a;
        if(portions>=2) {
            whichSpans[whichSpansCount+k]=base|b;
        }
        if(portions==3) {
            whichSpans[2*whichSpansCount+k]=base|c;
        }
    }
    return portions*whichSpansCount;
}
3461
// String literal fragments consisting of a single repeated letter; going by
// the macro names, 63 resp. 64 repetitions of 'a' or 'b'.
// TestSpan() concatenates them to build set strings and test strings
// with very long runs of one character.
#define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
#define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
#define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
#define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3466
3467 void UnicodeSetTest::TestSpan() {
3468 // "[...]" is a UnicodeSet pattern.
3469 // "*" performs tests on all Unicode code points and on a selection of
3470 // malformed UTF-8/16 strings.
3471 // "-options" limits the scope of testing for the current set.
3472 // By default, the test verifies that equivalent boundaries are found
3473 // for UTF-16 and UTF-8, going forward and backward,
3474 // alternating USET_SPAN_NOT_CONTAINED with
3475 // either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3476 // Single-character options:
3477 // 8 -- UTF-16 and UTF-8 boundaries may differ.
3478 // Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3479 // or the set contains strings with unpaired surrogates
3480 // which do not translate to valid UTF-8.
3481 // c -- set.span() and set.complement().span() boundaries may differ.
3482 // Cause: Set strings are not complemented.
3483 // b -- span() and spanBack() boundaries may differ.
3484 // Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3485 // and spanBack(USET_SPAN_SIMPLE) are defined to
3486 // match with non-overlapping substrings.
3487 // For example, with a set containing "ab" and "ba",
3488 // span() of "aba" yields boundaries { 0, 2, 3 }
3489 // because the initial "ab" matches from 0 to 2,
3490 // while spanBack() yields boundaries { 0, 1, 3 }
3491 // because the final "ba" matches from 1 to 3.
3492 // l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3493 // Cause: Strings in the set overlap, and a longer match may
3494 // require a sequence including non-longest substrings.
3495 // For example, with a set containing "ab", "abc" and "cd",
3496 // span(contained) of "abcd" spans the entire string
3497 // but span(longest match) only spans the first 3 characters.
3498 // Each "-options" first resets all options and then applies the specified options.
3499 // A "-" without options resets the options.
3500 // The options are also reset for each new set.
3501 // Other strings will be spanned.
3502 static const char *const testdata[]={
3503 "[:ID_Continue:]",
3504 "*",
3505 "[:White_Space:]",
3506 "*",
3507 "[]",
3508 "*",
3509 "[\\u0000-\\U0010FFFF]",
3510 "*",
3511 "[\\u0000\\u0080\\u0800\\U00010000]",
3512 "*",
3513 "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3514 "*",
3515 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3516 "-c",
3517 "*",
3518 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3519 "-c",
3520 "*",
3521
3522 // Overlapping strings cause overlapping attempts to match.
3523 "[x{xy}{xya}{axy}{ax}]",
3524 "-cl",
3525
3526 // More repetitions of "xya" would take too long with the recursive
3527 // reference implementation.
3528 // containsAll()=FALSE
3529 // test_string 0x14
3530 "xx"
3531 "xyaxyaxyaxya" // set.complement().span(longest match) will stop here.
3532 "xx" // set.complement().span(contained) will stop between the two 'x'es.
3533 "xyaxyaxyaxya"
3534 "xx"
3535 "xyaxyaxyaxya" // span() ends here.
3536 "aaa",
3537
3538 // containsAll()=TRUE
3539 // test_string 0x15
3540 "xx"
3541 "xyaxyaxyaxya"
3542 "xx"
3543 "xyaxyaxyaxya"
3544 "xx"
3545 "xyaxyaxyaxy",
3546
3547 "-bc",
3548 // test_string 0x17
3549 "byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 }
3550 "-c",
3551 "byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 }
3552 "byayax", // span() -> { 4, 6 } complement.span() -> { 6 }
3553 "-",
3554 "byaya", // span() -> { 5 }
3555 "byay", // span() -> { 4 }
3556 "bya", // span() -> { 3 }
3557
3558 // span(longest match) will not span the whole string.
3559 "[a{ab}{bc}]",
3560 "-cl",
3561 // test_string 0x21
3562 "abc",
3563
3564 "[a{ab}{abc}{cd}]",
3565 "-cl",
3566 "acdabcdabccd",
3567
3568 // spanBack(longest match) will not span the whole string.
3569 "[c{ab}{bc}]",
3570 "-cl",
3571 "abc",
3572
3573 "[d{cd}{bcd}{ab}]",
3574 "-cl",
3575 "abbcdabcdabd",
3576
3577 // Test with non-ASCII set strings - test proper handling of surrogate pairs
3578 // and UTF-8 trail bytes.
3579 // Copies of above test sets and strings, but transliterated to have
3580 // different code points with similar trail units.
3581 // Previous: a b c d
3582 // Unicode: 042B 30AB 200AB 204AB
3583 // UTF-16: 042B 30AB D840 DCAB D841 DCAB
3584 // UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB
3585 "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3586 "-cl",
3587 "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3588
3589 "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3590 "-cl",
3591 "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3592
3593 // Stress bookkeeping and recursion.
3594 // The following strings are barely doable with the recursive
3595 // reference implementation.
3596 // The not-contained character at the end prevents an early exit from the span().
3597 "[b{bb}]",
3598 "-c",
3599 // test_string 0x33
3600 "bbbbbbbbbbbbbbbbbbbbbbbb-",
3601 // On complement sets, span() and spanBack() get different results
3602 // because b is not in the complement set and there is an odd number of b's
3603 // in the test string.
3604 "-bc",
3605 "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3606
3607 // Test with set strings with an initial or final code point span
3608 // longer than 254.
3609 "[a{" _64_a _64_a _64_a _64_a "b}"
3610 "{a" _64_b _64_b _64_b _64_b "}]",
3611 "-c",
3612 _64_a _64_a _64_a _63_a "b",
3613 _64_a _64_a _64_a _64_a "b",
3614 _64_a _64_a _64_a _64_a "aaaabbbb",
3615 "a" _64_b _64_b _64_b _63_b,
3616 "a" _64_b _64_b _64_b _64_b,
3617 "aaaabbbb" _64_b _64_b _64_b _64_b,
3618
3619 // Test with strings containing unpaired surrogates.
3620 // They are not representable in UTF-8, and a leading trail surrogate
3621 // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3622 // U+20001 == \\uD840\\uDC01
3623 // U+20400 == \\uD841\\uDC00
3624 "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3625 "-8cl",
3626 "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3627 };
3628 uint32_t whichSpans[96]={ SPAN_ALL };
3629 int32_t whichSpansCount=1;
3630
3631 UnicodeSet *sets[SET_COUNT]={ NULL };
3632 const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
3633
3634 char testName[1024];
3635 char *testNameLimit=testName;
3636
3637 int32_t i, j;
3638 for(i=0; i<LENGTHOF(testdata); ++i) {
3639 const char *s=testdata[i];
3640 if(s[0]=='[') {
3641 // Create new test sets from this pattern.
3642 for(j=0; j<SET_COUNT; ++j) {
3643 delete sets_with_str[j];
3644 delete sets[j];
3645 }
3646 UErrorCode errorCode=U_ZERO_ERROR;
3647 sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3648 if(U_FAILURE(errorCode)) {
3649 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
3650 break;
3651 }
3652 sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3653 sets[SLOW_NOT]->complement();
3654 // Intermediate set: Test cloning of a frozen set.
3655 UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3656 fast->freeze();
3657 sets[FAST]=(UnicodeSet *)fast->clone();
3658 delete fast;
3659 UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3660 fastNot->freeze();
3661 sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
3662 delete fastNot;
3663
3664 for(j=0; j<SET_COUNT; ++j) {
3665 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3666 }
3667
3668 strcpy(testName, s);
3669 testNameLimit=strchr(testName, 0);
3670 *testNameLimit++=':';
3671 *testNameLimit=0;
3672
3673 whichSpans[0]=SPAN_ALL;
3674 whichSpansCount=1;
3675 } else if(s[0]=='-') {
3676 whichSpans[0]=SPAN_ALL;
3677 whichSpansCount=1;
3678
3679 while(*++s!=0) {
3680 switch(*s) {
3681 case 'c':
3682 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3683 ~SPAN_POLARITY,
3684 SPAN_SET,
3685 SPAN_COMPLEMENT,
3686 0);
3687 break;
3688 case 'b':
3689 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3690 ~SPAN_DIRS,
3691 SPAN_FWD,
3692 SPAN_BACK,
3693 0);
3694 break;
3695 case 'l':
3696 // test USET_SPAN_CONTAINED FWD & BACK, and separately
3697 // USET_SPAN_SIMPLE only FWD, and separately
3698 // USET_SPAN_SIMPLE only BACK
3699 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3700 ~(SPAN_DIRS|SPAN_CONDITION),
3701 SPAN_DIRS|SPAN_CONTAINED,
3702 SPAN_FWD|SPAN_SIMPLE,
3703 SPAN_BACK|SPAN_SIMPLE);
3704 break;
3705 case '8':
3706 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3707 ~SPAN_UTFS,
3708 SPAN_UTF16,
3709 SPAN_UTF8,
3710 0);
3711 break;
3712 default:
3713 errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3714 break;
3715 }
3716 }
3717 } else if(0==strcmp(s, "*")) {
3718 strcpy(testNameLimit, "bad_string");
3719 for(j=0; j<whichSpansCount; ++j) {
3720 if(whichSpansCount>1) {
3721 sprintf(testNameLimit+10 /* strlen("bad_string") */,
3722 "%%0x%3x",
3723 whichSpans[j]);
3724 }
3725 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3726 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3727 }
3728
3729 strcpy(testNameLimit, "contents");
3730 for(j=0; j<whichSpansCount; ++j) {
3731 if(whichSpansCount>1) {
3732 sprintf(testNameLimit+8 /* strlen("contents") */,
3733 "%%0x%3x",
3734 whichSpans[j]);
3735 }
3736 testSpanContents(sets_with_str, whichSpans[j], testName);
3737 }
3738 } else {
3739 UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3740 strcpy(testNameLimit, "test_string");
3741 for(j=0; j<whichSpansCount; ++j) {
3742 if(whichSpansCount>1) {
3743 sprintf(testNameLimit+11 /* strlen("test_string") */,
3744 "%%0x%3x",
3745 whichSpans[j]);
3746 }
3747 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3748 }
3749 }
3750 }
3751 for(j=0; j<SET_COUNT; ++j) {
3752 delete sets_with_str[j];
3753 delete sets[j];
3754 }
3755 }
3756
3757 // Test select patterns and strings, and test USET_SPAN_SIMPLE.
3758 void UnicodeSetTest::TestStringSpan() {
3759 static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3760 static const char *const string=
3761 "xx"
3762 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3763 "xx"
3764 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3765 "xx"
3766 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3767 "aaaa";
3768
3769 UErrorCode errorCode=U_ZERO_ERROR;
3770 UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3771 UnicodeSet set(pattern16, errorCode);
3772 if(U_FAILURE(errorCode)) {
3773 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3774 return;
3775 }
3776
3777 UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3778
3779 if(set.containsAll(string16)) {
3780 errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
3781 }
3782
3783 // Remove trailing "aaaa".
3784 string16.truncate(string16.length()-4);
3785 if(!set.containsAll(string16)) {
3786 errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
3787 }
3788
3789 string16=UNICODE_STRING_SIMPLE("byayaxya");
3790 const UChar *s16=string16.getBuffer();
3791 int32_t length16=string16.length();
3792 if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3793 set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3794 set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3795 set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3796 set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3797 set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3798 ) {
3799 errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3800 }
3801
3802 pattern="[a{ab}{abc}{cd}]";
3803 pattern16=UnicodeString(pattern, -1, US_INV);
3804 set.applyPattern(pattern16, errorCode);
3805 if(U_FAILURE(errorCode)) {
3806 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3807 return;
3808 }
3809 string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
3810 s16=string16.getBuffer();
3811 length16=string16.length();
3812 if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3813 set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3814 set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3815 ) {
3816 errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3817 }
3818
3819 pattern="[d{cd}{bcd}{ab}]";
3820 pattern16=UnicodeString(pattern, -1, US_INV);
3821 set.applyPattern(pattern16, errorCode).freeze();
3822 if(U_FAILURE(errorCode)) {
3823 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3824 return;
3825 }
3826 string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
3827 s16=string16.getBuffer();
3828 length16=string16.length();
3829 if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3830 set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3831 set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3832 ) {
3833 errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3834 }
3835 }