]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/usettest.cpp
ICU-57132.0.1.tar.gz
[apple/icu.git] / icuSources / test / intltest / usettest.cpp
1 /*
2 ********************************************************************************
3 * Copyright (C) 1999-2016 International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************************
6 * Date Name Description
7 * 10/20/99 alan Creation.
8 * 03/22/2000 Madhu Added additional tests
9 ********************************************************************************
10 */
11
12 #include <stdio.h>
13
14 #include <string.h>
15 #include "unicode/utypes.h"
16 #include "usettest.h"
17 #include "unicode/ucnv.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/usetiter.h"
21 #include "unicode/ustring.h"
22 #include "unicode/parsepos.h"
23 #include "unicode/symtable.h"
24 #include "unicode/uversion.h"
25 #include "cmemory.h"
26 #include "hash.h"
27
// Calls dataerrln() with the current file, line and ICU error name when
// `status` holds a failure code.
#define TEST_ASSERT_SUCCESS(status) { \
    if (U_FAILURE(status)) { \
        dataerrln("fail in file \"%s\", line %d: \"%s\"", \
                  __FILE__, __LINE__, u_errorName(status)); \
    } \
}

// Calls dataerrln() with the current file and line when `expr` is false.
#define TEST_ASSERT(expr) { \
    if (!(expr)) { \
        dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); \
    } \
}
34
35 UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
36 UnicodeString pat;
37 set.toPattern(pat);
38 return left + UnicodeSetTest::escape(pat);
39 }
40
// Expands to one `case` label for a switch over a test index: it stores the
// test's name in `name` and, when `exec` is set, logs a banner and calls the
// test method. The trailing `break` is terminated by the caller's semicolon.
#define CASE(id,test)           \
    case id:                    \
        name = #test;           \
        if (exec) {             \
            logln(#test "---"); \
            logln();            \
            test();             \
        }                       \
        break
49
50 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
51 }
52
53 UConverter *UnicodeSetTest::openUTF8Converter() {
54 if(utf8Cnv==NULL) {
55 UErrorCode errorCode=U_ZERO_ERROR;
56 utf8Cnv=ucnv_open("UTF-8", &errorCode);
57 }
58 return utf8Cnv;
59 }
60
61 UnicodeSetTest::~UnicodeSetTest() {
62 ucnv_close(utf8Cnv);
63 }
64
65 void
66 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
67 const char* &name, char* /*par*/) {
68 // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
69 switch (index) {
70 CASE(0,TestPatterns);
71 CASE(1,TestAddRemove);
72 CASE(2,TestCategories);
73 CASE(3,TestCloneEqualHash);
74 CASE(4,TestMinimalRep);
75 CASE(5,TestAPI);
76 CASE(6,TestScriptSet);
77 CASE(7,TestPropertySet);
78 CASE(8,TestClone);
79 CASE(9,TestExhaustive);
80 CASE(10,TestToPattern);
81 CASE(11,TestIndexOf);
82 CASE(12,TestStrings);
83 CASE(13,Testj2268);
84 CASE(14,TestCloseOver);
85 CASE(15,TestEscapePattern);
86 CASE(16,TestInvalidCodePoint);
87 CASE(17,TestSymbolTable);
88 CASE(18,TestSurrogate);
89 CASE(19,TestPosixClasses);
90 CASE(20,TestIteration);
91 CASE(21,TestFreezable);
92 CASE(22,TestSpan);
93 CASE(23,TestStringSpan);
94 CASE(24,TestUCAUnsafeBackwards);
95 default: name = ""; break;
96 }
97 }
98
99 static const char NOT[] = "%%%%";
100
101 /**
102 * UVector was improperly copying contents
103 * This code will crash this is still true
104 */
105 void UnicodeSetTest::Testj2268() {
106 UnicodeSet t;
107 t.add(UnicodeString("abc"));
108 UnicodeSet test(t);
109 UnicodeString ustrPat;
110 test.toPattern(ustrPat, TRUE);
111 }
112
113 /**
114 * Test toPattern().
115 */
116 void UnicodeSetTest::TestToPattern() {
117 UErrorCode ec = U_ZERO_ERROR;
118
119 // Test that toPattern() round trips with syntax characters and
120 // whitespace.
121 {
122 static const char* OTHER_TOPATTERN_TESTS[] = {
123 "[[:latin:]&[:greek:]]",
124 "[[:latin:]-[:greek:]]",
125 "[:nonspacing mark:]",
126 NULL
127 };
128
129 for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
130 ec = U_ZERO_ERROR;
131 UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
132 if (U_FAILURE(ec)) {
133 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
134 continue;
135 }
136 checkPat(OTHER_TOPATTERN_TESTS[j], s);
137 }
138
139 for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
140 if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
141
142 // check various combinations to make sure they all work.
143 if (i != 0 && !toPatternAux(i, i)){
144 continue;
145 }
146 if (!toPatternAux(0, i)){
147 continue;
148 }
149 if (!toPatternAux(i, 0xFFFF)){
150 continue;
151 }
152 }
153 }
154 }
155
156 // Test pattern behavior of multicharacter strings.
157 {
158 ec = U_ZERO_ERROR;
159 UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
160
161 // This loop isn't a loop. It's here to make the compiler happy.
162 // If you're curious, try removing it and changing the 'break'
163 // statements (except for the last) to goto's.
164 for (;;) {
165 if (U_FAILURE(ec)) break;
166 const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
167 expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
168
169 s->add("ac");
170 const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
171 expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
172
173 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
174 if (U_FAILURE(ec)) break;
175 const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
176 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
177
178 s->add("[]");
179 const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
180 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
181
182 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
183 if (U_FAILURE(ec)) break;
184 const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
185 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
186
187 // j2189
188 s->clear();
189 s->add(UnicodeString("abc", ""));
190 s->add(UnicodeString("abc", ""));
191 const char* exp6[] = {"abc", NOT, "ab", NULL};
192 expectToPattern(*s, "[{abc}]", exp6);
193
194 break;
195 }
196
197 if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
198 delete s;
199 }
200
201 // JB#3400: For 2 character ranges prefer [ab] to [a-b]
202 UnicodeSet s;
203 s.add((UChar)97, (UChar)98); // 'a', 'b'
204 expectToPattern(s, "[ab]", NULL);
205 }
206
207 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
208
209 // use Integer.toString because Utility.hex doesn't handle ints
210 UnicodeString pat = "";
211 // TODO do these in hex
212 //String source = "0x" + Integer.toString(start,16).toUpperCase();
213 //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
214 UnicodeString source;
215 source = source + (uint32_t)start;
216 if (start != end)
217 source = source + ".." + (uint32_t)end;
218 UnicodeSet testSet;
219 testSet.add(start, end);
220 return checkPat(source, testSet);
221 }
222
223 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
224 const UnicodeSet& testSet) {
225 // What we want to make sure of is that a pattern generated
226 // by toPattern(), with or without escaped unprintables, can
227 // be passed back into the UnicodeSet constructor.
228 UnicodeString pat0;
229
230 testSet.toPattern(pat0, TRUE);
231
232 if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
233
234 //String pat1 = unescapeLeniently(pat0);
235 //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
236
237 UnicodeString pat2;
238 testSet.toPattern(pat2, FALSE);
239 if (!checkPat(source, testSet, pat2)) return FALSE;
240
241 //String pat3 = unescapeLeniently(pat2);
242 // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
243
244 //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
245 logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
246 return TRUE;
247 }
248
249 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
250 const UnicodeSet& testSet,
251 const UnicodeString& pat) {
252 UErrorCode ec = U_ZERO_ERROR;
253 UnicodeSet testSet2(pat, ec);
254 if (testSet2 != testSet) {
255 errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
256 return FALSE;
257 }
258 return TRUE;
259 }
260
261 void
262 UnicodeSetTest::TestPatterns(void) {
263 UnicodeSet set;
264 expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km");
265 expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz");
266 expectPattern(set, UnicodeString("[a\\-z]", ""), "--aazz");
267 expectPattern(set, UnicodeString("[-az]", ""), "--aazz");
268 expectPattern(set, UnicodeString("[az-]", ""), "--aazz");
269 expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
270
271 // Throw in a test of complement
272 set.complement();
273 UnicodeString exp;
274 exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
275 expectPairs(set, exp);
276 }
277
278 void
279 UnicodeSetTest::TestCategories(void) {
280 UErrorCode status = U_ZERO_ERROR;
281 const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
282 UnicodeSet set(pat, status);
283 if (U_FAILURE(status)) {
284 dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
285 return;
286 } else {
287 expectContainment(set, pat, "ABC", "abc");
288 }
289
290 UChar32 i;
291 int32_t failures = 0;
292 // Make sure generation of L doesn't pollute cached Lu set
293 // First generate L, then Lu
294 set.applyPattern("[:L:]", status);
295 if (U_FAILURE(status)) { errln("FAIL"); return; }
296 for (i=0; i<0x200; ++i) {
297 UBool l = u_isalpha((UChar)i);
298 if (l != set.contains(i)) {
299 errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
300 set.contains(i));
301 if (++failures == 10) break;
302 }
303 }
304
305 set.applyPattern("[:Lu:]", status);
306 if (U_FAILURE(status)) { errln("FAIL"); return; }
307 for (i=0; i<0x200; ++i) {
308 UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
309 if (lu != set.contains(i)) {
310 errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
311 set.contains(i));
312 if (++failures == 20) break;
313 }
314 }
315 }
316 void
317 UnicodeSetTest::TestCloneEqualHash(void) {
318 UErrorCode status = U_ZERO_ERROR;
319 // set1 and set2 used to be built with the obsolete constructor taking
320 // UCharCategory values; replaced with pattern constructors
321 // markus 20030502
322 UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); // :Ll: Letter, lowercase
323 UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); // Letter, lowercase
324 if (U_FAILURE(status)){
325 dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
326 return;
327 }
328 UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status); //Number, Decimal digit
329 UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status); //Number, Decimal digit
330 if (U_FAILURE(status)){
331 errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
332 return;
333 }
334
335 if (*set1 != *set1a) {
336 errln("FAIL: category constructor for Ll broken");
337 }
338 if (*set2 != *set2a) {
339 errln("FAIL: category constructor for Nd broken");
340 }
341 delete set1a;
342 delete set2a;
343
344 logln("Testing copy construction");
345 UnicodeSet *set1copy=new UnicodeSet(*set1);
346 if(*set1 != *set1copy || *set1 == *set2 ||
347 getPairs(*set1) != getPairs(*set1copy) ||
348 set1->hashCode() != set1copy->hashCode()){
349 errln("FAIL : Error in copy construction");
350 return;
351 }
352
353 logln("Testing =operator");
354 UnicodeSet set1equal=*set1;
355 UnicodeSet set2equal=*set2;
356 if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
357 set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
358 errln("FAIL: Error in =operator");
359 }
360
361 logln("Testing clone()");
362 UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
363 UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
364 if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
365 *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
366 *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
367 errln("FAIL: Error in clone");
368 }
369
370 logln("Testing hashcode");
371 if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
372 set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
373 set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
374 set1->hashCode() == set2->hashCode() || set1copy->hashCode() == set2->hashCode() ||
375 set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
376 errln("FAIL: Error in hashCode()");
377 }
378
379 delete set1;
380 delete set1copy;
381 delete set2;
382 delete set1clone;
383 delete set2clone;
384
385
386 }
387 void
388 UnicodeSetTest::TestAddRemove(void) {
389 UnicodeSet set; // Construct empty set
390 doAssert(set.isEmpty() == TRUE, "set should be empty");
391 doAssert(set.size() == 0, "size should be 0");
392 set.complement();
393 doAssert(set.size() == 0x110000, "size should be 0x110000");
394 set.clear();
395 set.add(0x0061, 0x007a);
396 expectPairs(set, "az");
397 doAssert(set.isEmpty() == FALSE, "set should not be empty");
398 doAssert(set.size() != 0, "size should not be equal to 0");
399 doAssert(set.size() == 26, "size should be equal to 26");
400 set.remove(0x006d, 0x0070);
401 expectPairs(set, "alqz");
402 doAssert(set.size() == 22, "size should be equal to 22");
403 set.remove(0x0065, 0x0067);
404 expectPairs(set, "adhlqz");
405 doAssert(set.size() == 19, "size should be equal to 19");
406 set.remove(0x0064, 0x0069);
407 expectPairs(set, "acjlqz");
408 doAssert(set.size() == 16, "size should be equal to 16");
409 set.remove(0x0063, 0x0072);
410 expectPairs(set, "absz");
411 doAssert(set.size() == 10, "size should be equal to 10");
412 set.add(0x0066, 0x0071);
413 expectPairs(set, "abfqsz");
414 doAssert(set.size() == 22, "size should be equal to 22");
415 set.remove(0x0061, 0x0067);
416 expectPairs(set, "hqsz");
417 set.remove(0x0061, 0x007a);
418 expectPairs(set, "");
419 doAssert(set.isEmpty() == TRUE, "set should be empty");
420 doAssert(set.size() == 0, "size should be 0");
421 set.add(0x0061);
422 doAssert(set.isEmpty() == FALSE, "set should not be empty");
423 doAssert(set.size() == 1, "size should not be equal to 1");
424 set.add(0x0062);
425 set.add(0x0063);
426 expectPairs(set, "ac");
427 doAssert(set.size() == 3, "size should not be equal to 3");
428 set.add(0x0070);
429 set.add(0x0071);
430 expectPairs(set, "acpq");
431 doAssert(set.size() == 5, "size should not be equal to 5");
432 set.clear();
433 expectPairs(set, "");
434 doAssert(set.isEmpty() == TRUE, "set should be empty");
435 doAssert(set.size() == 0, "size should be 0");
436
437 // Try removing an entire set from another set
438 expectPattern(set, "[c-x]", "cx");
439 UnicodeSet set2;
440 expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
441 set.removeAll(set2);
442 expectPairs(set, "deluxx");
443
444 // Try adding an entire set to another set
445 expectPattern(set, "[jackiemclean]", "aacceein");
446 expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
447 set.addAll(set2);
448 expectPairs(set, "aacehort");
449 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
450
451 // Try retaining an set of elements contained in another set (intersection)
452 UnicodeSet set3;
453 expectPattern(set3, "[a-c]", "ac");
454 doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
455 set3.remove(0x0062);
456 expectPairs(set3, "aacc");
457 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
458 set.retainAll(set3);
459 expectPairs(set, "aacc");
460 doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
461 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
462 set.clear();
463 doAssert(set.size() != set3.size(), "set.size() != set3.size()");
464
465 // Test commutativity
466 expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
467 expectPattern(set2, "[jackiemclean]", "aacceein");
468 set.addAll(set2);
469 expectPairs(set, "aacehort");
470 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
471
472
473
474
475 }
476
477 /**
478 * Make sure minimal representation is maintained.
479 */
480 void UnicodeSetTest::TestMinimalRep() {
481 UErrorCode status = U_ZERO_ERROR;
482 // This is pretty thoroughly tested by checkCanonicalRep()
483 // run against the exhaustive operation results. Use the code
484 // here for debugging specific spot problems.
485
486 // 1 overlap against 2
487 UnicodeSet set("[h-km-q]", status);
488 if (U_FAILURE(status)) { errln("FAIL"); return; }
489 UnicodeSet set2("[i-o]", status);
490 if (U_FAILURE(status)) { errln("FAIL"); return; }
491 set.addAll(set2);
492 expectPairs(set, "hq");
493 // right
494 set.applyPattern("[a-m]", status);
495 if (U_FAILURE(status)) { errln("FAIL"); return; }
496 set2.applyPattern("[e-o]", status);
497 if (U_FAILURE(status)) { errln("FAIL"); return; }
498 set.addAll(set2);
499 expectPairs(set, "ao");
500 // left
501 set.applyPattern("[e-o]", status);
502 if (U_FAILURE(status)) { errln("FAIL"); return; }
503 set2.applyPattern("[a-m]", status);
504 if (U_FAILURE(status)) { errln("FAIL"); return; }
505 set.addAll(set2);
506 expectPairs(set, "ao");
507 // 1 overlap against 3
508 set.applyPattern("[a-eg-mo-w]", status);
509 if (U_FAILURE(status)) { errln("FAIL"); return; }
510 set2.applyPattern("[d-q]", status);
511 if (U_FAILURE(status)) { errln("FAIL"); return; }
512 set.addAll(set2);
513 expectPairs(set, "aw");
514 }
515
516 void UnicodeSetTest::TestAPI() {
517 UErrorCode status = U_ZERO_ERROR;
518 // default ct
519 UnicodeSet set;
520 if (!set.isEmpty() || set.getRangeCount() != 0) {
521 errln((UnicodeString)"FAIL, set should be empty but isn't: " +
522 set);
523 }
524
525 // clear(), isEmpty()
526 set.add(0x0061);
527 if (set.isEmpty()) {
528 errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
529 set);
530 }
531 set.clear();
532 if (!set.isEmpty()) {
533 errln((UnicodeString)"FAIL, set should be empty but isn't: " +
534 set);
535 }
536
537 // size()
538 set.clear();
539 if (set.size() != 0) {
540 errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
541 ": " + set);
542 }
543 set.add(0x0061);
544 if (set.size() != 1) {
545 errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
546 ": " + set);
547 }
548 set.add(0x0031, 0x0039);
549 if (set.size() != 10) {
550 errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
551 ": " + set);
552 }
553
554 // contains(first, last)
555 set.clear();
556 set.applyPattern("[A-Y 1-8 b-d l-y]", status);
557 if (U_FAILURE(status)) { errln("FAIL"); return; }
558 for (int32_t i = 0; i<set.getRangeCount(); ++i) {
559 UChar32 a = set.getRangeStart(i);
560 UChar32 b = set.getRangeEnd(i);
561 if (!set.contains(a, b)) {
562 errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
563 " but doesn't: " + set);
564 }
565 if (set.contains((UChar32)(a-1), b)) {
566 errln((UnicodeString)"FAIL, shouldn't contain " +
567 (unsigned short)(a-1) + '-' + (unsigned short)b +
568 " but does: " + set);
569 }
570 if (set.contains(a, (UChar32)(b+1))) {
571 errln((UnicodeString)"FAIL, shouldn't contain " +
572 (unsigned short)a + '-' + (unsigned short)(b+1) +
573 " but does: " + set);
574 }
575 }
576
577 // Ported InversionList test.
578 UnicodeSet a((UChar32)3,(UChar32)10);
579 UnicodeSet b((UChar32)7,(UChar32)15);
580 UnicodeSet c;
581
582 logln((UnicodeString)"a [3-10]: " + a);
583 logln((UnicodeString)"b [7-15]: " + b);
584 c = a;
585 c.addAll(b);
586 UnicodeSet exp((UChar32)3,(UChar32)15);
587 if (c == exp) {
588 logln((UnicodeString)"c.set(a).add(b): " + c);
589 } else {
590 errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
591 }
592 c.complement();
593 exp.set((UChar32)0, (UChar32)2);
594 exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
595 if (c == exp) {
596 logln((UnicodeString)"c.complement(): " + c);
597 } else {
598 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
599 }
600 c.complement();
601 exp.set((UChar32)3, (UChar32)15);
602 if (c == exp) {
603 logln((UnicodeString)"c.complement(): " + c);
604 } else {
605 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
606 }
607 c = a;
608 c.complementAll(b);
609 exp.set((UChar32)3,(UChar32)6);
610 exp.add((UChar32)11,(UChar32) 15);
611 if (c == exp) {
612 logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
613 } else {
614 errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
615 }
616
617 exp = c;
618 bitsToSet(setToBits(c), c);
619 if (c == exp) {
620 logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
621 } else {
622 errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
623 }
624
625 // Additional tests for coverage JB#2118
626 //UnicodeSet::complement(class UnicodeString const &)
627 //UnicodeSet::complementAll(class UnicodeString const &)
628 //UnicodeSet::containsNone(class UnicodeSet const &)
629 //UnicodeSet::containsNone(long,long)
630 //UnicodeSet::containsSome(class UnicodeSet const &)
631 //UnicodeSet::containsSome(long,long)
632 //UnicodeSet::removeAll(class UnicodeString const &)
633 //UnicodeSet::retain(long)
634 //UnicodeSet::retainAll(class UnicodeString const &)
635 //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
636 //UnicodeSetIterator::getString(void)
637 set.clear();
638 set.complement("ab");
639 exp.applyPattern("[{ab}]", status);
640 if (U_FAILURE(status)) { errln("FAIL"); return; }
641 if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
642
643 UnicodeSetIterator iset(set);
644 if (!iset.next() || !iset.isString()) {
645 errln("FAIL: UnicodeSetIterator::next/isString");
646 } else if (iset.getString() != "ab") {
647 errln("FAIL: UnicodeSetIterator::getString");
648 }
649
650 set.add((UChar32)0x61, (UChar32)0x7A);
651 set.complementAll("alan");
652 exp.applyPattern("[{ab}b-kmo-z]", status);
653 if (U_FAILURE(status)) { errln("FAIL"); return; }
654 if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
655
656 exp.applyPattern("[a-z]", status);
657 if (U_FAILURE(status)) { errln("FAIL"); return; }
658 if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
659 if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
660 exp.applyPattern("[aln]", status);
661 if (U_FAILURE(status)) { errln("FAIL"); return; }
662 if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
663 if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
664
665 if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
666 errln("FAIL: containsNone(UChar32, UChar32)");
667 }
668 if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
669 errln("FAIL: containsSome(UChar32, UChar32)");
670 }
671 if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
672 errln("FAIL: containsNone(UChar32, UChar32)");
673 }
674 if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
675 errln("FAIL: containsSome(UChar32, UChar32)");
676 }
677
678 set.removeAll("liu");
679 exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
680 if (U_FAILURE(status)) { errln("FAIL"); return; }
681 if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
682
683 set.retainAll("star");
684 exp.applyPattern("[rst]", status);
685 if (U_FAILURE(status)) { errln("FAIL"); return; }
686 if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
687
688 set.retain((UChar32)0x73);
689 exp.applyPattern("[s]", status);
690 if (U_FAILURE(status)) { errln("FAIL"); return; }
691 if (set != exp) { errln("FAIL: retain('s')"); return; }
692
693 uint16_t buf[32];
694 int32_t slen = set.serialize(buf, UPRV_LENGTHOF(buf), status);
695 if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
696 if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
697 errln("FAIL: serialize");
698 return;
699 }
700
701 // Conversions to and from USet
702 UnicodeSet *uniset = &set;
703 USet *uset = uniset->toUSet();
704 TEST_ASSERT((void *)uset == (void *)uniset);
705 UnicodeSet *setx = UnicodeSet::fromUSet(uset);
706 TEST_ASSERT((void *)setx == (void *)uset);
707 const UnicodeSet *constSet = uniset;
708 const USet *constUSet = constSet->toUSet();
709 TEST_ASSERT((void *)constUSet == (void *)constSet);
710 const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
711 TEST_ASSERT((void *)constSetx == (void *)constUSet);
712
713 // span(UnicodeString) and spanBack(UnicodeString) convenience methods
714 UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
715 UnicodeSet ac(0x61, 0x63);
716 ac.remove(0x62).freeze();
717 if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
718 ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
719 ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
720 ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
721 ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
722 ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
723 ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
724 ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
725 ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
726 ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
727 ) {
728 errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
729 }
730 if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
731 ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
732 ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
733 ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
734 ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
735 ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
736 ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
737 ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
738 ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
739 ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
740 ) {
741 errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
742 }
743 }
744
745 void UnicodeSetTest::TestIteration() {
746 UErrorCode ec = U_ZERO_ERROR;
747 int i = 0;
748 int outerLoop;
749
750 // 6 code points, 3 ranges, 2 strings, 8 total elements
751 // Iteration will access them in sorted order - a, b, c, y, z, U0001abcd, "str1", "str2"
752 UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
753 TEST_ASSERT_SUCCESS(ec);
754 UnicodeSetIterator it(set);
755
756 for (outerLoop=0; outerLoop<3; outerLoop++) {
757 // Run the test multiple times, to check that iterator.reset() is working.
758 for (i=0; i<10; i++) {
759 UBool nextv = it.next();
760 UBool isString = it.isString();
761 int32_t codePoint = it.getCodepoint();
762 //int32_t codePointEnd = it.getCodepointEnd();
763 UnicodeString s = it.getString();
764 switch (i) {
765 case 0:
766 TEST_ASSERT(nextv == TRUE);
767 TEST_ASSERT(isString == FALSE);
768 TEST_ASSERT(codePoint==0x61);
769 TEST_ASSERT(s == "a");
770 break;
771 case 1:
772 TEST_ASSERT(nextv == TRUE);
773 TEST_ASSERT(isString == FALSE);
774 TEST_ASSERT(codePoint==0x62);
775 TEST_ASSERT(s == "b");
776 break;
777 case 2:
778 TEST_ASSERT(nextv == TRUE);
779 TEST_ASSERT(isString == FALSE);
780 TEST_ASSERT(codePoint==0x63);
781 TEST_ASSERT(s == "c");
782 break;
783 case 3:
784 TEST_ASSERT(nextv == TRUE);
785 TEST_ASSERT(isString == FALSE);
786 TEST_ASSERT(codePoint==0x79);
787 TEST_ASSERT(s == "y");
788 break;
789 case 4:
790 TEST_ASSERT(nextv == TRUE);
791 TEST_ASSERT(isString == FALSE);
792 TEST_ASSERT(codePoint==0x7a);
793 TEST_ASSERT(s == "z");
794 break;
795 case 5:
796 TEST_ASSERT(nextv == TRUE);
797 TEST_ASSERT(isString == FALSE);
798 TEST_ASSERT(codePoint==0x1abcd);
799 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
800 break;
801 case 6:
802 TEST_ASSERT(nextv == TRUE);
803 TEST_ASSERT(isString == TRUE);
804 TEST_ASSERT(s == "str1");
805 break;
806 case 7:
807 TEST_ASSERT(nextv == TRUE);
808 TEST_ASSERT(isString == TRUE);
809 TEST_ASSERT(s == "str2");
810 break;
811 case 8:
812 TEST_ASSERT(nextv == FALSE);
813 break;
814 case 9:
815 TEST_ASSERT(nextv == FALSE);
816 break;
817 }
818 }
819 it.reset(); // prepare to run the iteration again.
820 }
821 }
822
823
824
825
826 void UnicodeSetTest::TestStrings() {
827 UErrorCode ec = U_ZERO_ERROR;
828
829 UnicodeSet* testList[] = {
830 UnicodeSet::createFromAll("abc"),
831 new UnicodeSet("[a-c]", ec),
832
833 &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
834 new UnicodeSet("[{ll}{ch}a-z]", ec),
835
836 UnicodeSet::createFrom("ab}c"),
837 new UnicodeSet("[{ab\\}c}]", ec),
838
839 &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
840 new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
841
842 NULL
843 };
844
845 if (U_FAILURE(ec)) {
846 errln("FAIL: couldn't construct test sets");
847 }
848
849 for (int32_t i = 0; testList[i] != NULL; i+=2) {
850 if (U_SUCCESS(ec)) {
851 UnicodeString pat0, pat1;
852 testList[i]->toPattern(pat0, TRUE);
853 testList[i+1]->toPattern(pat1, TRUE);
854 if (*testList[i] == *testList[i+1]) {
855 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
856 } else {
857 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
858 }
859 }
860 delete testList[i];
861 delete testList[i+1];
862 }
863 }
864
865 /**
866 * Test the [:Latin:] syntax.
867 */
868 void UnicodeSetTest::TestScriptSet() {
869 expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
870
871 expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
872
873 /* Jitterbug 1423 */
874 expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
875
876 }
877
878 /**
879 * Test property-based set patterns; DATA lists triples of pattern, chars IN the set, chars NOT in it.
880 */
881 void UnicodeSetTest::TestPropertySet() {
882 static const char* const DATA[] = {
883 // Pattern, Chars IN, Chars NOT in
884
885 "[:Latin:]",
886 "aA",
887 "\\u0391\\u03B1",
888
889 "[\\p{Greek}]",
890 "\\u0391\\u03B1",
891 "aA",
892
893 "\\P{ GENERAL Category = upper case letter }",
894 "abc",
895 "ABC",
896
897 #if !UCONFIG_NO_NORMALIZATION
898 // Combining class: @since ICU 2.2
899 // Check both symbolic and numeric
900 "\\p{ccc=Nukta}",
901 "\\u0ABC",
902 "abc",
903
904 "\\p{Canonical Combining Class = 11}",
905 "\\u05B1",
906 "\\u05B2",
907
908 "[:c c c = iota subscript :]",
909 "\\u0345",
910 "xyz",
911 #endif
912
913 // Bidi class: @since ICU 2.2
914 "\\p{bidiclass=lefttoright}",
915 "abc",
916 "\\u0671\\u0672",
917
918 // Binary properties: @since ICU 2.2
919 "\\p{ideographic}",
920 "\\u4E0A",
921 "x",
922
923 "[:math=false:]",
924 "q)*(",
925 // weiv: )(and * were removed from math in Unicode 4.0.1
926 //"(*+)",
927 "+<>^",
928
929 // JB#1767 \N{}, \p{ASCII}
930 "[:Ascii:]",
931 "abc\\u0000\\u007F",
932 "\\u0080\\u4E00",
933
934 "[\\N{ latin small letter a }[:name= latin small letter z:]]",
935 "az",
936 "qrs",
937
938 // JB#2015
939 "[:any:]",
940 "a\\U0010FFFF",
941 "",
942
943 "[:nv=0.5:]",
944 "\\u00BD\\u0F2A",
945 "\\u00BC",
946
947 // JB#2653: Age
948 "[:Age=1.1:]",
949 "\\u03D6", // 1.1
950 "\\u03D8\\u03D9", // 3.2
951
952 "[:Age=3.1:]",
953 "\\u1800\\u3400\\U0002f800",
954 "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
955
956 // JB#2350: Case_Sensitive
957 "[:Case Sensitive:]",
958 "A\\u1FFC\\U00010410",
959 ";\\u00B4\\U00010500",
960
961 // JB#2832: C99-compatibility props
962 "[:blank:]",
963 " \\u0009",
964 "1-9A-Z",
965
966 "[:graph:]",
967 "19AZ",
968 " \\u0003\\u0007\\u0009\\u000A\\u000D",
969
970 "[:punct:]",
971 "!@#%&*()[]{}-_\\/;:,.?'\"",
972 "09azAZ",
973
974 "[:xdigit:]",
975 "09afAF",
976 "gG!",
977
978 // Regex compatibility test
979 "[-b]", // leading '-' is literal
980 "-b",
981 "ac",
982
983 "[^-b]", // leading '-' is literal
984 "ac",
985 "-b",
986
987 "[b-]", // trailing '-' is literal
988 "-b",
989 "ac",
990
991 "[^b-]", // trailing '-' is literal
992 "ac",
993 "-b",
994
995 "[a-b-]", // trailing '-' is literal
996 "ab-",
997 "c=",
998
999 "[[a-q]&[p-z]-]", // trailing '-' is literal
1000 "pq-",
1001 "or=",
1002
1003 "[\\s|\\)|:|$|\\>]", // from regex tests
1004 "s|):$>",
1005 "abc",
1006
1007 "[\\uDC00cd]", // JB#2906: isolated trail at start
1008 "cd\\uDC00",
1009 "ab\\uD800\\U00010000",
1010
1011 "[ab\\uD800]", // JB#2906: isolated trail at start
1012 "ab\\uD800",
1013 "cd\\uDC00\\U00010000",
1014
1015 "[ab\\uD800cd]", // JB#2906: isolated lead in middle
1016 "abcd\\uD800",
1017 "ef\\uDC00\\U00010000",
1018
1019 "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
1020 "abcd\\uDC00",
1021 "ef\\uD800\\U00010000",
1022
1023 #if !UCONFIG_NO_NORMALIZATION
1024 "[:^lccc=0:]", // Lead canonical class
1025 "\\u0300\\u0301",
1026 "abcd\\u00c0\\u00c5",
1027
1028 "[:^tccc=0:]", // Trail canonical class
1029 "\\u0300\\u0301\\u00c0\\u00c5",
1030 "abcd",
1031
1032 "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
1033 "\\u0300\\u0301\\u00c0\\u00c5",
1034 "abcd",
1035
1036 "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
1037 "",
1038 "abcd\\u0300\\u0301\\u00c0\\u00c5",
1039
1040 "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
1041 "\\u0F73\\u0F75\\u0F81",
1042 "abcd\\u0300\\u0301\\u00c0\\u00c5",
1043 #endif /* !UCONFIG_NO_NORMALIZATION */
1044
1045 "[:Assigned:]",
1046 "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
1047 "\\u0888\\uFDD3\\uFFFE\\U00050005",
1048
1049 // Script_Extensions, new in Unicode 6.0
1050 "[:scx=Arab:]",
1051 "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
1052 "\\u061D\\uFDEF\\uFDFE",
1053
1054 // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
1055 // so scx-sc is missing U+FDF2.
1056 "[[:Script_Extensions=Arabic:]-[:Arab:]]",
1057 "\\u0640\\u064B\\u0650\\u0655",
1058 "\\uFDF2"
1059 };
1060
1061 static const int32_t DATA_LEN = UPRV_LENGTHOF(DATA);
1062
1063 for (int32_t i=0; i<DATA_LEN; i+=3) {
1064 expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
1065 CharsToUnicodeString(DATA[i+2]));
1066 }
1067 }
1068
1069 /**
1070 * Test that Posix style character classes [:digit:], etc.
1071 * have the Unicode definitions from TR 18.
1072 */
1073 void UnicodeSetTest::TestPosixClasses() {
1074 {
1075 UErrorCode status = U_ZERO_ERROR;
1076 UnicodeSet s1("[:alpha:]", status);
1077 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
1078 TEST_ASSERT_SUCCESS(status);
1079 TEST_ASSERT(s1==s2);
1080 }
1081 {
1082 UErrorCode status = U_ZERO_ERROR;
1083 UnicodeSet s1("[:lower:]", status);
1084 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
1085 TEST_ASSERT_SUCCESS(status);
1086 TEST_ASSERT(s1==s2);
1087 }
1088 {
1089 UErrorCode status = U_ZERO_ERROR;
1090 UnicodeSet s1("[:upper:]", status);
1091 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
1092 TEST_ASSERT_SUCCESS(status);
1093 TEST_ASSERT(s1==s2);
1094 }
1095 {
1096 UErrorCode status = U_ZERO_ERROR;
1097 UnicodeSet s1("[:punct:]", status);
1098 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
1099 TEST_ASSERT_SUCCESS(status);
1100 TEST_ASSERT(s1==s2);
1101 }
1102 {
1103 UErrorCode status = U_ZERO_ERROR;
1104 UnicodeSet s1("[:digit:]", status);
1105 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
1106 TEST_ASSERT_SUCCESS(status);
1107 TEST_ASSERT(s1==s2);
1108 }
1109 {
1110 UErrorCode status = U_ZERO_ERROR;
1111 UnicodeSet s1("[:xdigit:]", status);
1112 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
1113 TEST_ASSERT_SUCCESS(status);
1114 TEST_ASSERT(s1==s2);
1115 }
1116 {
1117 UErrorCode status = U_ZERO_ERROR;
1118 UnicodeSet s1("[:alnum:]", status);
1119 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
1120 TEST_ASSERT_SUCCESS(status);
1121 TEST_ASSERT(s1==s2);
1122 }
1123 {
1124 UErrorCode status = U_ZERO_ERROR;
1125 UnicodeSet s1("[:space:]", status);
1126 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
1127 TEST_ASSERT_SUCCESS(status);
1128 TEST_ASSERT(s1==s2);
1129 }
1130 {
1131 UErrorCode status = U_ZERO_ERROR;
1132 UnicodeSet s1("[:blank:]", status);
1133 TEST_ASSERT_SUCCESS(status);
1134 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
1135 status);
1136 TEST_ASSERT_SUCCESS(status);
1137 TEST_ASSERT(s1==s2);
1138 }
1139 {
1140 UErrorCode status = U_ZERO_ERROR;
1141 UnicodeSet s1("[:cntrl:]", status);
1142 TEST_ASSERT_SUCCESS(status);
1143 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
1144 TEST_ASSERT_SUCCESS(status);
1145 TEST_ASSERT(s1==s2);
1146 }
1147 {
1148 UErrorCode status = U_ZERO_ERROR;
1149 UnicodeSet s1("[:graph:]", status);
1150 TEST_ASSERT_SUCCESS(status);
1151 UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
1152 TEST_ASSERT_SUCCESS(status);
1153 TEST_ASSERT(s1==s2);
1154 }
1155 {
1156 UErrorCode status = U_ZERO_ERROR;
1157 UnicodeSet s1("[:print:]", status);
1158 TEST_ASSERT_SUCCESS(status);
1159 UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
1160 TEST_ASSERT_SUCCESS(status);
1161 TEST_ASSERT(s1==s2);
1162 }
1163 }
1164 /**
1165 * Test cloning of UnicodeSet. For C++, we test the copy constructor.
1166 */
1167 void UnicodeSetTest::TestClone() {
1168 UErrorCode ec = U_ZERO_ERROR;
1169 UnicodeSet s("[abcxyz]", ec);
1170 UnicodeSet t(s);
1171 expectContainment(t, "abc", "def");
1172 }
1173
1174 /**
1175 * Test the indexOf() and charAt() methods.
1176 */
1177 void UnicodeSetTest::TestIndexOf() {
1178 UErrorCode ec = U_ZERO_ERROR;
1179 UnicodeSet set("[a-cx-y3578]", ec);
1180 if (U_FAILURE(ec)) {
1181 errln("FAIL: UnicodeSet constructor");
1182 return;
1183 }
1184 for (int32_t i=0; i<set.size(); ++i) {
1185 UChar32 c = set.charAt(i);
1186 if (set.indexOf(c) != i) {
1187 errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1188 i, c, set.indexOf(c));
1189 }
1190 }
1191 UChar32 c = set.charAt(set.size());
1192 if (c != -1) {
1193 errln("FAIL: charAt(<out of range>) = %X", c);
1194 }
1195 int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
1196 if (j != -1) {
1197 errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1198 }
1199 }
1200
1201 /**
1202 * Test closure API.
1203 */
1204 void UnicodeSetTest::TestCloseOver() {
1205 UErrorCode ec = U_ZERO_ERROR;
1206
1207 char CASE[] = {(char)USET_CASE_INSENSITIVE};
1208 char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1209 const char* DATA[] = {
1210 // selector, input, output
1211 CASE,
1212 "[aq\\u00DF{Bc}{bC}{Fi}]",
1213 "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
1214
1215 CASE,
1216 "[\\u01F1]", // 'DZ'
1217 "[\\u01F1\\u01F2\\u01F3]",
1218
1219 CASE,
1220 "[\\u1FB4]",
1221 "[\\u1FB4{\\u03AC\\u03B9}]",
1222
1223 CASE,
1224 "[{F\\uFB01}]",
1225 "[\\uFB03{ffi}]",
1226
1227 CASE, // make sure binary search finds limits
1228 "[a\\uFF3A]",
1229 "[aA\\uFF3A\\uFF5A]",
1230
1231 CASE,
1232 "[a-z]","[A-Za-z\\u017F\\u212A]",
1233 CASE,
1234 "[abc]","[A-Ca-c]",
1235 CASE,
1236 "[ABC]","[A-Ca-c]",
1237
1238 CASE, "[i]", "[iI]",
1239
1240 CASE, "[\\u0130]", "[\\u0130{i\\u0307}]", // dotted I
1241 CASE, "[{i\\u0307}]", "[\\u0130{i\\u0307}]", // i with dot
1242
1243 CASE, "[\\u0131]", "[\\u0131]", // dotless i
1244
1245 CASE, "[\\u0390]", "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1246
1247 CASE, "[\\u03c2]", "[\\u03a3\\u03c2\\u03c3]", // sigmas
1248
1249 CASE, "[\\u03f2]", "[\\u03f2\\u03f9]", // lunate sigmas
1250
1251 CASE, "[\\u03f7]", "[\\u03f7\\u03f8]",
1252
1253 CASE, "[\\u1fe3]", "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1254
1255 CASE, "[\\ufb05]", "[\\ufb05\\ufb06{st}]",
1256 CASE, "[{st}]", "[\\ufb05\\ufb06{st}]",
1257
1258 CASE, "[\\U0001044F]", "[\\U00010427\\U0001044F]",
1259
1260 CASE, "[{a\\u02BE}]", "[\\u1E9A{a\\u02BE}]", // first in sorted table
1261
1262 CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1263
1264 #if !UCONFIG_NO_FILE_IO
1265 CASE_MAPPINGS,
1266 "[aq\\u00DF{Bc}{bC}{Fi}]",
1267 "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1268 #endif
1269
1270 CASE_MAPPINGS,
1271 "[\\u01F1]", // 'DZ'
1272 "[\\u01F1\\u01F2\\u01F3]",
1273
1274 CASE_MAPPINGS,
1275 "[a-z]",
1276 "[A-Za-z]",
1277
1278 NULL
1279 };
1280
1281 UnicodeSet s;
1282 UnicodeSet t;
1283 UnicodeString buf;
1284 for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1285 int32_t selector = DATA[i][0];
1286 UnicodeString pat(DATA[i+1], -1, US_INV);
1287 UnicodeString exp(DATA[i+2], -1, US_INV);
1288 s.applyPattern(pat, ec);
1289 s.closeOver(selector);
1290 t.applyPattern(exp, ec);
1291 if (U_FAILURE(ec)) {
1292 errln("FAIL: applyPattern failed");
1293 continue;
1294 }
1295 if (s == t) {
1296 logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1297 } else {
1298 dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1299 s.toPattern(buf, TRUE) + ", expected " + exp);
1300 }
1301 }
1302
1303 #if 0
1304 /*
1305 * Unused test code.
1306 * This was used to compare the old implementation (using USET_CASE)
1307 * with the new one (using 0x100 temporarily)
1308 * while transitioning from hardcoded case closure tables in uniset.cpp
1309 * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1310 * and using ucase.c functions for closure.
1311 * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1312 *
1313 * Note: The old and new implementation never fully matched because
1314 * the old implementation turned out to not map U+0130 and U+0131 correctly
1315 * (dotted I and dotless i) and because the old implementation's data tables
1316 * were outdated compared to Unicode 4.0.1 at the time of the change to the
1317 * new implementation. (So sigmas and some other characters were not handled
1318 * according to the newer Unicode version.)
1319 */
1320 UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1321 UnicodeSetIterator si(sens);
1322 UnicodeString str, buf2;
1323 const UnicodeString *pStr;
1324 UChar32 c;
1325 while(si.next()) {
1326 if(!si.isString()) {
1327 c=si.getCodepoint();
1328 s.clear();
1329 s.add(c);
1330
1331 str.setTo(c);
1332 str.foldCase();
1333 sens2.add(str);
1334
1335 t=s;
1336 s.closeOver(USET_CASE);
1337 t.closeOver(0x100);
1338 if(s!=t) {
1339 errln("FAIL: closeOver(U+%04x) differs: ", c);
1340 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1341 }
1342 }
1343 }
1344 // remove all code points
1345 // should contain all full case folding mapping strings
1346 sens2.remove(0, 0x10ffff);
1347 si.reset(sens2);
1348 while(si.next()) {
1349 if(si.isString()) {
1350 pStr=&si.getString();
1351 s.clear();
1352 s.add(*pStr);
1353 t=s2=s;
1354 s.closeOver(USET_CASE);
1355 t.closeOver(0x100);
1356 if(s!=t) {
1357 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1358 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1359 }
1360 }
1361 }
1362 #endif
1363
1364 // Test the pattern API
1365 s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1366 if (U_FAILURE(ec)) {
1367 errln("FAIL: applyPattern failed");
1368 } else {
1369 expectContainment(s, "abcABC", "defDEF");
1370 }
1371 UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1372 if (U_FAILURE(ec)) {
1373 errln("FAIL: constructor failed");
1374 } else {
1375 expectContainment(v, "defDEF", "abcABC");
1376 }
1377 UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1378 if (U_FAILURE(ec)) {
1379 errln("FAIL: construct w/case mappings failed");
1380 } else {
1381 expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1382 }
1383 }
1384
1385 void UnicodeSetTest::TestEscapePattern() {
1386 const char pattern[] =
1387 "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1388 const char exp[] =
1389 "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1390 // We test this with two passes; in the second pass we
1391 // pre-unescape the pattern. Since U+200E is Pattern_White_Space,
1392 // this fails -- which is what we expect.
1393 for (int32_t pass=1; pass<=2; ++pass) {
1394 UErrorCode ec = U_ZERO_ERROR;
1395 UnicodeString pat(pattern, -1, US_INV);
1396 if (pass==2) {
1397 pat = pat.unescape();
1398 }
1399 // Pattern is only good for pass 1
1400 UBool isPatternValid = (pass==1);
1401
1402 UnicodeSet set(pat, ec);
1403 if (U_SUCCESS(ec) != isPatternValid){
1404 errln((UnicodeString)"FAIL: applyPattern(" +
1405 escape(pat) + ") => " +
1406 u_errorName(ec));
1407 continue;
1408 }
1409 if (U_FAILURE(ec)) {
1410 continue;
1411 }
1412 if (set.contains((UChar)0x0644)){
1413 errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1414 }
1415
1416 UnicodeString newpat;
1417 set.toPattern(newpat, TRUE);
1418 if (newpat == UnicodeString(exp, -1, US_INV)) {
1419 logln(escape(pat) + " => " + newpat);
1420 } else {
1421 errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1422 }
1423
1424 for (int32_t i=0; i<set.getRangeCount(); ++i) {
1425 UnicodeString str("Range ");
1426 str.append((UChar)(0x30 + i))
1427 .append(": ")
1428 .append((UChar32)set.getRangeStart(i))
1429 .append(" - ")
1430 .append((UChar32)set.getRangeEnd(i));
1431 str = str + " (" + set.getRangeStart(i) + " - " +
1432 set.getRangeEnd(i) + ")";
1433 if (set.getRangeStart(i) < 0) {
1434 errln((UnicodeString)"FAIL: " + escape(str));
1435 } else {
1436 logln(escape(str));
1437 }
1438 }
1439 }
1440 }
1441
1442 void UnicodeSetTest::expectRange(const UnicodeString& label,
1443 const UnicodeSet& set,
1444 UChar32 start, UChar32 end) {
1445 UnicodeSet exp(start, end);
1446 UnicodeString pat;
1447 if (set == exp) {
1448 logln(label + " => " + set.toPattern(pat, TRUE));
1449 } else {
1450 UnicodeString xpat;
1451 errln((UnicodeString)"FAIL: " + label + " => " +
1452 set.toPattern(pat, TRUE) +
1453 ", expected " + exp.toPattern(xpat, TRUE));
1454 }
1455 }
1456
1457 void UnicodeSetTest::TestInvalidCodePoint() {
1458
1459 const UChar32 DATA[] = {
1460 // Test range Expected range
1461 0, 0x10FFFF, 0, 0x10FFFF,
1462 (UChar32)-1, 8, 0, 8,
1463 8, 0x110000, 8, 0x10FFFF
1464 };
1465 const int32_t DATA_LENGTH = UPRV_LENGTHOF(DATA);
1466
1467 UnicodeString pat;
1468 int32_t i;
1469
1470 for (i=0; i<DATA_LENGTH; i+=4) {
1471 UChar32 start = DATA[i];
1472 UChar32 end = DATA[i+1];
1473 UChar32 xstart = DATA[i+2];
1474 UChar32 xend = DATA[i+3];
1475
1476 // Try various API using the test code points
1477
1478 UnicodeSet set(start, end);
1479 expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1480 set, xstart, xend);
1481
1482 set.clear();
1483 set.set(start, end);
1484 expectRange((UnicodeString)"set(" + start + "," + end + ")",
1485 set, xstart, xend);
1486
1487 UBool b = set.contains(start);
1488 b = set.contains(start, end);
1489 b = set.containsNone(start, end);
1490 b = set.containsSome(start, end);
1491 (void)b; // Suppress set but not used warning.
1492
1493 /*int32_t index = set.indexOf(start);*/
1494
1495 set.clear();
1496 set.add(start);
1497 set.add(start, end);
1498 expectRange((UnicodeString)"add(" + start + "," + end + ")",
1499 set, xstart, xend);
1500
1501 set.set(0, 0x10FFFF);
1502 set.retain(start, end);
1503 expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1504 set, xstart, xend);
1505 set.retain(start);
1506
1507 set.set(0, 0x10FFFF);
1508 set.remove(start);
1509 set.remove(start, end);
1510 set.complement();
1511 expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1512 set, xstart, xend);
1513
1514 set.set(0, 0x10FFFF);
1515 set.complement(start, end);
1516 set.complement();
1517 expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1518 set, xstart, xend);
1519 set.complement(start);
1520 }
1521
1522 const UChar32 DATA2[] = {
1523 0,
1524 0x10FFFF,
1525 (UChar32)-1,
1526 0x110000
1527 };
1528 const int32_t DATA2_LENGTH = UPRV_LENGTHOF(DATA2);
1529
1530 for (i=0; i<DATA2_LENGTH; ++i) {
1531 UChar32 c = DATA2[i], end = 0x10FFFF;
1532 UBool valid = (c >= 0 && c <= 0x10FFFF);
1533
1534 UnicodeSet set(0, 0x10FFFF);
1535
1536 // For single-codepoint contains, invalid codepoints are NOT contained
1537 UBool b = set.contains(c);
1538 if (b == valid) {
1539 logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1540 ") = " + b);
1541 } else {
1542 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1543 ") = " + b);
1544 }
1545
1546 // For codepoint range contains, containsNone, and containsSome,
1547 // invalid or empty (start > end) ranges have UNDEFINED behavior.
1548 b = set.contains(c, end);
1549 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1550 "," + end + ") = " + b);
1551
1552 b = set.containsNone(c, end);
1553 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1554 "," + end + ") = " + b);
1555
1556 b = set.containsSome(c, end);
1557 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1558 "," + end + ") = " + b);
1559
1560 int32_t index = set.indexOf(c);
1561 if ((index >= 0) == valid) {
1562 logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1563 ") = " + index);
1564 } else {
1565 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1566 ") = " + index);
1567 }
1568 }
1569 }
1570
1571 // Used by TestSymbolTable
1572 class TokenSymbolTable : public SymbolTable {
1573 public:
1574 Hashtable contents;
1575
1576 TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1577 contents.setValueDeleter(uprv_deleteUObject);
1578 }
1579
1580 ~TokenSymbolTable() {}
1581
1582 /**
1583 * (Non-SymbolTable API) Add the given variable and value to
1584 * the table. Variable should NOT contain leading '$'.
1585 */
1586 void add(const UnicodeString& var, const UnicodeString& value,
1587 UErrorCode& ec) {
1588 if (U_SUCCESS(ec)) {
1589 contents.put(var, new UnicodeString(value), ec);
1590 }
1591 }
1592
1593 /**
1594 * SymbolTable API
1595 */
1596 virtual const UnicodeString* lookup(const UnicodeString& s) const {
1597 return (const UnicodeString*) contents.get(s);
1598 }
1599
1600 /**
1601 * SymbolTable API
1602 */
1603 virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1604 return NULL;
1605 }
1606
1607 /**
1608 * SymbolTable API
1609 */
1610 virtual UnicodeString parseReference(const UnicodeString& text,
1611 ParsePosition& pos, int32_t limit) const {
1612 int32_t start = pos.getIndex();
1613 int32_t i = start;
1614 UnicodeString result;
1615 while (i < limit) {
1616 UChar c = text.charAt(i);
1617 if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1618 break;
1619 }
1620 ++i;
1621 }
1622 if (i == start) { // No valid name chars
1623 return result; // Indicate failure with empty string
1624 }
1625 pos.setIndex(i);
1626 text.extractBetween(start, i, result);
1627 return result;
1628 }
1629 };
1630
1631 void UnicodeSetTest::TestSymbolTable() {
1632 // Multiple test cases can be set up here. Each test case
1633 // is terminated by null:
1634 // var, value, var, value,..., input pat., exp. output pat., null
1635 const char* DATA[] = {
1636 "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1637 "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1638 "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1639 NULL
1640 };
1641
1642 for (int32_t i=0; DATA[i]!=NULL; ++i) {
1643 UErrorCode ec = U_ZERO_ERROR;
1644 TokenSymbolTable sym(ec);
1645 if (U_FAILURE(ec)) {
1646 errln("FAIL: couldn't construct TokenSymbolTable");
1647 continue;
1648 }
1649
1650 // Set up variables
1651 while (DATA[i+2] != NULL) {
1652 sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
1653 if (U_FAILURE(ec)) {
1654 errln("FAIL: couldn't add to TokenSymbolTable");
1655 continue;
1656 }
1657 i += 2;
1658 }
1659
1660 // Input pattern and expected output pattern
1661 UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
1662 i += 2;
1663
1664 ParsePosition pos(0);
1665 UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1666 if (U_FAILURE(ec)) {
1667 errln("FAIL: couldn't construct UnicodeSet");
1668 continue;
1669 }
1670
1671 // results
1672 if (pos.getIndex() != inpat.length()) {
1673 errln((UnicodeString)"Failed to read to end of string \""
1674 + inpat + "\": read to "
1675 + pos.getIndex() + ", length is "
1676 + inpat.length());
1677 }
1678
1679 UnicodeSet us2(exppat, ec);
1680 if (U_FAILURE(ec)) {
1681 errln("FAIL: couldn't construct expected UnicodeSet");
1682 continue;
1683 }
1684
1685 UnicodeString a, b;
1686 if (us != us2) {
1687 errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1688 ", expected " + us2.toPattern(b, TRUE));
1689 } else {
1690 logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1691 }
1692 }
1693 }
1694
1695 void UnicodeSetTest::TestSurrogate() {
1696 const char* DATA[] = {
1697 // These should all behave identically
1698 "[abc\\uD800\\uDC00]",
1699 // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1700 "[abc\\U00010000]",
1701 0
1702 };
1703 for (int i=0; DATA[i] != 0; ++i) {
1704 UErrorCode ec = U_ZERO_ERROR;
1705 logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
1706 UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
1707 UnicodeSet set(str, ec);
1708 if (U_FAILURE(ec)) {
1709 errln("FAIL: UnicodeSet constructor");
1710 continue;
1711 }
1712 expectContainment(set,
1713 CharsToUnicodeString("abc\\U00010000"),
1714 CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1715 if (set.size() != 4) {
1716 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
1717 set.size() + ", expected 4");
1718 }
1719
1720 {
1721 UErrorCode subErr = U_ZERO_ERROR;
1722 checkRoundTrip(set);
1723 checkSerializeRoundTrip(set, subErr);
1724 }
1725 }
1726 }
1727
1728 void UnicodeSetTest::TestExhaustive() {
1729 // exhaustive tests. Simulate UnicodeSets with integers.
1730 // That gives us very solid tests (except for large memory tests).
1731
1732 int32_t limit = 128;
1733
1734 UnicodeSet x, y, z, aa;
1735
1736 for (int32_t i = 0; i < limit; ++i) {
1737 bitsToSet(i, x);
1738 logln((UnicodeString)"Testing " + i + ", " + x);
1739 _testComplement(i, x, y);
1740
1741 UnicodeSet &toTest = bitsToSet(i, aa);
1742
1743 // AS LONG AS WE ARE HERE, check roundtrip
1744 checkRoundTrip(toTest);
1745 UErrorCode ec = U_ZERO_ERROR;
1746 checkSerializeRoundTrip(toTest, ec);
1747
1748 for (int32_t j = 0; j < limit; ++j) {
1749 _testAdd(i,j, x,y,z);
1750 _testXor(i,j, x,y,z);
1751 _testRetain(i,j, x,y,z);
1752 _testRemove(i,j, x,y,z);
1753 }
1754 }
1755 }
1756
1757 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1758 bitsToSet(a, x);
1759 z = x;
1760 z.complement();
1761 int32_t c = setToBits(z);
1762 if (c != (~a)) {
1763 errln((UnicodeString)"FAILED: add: ~" + x + " != " + z);
1764 errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1765 }
1766 checkCanonicalRep(z, (UnicodeString)"complement " + a);
1767 }
1768
1769 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1770 bitsToSet(a, x);
1771 bitsToSet(b, y);
1772 z = x;
1773 z.addAll(y);
1774 int32_t c = setToBits(z);
1775 if (c != (a | b)) {
1776 errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1777 errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1778 }
1779 checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1780 }
1781
1782 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1783 bitsToSet(a, x);
1784 bitsToSet(b, y);
1785 z = x;
1786 z.retainAll(y);
1787 int32_t c = setToBits(z);
1788 if (c != (a & b)) {
1789 errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1790 errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1791 }
1792 checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1793 }
1794
1795 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1796 bitsToSet(a, x);
1797 bitsToSet(b, y);
1798 z = x;
1799 z.removeAll(y);
1800 int32_t c = setToBits(z);
1801 if (c != (a &~ b)) {
1802 errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1803 errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1804 }
1805 checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1806 }
1807
1808 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1809 bitsToSet(a, x);
1810 bitsToSet(b, y);
1811 z = x;
1812 z.complementAll(y);
1813 int32_t c = setToBits(z);
1814 if (c != (a ^ b)) {
1815 errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1816 errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1817 }
1818 checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1819 }
1820
1821 /**
1822 * Check that ranges are monotonically increasing and non-
1823 * overlapping.
1824 */
1825 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1826 int32_t n = set.getRangeCount();
1827 if (n < 0) {
1828 errln((UnicodeString)"FAIL result of " + msg +
1829 ": range count should be >= 0 but is " +
1830 n /*+ " for " + set.toPattern())*/);
1831 return;
1832 }
1833 UChar32 last = 0;
1834 for (int32_t i=0; i<n; ++i) {
1835 UChar32 start = set.getRangeStart(i);
1836 UChar32 end = set.getRangeEnd(i);
1837 if (start > end) {
1838 errln((UnicodeString)"FAIL result of " + msg +
1839 ": range " + (i+1) +
1840 " start > end: " + (int)start + ", " + (int)end +
1841 " for " + set);
1842 }
1843 if (i > 0 && start <= last) {
1844 errln((UnicodeString)"FAIL result of " + msg +
1845 ": range " + (i+1) +
1846 " overlaps previous range: " + (int)start + ", " + (int)end +
1847 " for " + set);
1848 }
1849 last = end;
1850 }
1851 }
1852
1853 /**
1854 * Convert a bitmask to a UnicodeSet.
1855 */
1856 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1857 result.clear();
1858 for (UChar32 i = 0; i < 32; ++i) {
1859 if ((a & (1<<i)) != 0) {
1860 result.add(i);
1861 }
1862 }
1863 return result;
1864 }
1865
1866 /**
1867 * Convert a UnicodeSet to a bitmask. Only the characters
1868 * U+0000 to U+0020 are represented in the bitmask.
1869 */
1870 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1871 int32_t result = 0;
1872 for (int32_t i = 0; i < 32; ++i) {
1873 if (x.contains((UChar32)i)) {
1874 result |= (1<<i);
1875 }
1876 }
1877 return result;
1878 }
1879
1880 /**
1881 * Return the representation of an inversion list based UnicodeSet
1882 * as a pairs list. Ranges are listed in ascending Unicode order.
1883 * For example, the set [a-zA-M3] is represented as "33AMaz".
1884 */
1885 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1886 UnicodeString pairs;
1887 for (int32_t i=0; i<set.getRangeCount(); ++i) {
1888 UChar32 start = set.getRangeStart(i);
1889 UChar32 end = set.getRangeEnd(i);
1890 if (end > 0xFFFF) {
1891 end = 0xFFFF;
1892 i = set.getRangeCount(); // Should be unnecessary
1893 }
1894 pairs.append((UChar)start).append((UChar)end);
1895 }
1896 return pairs;
1897 }
1898
1899 /**
1900 * Basic consistency check for a few items.
1901 * That the iterator works, and that we can create a pattern and
1902 * get the same thing back
1903 */
1904 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1905 {
1906 UnicodeSet t(s);
1907 checkEqual(s, t, "copy ct");
1908 }
1909
1910 {
1911 UnicodeSet t(0xabcd, 0xdef0); // dummy contents should be overwritten
1912 t = s;
1913 checkEqual(s, t, "operator=");
1914 }
1915
1916 {
1917 UnicodeSet t;
1918 copyWithIterator(t, s, FALSE);
1919 checkEqual(s, t, "iterator roundtrip");
1920 }
1921
1922 {
1923 UnicodeSet t;
1924 copyWithIterator(t, s, TRUE); // try range
1925 checkEqual(s, t, "iterator roundtrip");
1926 }
1927
1928 {
1929 UnicodeSet t;
1930 UnicodeString pat;
1931 UErrorCode ec = U_ZERO_ERROR;
1932 s.toPattern(pat, FALSE);
1933 t.applyPattern(pat, ec);
1934 if (U_FAILURE(ec)) {
1935 errln("FAIL: toPattern(escapeUnprintable=FALSE), applyPattern - %s", u_errorName(ec));
1936 return;
1937 } else {
1938 checkEqual(s, t, "toPattern(false)");
1939 }
1940 }
1941
1942 {
1943 UnicodeSet t;
1944 UnicodeString pat;
1945 UErrorCode ec = U_ZERO_ERROR;
1946 s.toPattern(pat, TRUE);
1947 t.applyPattern(pat, ec);
1948 if (U_FAILURE(ec)) {
1949 errln("FAIL: toPattern(escapeUnprintable=TRUE), applyPattern - %s", u_errorName(ec));
1950 return;
1951 } else {
1952 checkEqual(s, t, "toPattern(true)");
1953 }
1954 }
1955 }
1956
1957 void UnicodeSetTest::checkSerializeRoundTrip(const UnicodeSet& t, UErrorCode &status) {
1958 if(U_FAILURE(status)) return;
1959 int32_t len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
1960 if(status == U_BUFFER_OVERFLOW_ERROR) {
1961 status = U_ZERO_ERROR;
1962 serializeBuffer.resize(len);
1963 len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
1964 // let 2nd error stand
1965 }
1966 if(U_FAILURE(status)) {
1967 errln("checkSerializeRoundTrip: error %s serializing buffer\n", u_errorName(status));
1968 return;
1969 }
1970 UnicodeSet deserialized(serializeBuffer.getAlias(), len, UnicodeSet::kSerialized, status);
1971 if(U_FAILURE(status)) {
1972 errln("checkSerializeRoundTrip: error %s deserializing buffer: buf %p len %d, original %d\n", u_errorName(status), serializeBuffer.getAlias(), len, t.getRangeCount());
1973 return;
1974 }
1975
1976 checkEqual(t, deserialized, "Set was unequal when deserialized");
1977 }
1978
1979 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
1980 t.clear();
1981 UnicodeSetIterator it(s);
1982 if (withRange) {
1983 while (it.nextRange()) {
1984 if (it.isString()) {
1985 t.add(it.getString());
1986 } else {
1987 t.add(it.getCodepoint(), it.getCodepointEnd());
1988 }
1989 }
1990 } else {
1991 while (it.next()) {
1992 if (it.isString()) {
1993 t.add(it.getString());
1994 } else {
1995 t.add(it.getCodepoint());
1996 }
1997 }
1998 }
1999 }
2000
2001 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
2002 assertEquals(UnicodeString("RangeCount: ","") + message, s.getRangeCount(), t.getRangeCount());
2003 assertEquals(UnicodeString("size: ","") + message, s.size(), t.size());
2004 UnicodeString source; s.toPattern(source, TRUE);
2005 UnicodeString result; t.toPattern(result, TRUE);
2006 if (s != t) {
2007 errln((UnicodeString)"FAIL: " + message
2008 + "; source = " + source
2009 + "; result = " + result
2010 );
2011 return FALSE;
2012 } else {
2013 logln((UnicodeString)"Ok: " + message
2014 + "; source = " + source
2015 + "; result = " + result
2016 );
2017 }
2018 return TRUE;
2019 }
2020
2021 void
2022 UnicodeSetTest::expectContainment(const UnicodeString& pat,
2023 const UnicodeString& charsIn,
2024 const UnicodeString& charsOut) {
2025 UErrorCode ec = U_ZERO_ERROR;
2026 UnicodeSet set(pat, ec);
2027 if (U_FAILURE(ec)) {
2028 dataerrln((UnicodeString)"FAIL: pattern \"" +
2029 pat + "\" => " + u_errorName(ec));
2030 return;
2031 }
2032 expectContainment(set, pat, charsIn, charsOut);
2033 }
2034
2035 void
2036 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2037 const UnicodeString& charsIn,
2038 const UnicodeString& charsOut) {
2039 UnicodeString pat;
2040 set.toPattern(pat);
2041 expectContainment(set, pat, charsIn, charsOut);
2042 }
2043
2044 void
2045 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2046 const UnicodeString& setName,
2047 const UnicodeString& charsIn,
2048 const UnicodeString& charsOut) {
2049 UnicodeString bad;
2050 UChar32 c;
2051 int32_t i;
2052
2053 for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
2054 c = charsIn.char32At(i);
2055 if (!set.contains(c)) {
2056 bad.append(c);
2057 }
2058 }
2059 if (bad.length() > 0) {
2060 errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
2061 ", expected containment of " + prettify(charsIn));
2062 } else {
2063 logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
2064 }
2065
2066 bad.truncate(0);
2067 for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
2068 c = charsOut.char32At(i);
2069 if (set.contains(c)) {
2070 bad.append(c);
2071 }
2072 }
2073 if (bad.length() > 0) {
2074 errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
2075 ", expected non-containment of " + prettify(charsOut));
2076 } else {
2077 logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
2078 }
2079 }
2080
2081 void
2082 UnicodeSetTest::expectPattern(UnicodeSet& set,
2083 const UnicodeString& pattern,
2084 const UnicodeString& expectedPairs){
2085 UErrorCode status = U_ZERO_ERROR;
2086 set.applyPattern(pattern, status);
2087 if (U_FAILURE(status)) {
2088 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2089 "\") failed");
2090 return;
2091 } else {
2092 if (getPairs(set) != expectedPairs ) {
2093 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2094 "\") => pairs \"" +
2095 escape(getPairs(set)) + "\", expected \"" +
2096 escape(expectedPairs) + "\"");
2097 } else {
2098 logln(UnicodeString("Ok: applyPattern(\"") + pattern +
2099 "\") => pairs \"" +
2100 escape(getPairs(set)) + "\"");
2101 }
2102 }
2103 // the result of calling set.toPattern(), which is the string representation of
2104 // this set(set), is passed to a UnicodeSet constructor, and tested that it
2105 // will produce another set that is equal to this one.
2106 UnicodeString temppattern;
2107 set.toPattern(temppattern);
2108 UnicodeSet *tempset=new UnicodeSet(temppattern, status);
2109 if (U_FAILURE(status)) {
2110 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
2111 return;
2112 }
2113 if(*tempset != set || getPairs(*tempset) != getPairs(set)){
2114 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
2115 escape(getPairs(set)) + "\""));
2116 } else{
2117 logln(UnicodeString("Ok: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2118 }
2119
2120 delete tempset;
2121
2122 }
2123
2124 void
2125 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2126 if (getPairs(set) != expectedPairs) {
2127 errln(UnicodeString("FAIL: Expected pair list \"") +
2128 escape(expectedPairs) + "\", got \"" +
2129 escape(getPairs(set)) + "\"");
2130 }
2131 }
2132
2133 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2134 const UnicodeString& expPat,
2135 const char** expStrings) {
2136 UnicodeString pat;
2137 set.toPattern(pat, TRUE);
2138 if (pat == expPat) {
2139 logln((UnicodeString)"Ok: toPattern() => \"" + pat + "\"");
2140 } else {
2141 errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2142 return;
2143 }
2144 if (expStrings == NULL) {
2145 return;
2146 }
2147 UBool in = TRUE;
2148 for (int32_t i=0; expStrings[i] != NULL; ++i) {
2149 if (expStrings[i] == NOT) { // sic; pointer comparison
2150 in = FALSE;
2151 continue;
2152 }
2153 UnicodeString s = CharsToUnicodeString(expStrings[i]);
2154 UBool contained = set.contains(s);
2155 if (contained == in) {
2156 logln((UnicodeString)"Ok: " + expPat +
2157 (contained ? " contains {" : " does not contain {") +
2158 escape(expStrings[i]) + "}");
2159 } else {
2160 errln((UnicodeString)"FAIL: " + expPat +
2161 (contained ? " contains {" : " does not contain {") +
2162 escape(expStrings[i]) + "}");
2163 }
2164 }
2165 }
2166
2167 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
2168
2169 void
2170 UnicodeSetTest::doAssert(UBool condition, const char *message)
2171 {
2172 if (!condition) {
2173 errln(UnicodeString("ERROR : ") + message);
2174 }
2175 }
2176
2177 UnicodeString
2178 UnicodeSetTest::escape(const UnicodeString& s) {
2179 UnicodeString buf;
2180 for (int32_t i=0; i<s.length(); )
2181 {
2182 UChar32 c = s.char32At(i);
2183 if (0x0020 <= c && c <= 0x007F) {
2184 buf += c;
2185 } else {
2186 if (c <= 0xFFFF) {
2187 buf += (UChar)0x5c; buf += (UChar)0x75;
2188 } else {
2189 buf += (UChar)0x5c; buf += (UChar)0x55;
2190 buf += toHexString((c & 0xF0000000) >> 28);
2191 buf += toHexString((c & 0x0F000000) >> 24);
2192 buf += toHexString((c & 0x00F00000) >> 20);
2193 buf += toHexString((c & 0x000F0000) >> 16);
2194 }
2195 buf += toHexString((c & 0xF000) >> 12);
2196 buf += toHexString((c & 0x0F00) >> 8);
2197 buf += toHexString((c & 0x00F0) >> 4);
2198 buf += toHexString(c & 0x000F);
2199 }
2200 i += U16_LENGTH(c);
2201 }
2202 return buf;
2203 }
2204
2205 void UnicodeSetTest::TestFreezable() {
2206 UErrorCode errorCode=U_ZERO_ERROR;
2207 UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
2208 UnicodeSet idSet(idPattern, errorCode);
2209 if(U_FAILURE(errorCode)) {
2210 dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
2211 return;
2212 }
2213
2214 UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
2215 UnicodeSet wsSet(wsPattern, errorCode);
2216 if(U_FAILURE(errorCode)) {
2217 dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
2218 return;
2219 }
2220
2221 idSet.add(idPattern);
2222 UnicodeSet frozen(idSet);
2223 frozen.freeze();
2224
2225 if(idSet.isFrozen() || !frozen.isFrozen()) {
2226 errln("FAIL: isFrozen() is wrong");
2227 }
2228 if(frozen!=idSet || !(frozen==idSet)) {
2229 errln("FAIL: a copy-constructed frozen set differs from its original");
2230 }
2231
2232 frozen=wsSet;
2233 if(frozen!=idSet || !(frozen==idSet)) {
2234 errln("FAIL: a frozen set was modified by operator=");
2235 }
2236
2237 UnicodeSet frozen2(frozen);
2238 if(frozen2!=frozen || frozen2!=idSet) {
2239 errln("FAIL: a copied frozen set differs from its frozen original");
2240 }
2241 if(!frozen2.isFrozen()) {
2242 errln("FAIL: copy-constructing a frozen set results in a thawed one");
2243 }
2244 UnicodeSet frozen3(5, 55); // Set to some values to really test assignment below, not copy construction.
2245 if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
2246 errln("FAIL: UnicodeSet(5, 55) failed");
2247 }
2248 frozen3=frozen;
2249 if(!frozen3.isFrozen()) {
2250 errln("FAIL: copying a frozen set results in a thawed one");
2251 }
2252
2253 UnicodeSet *cloned=(UnicodeSet *)frozen.clone();
2254 if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
2255 errln("FAIL: clone() failed");
2256 }
2257 cloned->add(0xd802, 0xd805);
2258 if(cloned->containsSome(0xd802, 0xd805)) {
2259 errln("FAIL: unable to modify clone");
2260 }
2261 delete cloned;
2262
2263 UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed();
2264 if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
2265 errln("FAIL: cloneAsThawed() failed");
2266 }
2267 thawed->add(0xd802, 0xd805);
2268 if(!thawed->contains(0xd802, 0xd805)) {
2269 errln("FAIL: unable to modify thawed clone");
2270 }
2271 delete thawed;
2272
2273 frozen.set(5, 55);
2274 if(frozen!=idSet || !(frozen==idSet)) {
2275 errln("FAIL: UnicodeSet::set() modified a frozen set");
2276 }
2277
2278 frozen.clear();
2279 if(frozen!=idSet || !(frozen==idSet)) {
2280 errln("FAIL: UnicodeSet::clear() modified a frozen set");
2281 }
2282
2283 frozen.closeOver(USET_CASE_INSENSITIVE);
2284 if(frozen!=idSet || !(frozen==idSet)) {
2285 errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
2286 }
2287
2288 frozen.compact();
2289 if(frozen!=idSet || !(frozen==idSet)) {
2290 errln("FAIL: UnicodeSet::compact() modified a frozen set");
2291 }
2292
2293 ParsePosition pos;
2294 frozen.
2295 applyPattern(wsPattern, errorCode).
2296 applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
2297 applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
2298 applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
2299 applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
2300 if(frozen!=idSet || !(frozen==idSet)) {
2301 errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
2302 }
2303
2304 frozen.
2305 add(0xd800).
2306 add(0xd802, 0xd805).
2307 add(wsPattern).
2308 addAll(idPattern).
2309 addAll(wsSet);
2310 if(frozen!=idSet || !(frozen==idSet)) {
2311 errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
2312 }
2313
2314 frozen.
2315 retain(0x62).
2316 retain(0x64, 0x69).
2317 retainAll(wsPattern).
2318 retainAll(wsSet);
2319 if(frozen!=idSet || !(frozen==idSet)) {
2320 errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
2321 }
2322
2323 frozen.
2324 remove(0x62).
2325 remove(0x64, 0x69).
2326 remove(idPattern).
2327 removeAll(idPattern).
2328 removeAll(idSet);
2329 if(frozen!=idSet || !(frozen==idSet)) {
2330 errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
2331 }
2332
2333 frozen.
2334 complement().
2335 complement(0x62).
2336 complement(0x64, 0x69).
2337 complement(idPattern).
2338 complementAll(idPattern).
2339 complementAll(idSet);
2340 if(frozen!=idSet || !(frozen==idSet)) {
2341 errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
2342 }
2343 }
2344
2345 // Test span() etc. -------------------------------------------------------- ***
2346
2347 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2348 static int32_t
2349 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
2350 UErrorCode errorCode=U_ZERO_ERROR;
2351 int32_t length8=0;
2352 u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2353 if(U_SUCCESS(errorCode)) {
2354 return length8;
2355 } else {
2356 // The string contains an unpaired surrogate.
2357 // Ignore this string.
2358 return 0;
2359 }
2360 }
2361
2362 class UnicodeSetWithStringsIterator;
2363
2364 // Make the strings in a UnicodeSet easily accessible.
2365 class UnicodeSetWithStrings {
2366 public:
2367 UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2368 set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
2369 int32_t size=set.size();
2370 if(size>0 && set.charAt(size-1)<0) {
2371 // If a set's last element is not a code point, then it must contain strings.
2372 // Iterate over the set, skip all code point ranges, and cache the strings.
2373 // Convert them to UTF-8 for spanUTF8().
2374 UnicodeSetIterator iter(set);
2375 const UnicodeString *s;
2376 char *s8=utf8;
2377 int32_t length8, utf8Count=0;
2378 while(iter.nextRange() && stringsLength<UPRV_LENGTHOF(strings)) {
2379 if(iter.isString()) {
2380 // Store the pointer to the set's string element
2381 // which we happen to know is a stable pointer.
2382 strings[stringsLength]=s=&iter.getString();
2383 utf8Count+=
2384 utf8Lengths[stringsLength]=length8=
2385 appendUTF8(s->getBuffer(), s->length(),
2386 s8, (int32_t)(sizeof(utf8)-utf8Count));
2387 if(length8==0) {
2388 hasSurrogates=TRUE; // Contains unpaired surrogates.
2389 }
2390 s8+=length8;
2391 ++stringsLength;
2392 }
2393 }
2394 }
2395 }
2396
2397 const UnicodeSet &getSet() const {
2398 return set;
2399 }
2400
2401 UBool hasStrings() const {
2402 return (UBool)(stringsLength>0);
2403 }
2404
2405 UBool hasStringsWithSurrogates() const {
2406 return hasSurrogates;
2407 }
2408
2409 private:
2410 friend class UnicodeSetWithStringsIterator;
2411
2412 const UnicodeSet &set;
2413
2414 const UnicodeString *strings[20];
2415 int32_t stringsLength;
2416 UBool hasSurrogates;
2417
2418 char utf8[1024];
2419 int32_t utf8Lengths[20];
2420 };
2421
2422 class UnicodeSetWithStringsIterator {
2423 public:
2424 UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2425 fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2426 }
2427
2428 void reset() {
2429 nextStringIndex=nextUTF8Start=0;
2430 }
2431
2432 const UnicodeString *nextString() {
2433 if(nextStringIndex<fSet.stringsLength) {
2434 return fSet.strings[nextStringIndex++];
2435 } else {
2436 return NULL;
2437 }
2438 }
2439
2440 // Do not mix with calls to nextString().
2441 const char *nextUTF8(int32_t &length) {
2442 if(nextStringIndex<fSet.stringsLength) {
2443 const char *s8=fSet.utf8+nextUTF8Start;
2444 nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2445 return s8;
2446 } else {
2447 length=0;
2448 return NULL;
2449 }
2450 }
2451
2452 private:
2453 const UnicodeSetWithStrings &fSet;
2454 int32_t nextStringIndex;
2455 int32_t nextUTF8Start;
2456 };
2457
2458 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
2459 // at code point boundaries.
2460 // That is, each edge of a match must not be in the middle of a surrogate pair.
2461 static inline UBool
2462 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
2463 s+=start;
2464 limit-=start;
2465 int32_t length=t.length();
2466 return 0==t.compare(s, length) &&
2467 !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2468 !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2469 }
2470
2471 // Implement span() with contains() for comparison.
// Reference implementation of a forward span over UTF-16 text, used to
// cross-check UnicodeSet::span(). It only uses UnicodeSet::contains() for
// code points and compares the set's cached strings explicitly.
//   set           - the set plus its cached strings
//   s, length     - the text to span; length must be non-negative here
//   spanCondition - USET_SPAN_NOT_CONTAINED, USET_SPAN_CONTAINED or USET_SPAN_SIMPLE
// Returns the index in s where the span ends.
2472 static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2473 USetSpanCondition spanCondition) {
2474 const UnicodeSet &realSet(set.getSet());
// Case 1: no strings. Walk code points until contains() disagrees with the
// condition; the UBool result is compared directly with the enum value.
2475 if(!set.hasStrings()) {
2476 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2477 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2478 }
2479
2480 UChar32 c;
2481 int32_t start=0, prev;
// prev remembers the index before the code point just read, so that the
// first non-matching code point is not included in the span.
2482 while((prev=start)<length) {
2483 U16_NEXT(s, start, length, c);
2484 if(realSet.contains(c)!=spanCondition) {
2485 break;
2486 }
2487 }
2488 return prev;
// Case 2: strings present, NOT_CONTAINED. Stop at the first position where
// the code point is in the set or any set string matches starting there.
2489 } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2490 UnicodeSetWithStringsIterator iter(set);
2491 UChar32 c;
2492 int32_t start, next;
2493 for(start=next=0; start<length;) {
2494 U16_NEXT(s, next, length, c);
2495 if(realSet.contains(c)) {
2496 break;
2497 }
2498 const UnicodeString *str;
2499 iter.reset();
2500 while((str=iter.nextString())!=NULL) {
2501 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2502 // spanNeedsStrings=TRUE;
2503 return start;
2504 }
2505 }
2506 start=next;
2507 }
2508 return start;
// Case 3: strings present, CONTAINED or SIMPLE. At each position, next is the
// end of the best element found so far (next==start means nothing matched).
// maxSpanLimit tracks the farthest limit reached via recursion.
2509 } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2510 UnicodeSetWithStringsIterator iter(set);
2511 UChar32 c;
2512 int32_t start, next, maxSpanLimit=0;
2513 for(start=next=0; start<length;) {
2514 U16_NEXT(s, next, length, c);
2515 if(!realSet.contains(c)) {
2516 next=start; // Do not span this single, not-contained code point.
2517 }
2518 const UnicodeString *str;
2519 iter.reset();
2520 while((str=iter.nextString())!=NULL) {
2521 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2522 // spanNeedsStrings=TRUE;
2523 int32_t matchLimit=start+str->length();
// A match that reaches the end of the text cannot be improved upon.
2524 if(matchLimit==length) {
2525 return length;
2526 }
2527 if(spanCondition==USET_SPAN_CONTAINED) {
2528 // Iterate for the shortest match at each position.
2529 // Recurse for each but the shortest match.
2530 if(next==start) {
2531 next=matchLimit; // First match from start.
2532 } else {
2533 if(matchLimit<next) {
2534 // Remember shortest match from start for iteration.
// Swap, so that next holds the shorter limit and matchLimit the longer one,
// which is then explored by the recursive call below.
2535 int32_t temp=next;
2536 next=matchLimit;
2537 matchLimit=temp;
2538 }
2539 // Recurse for non-shortest match from start.
2540 int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
2541 USET_SPAN_CONTAINED);
2542 if((matchLimit+spanLength)>maxSpanLimit) {
2543 maxSpanLimit=matchLimit+spanLength;
2544 if(maxSpanLimit==length) {
2545 return length;
2546 }
2547 }
2548 }
2549 } else /* spanCondition==USET_SPAN_SIMPLE */ {
2550 if(matchLimit>next) {
2551 // Remember longest match from start.
2552 next=matchLimit;
2553 }
2554 }
2555 }
2556 }
2557 if(next==start) {
2558 break; // No match from start.
2559 }
2560 start=next;
2561 }
// The result is the farther of the iterative and the recursive span limits.
2562 if(start>maxSpanLimit) {
2563 return start;
2564 } else {
2565 return maxSpanLimit;
2566 }
2567 }
2568 }
2569
// Reference implementation of a backward span over UTF-16 text, used to
// cross-check UnicodeSet::spanBack(). It only uses UnicodeSet::contains() for
// code points and compares the set's cached strings explicitly.
//   set           - the set plus its cached strings
//   s, length     - the text to span backward from its end
//   spanCondition - USET_SPAN_NOT_CONTAINED, USET_SPAN_CONTAINED or USET_SPAN_SIMPLE
// Returns the index in s where the backward span starts.
// Note: the length parameter is reused as the moving cursor (U16_PREV
// decrements it); prev is the cursor position before the current step.
2570 static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2571 USetSpanCondition spanCondition) {
// The do-while loops below read at least one code point, so handle empty text first.
2572 if(length==0) {
2573 return 0;
2574 }
2575 const UnicodeSet &realSet(set.getSet());
// Case 1: no strings. Walk code points backward until contains() disagrees
// with the condition; the UBool result is compared directly with the enum value.
2576 if(!set.hasStrings()) {
2577 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2578 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2579 }
2580
2581 UChar32 c;
2582 int32_t prev=length;
2583 do {
2584 U16_PREV(s, 0, length, c);
2585 if(realSet.contains(c)!=spanCondition) {
2586 break;
2587 }
2588 } while((prev=length)>0);
2589 return prev;
// Case 2: strings present, NOT_CONTAINED. Stop at the first position (going
// backward) where the code point is in the set or any set string ends there.
2590 } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2591 UnicodeSetWithStringsIterator iter(set);
2592 UChar32 c;
// length0 keeps the original text length for the boundary check in matches16CPB().
2593 int32_t prev=length, length0=length;
2594 do {
2595 U16_PREV(s, 0, length, c);
2596 if(realSet.contains(c)) {
2597 break;
2598 }
2599 const UnicodeString *str;
2600 iter.reset();
2601 while((str=iter.nextString())!=NULL) {
2602 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2603 // spanNeedsStrings=TRUE;
2604 return prev;
2605 }
2606 }
2607 } while((prev=length)>0);
2608 return prev;
// Case 3: strings present, CONTAINED or SIMPLE. At each position, length is
// the start of the best element found so far (length==prev means nothing
// matched). minSpanStart tracks the earliest start reached via recursion.
2609 } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2610 UnicodeSetWithStringsIterator iter(set);
2611 UChar32 c;
2612 int32_t prev=length, minSpanStart=length, length0=length;
2613 do {
2614 U16_PREV(s, 0, length, c);
2615 if(!realSet.contains(c)) {
2616 length=prev; // Do not span this single, not-contained code point.
2617 }
2618 const UnicodeString *str;
2619 iter.reset();
2620 while((str=iter.nextString())!=NULL) {
2621 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2622 // spanNeedsStrings=TRUE;
2623 int32_t matchStart=prev-str->length();
// A match that reaches the start of the text cannot be improved upon.
2624 if(matchStart==0) {
2625 return 0;
2626 }
2627 if(spanCondition==USET_SPAN_CONTAINED) {
2628 // Iterate for the shortest match at each position.
2629 // Recurse for each but the shortest match.
2630 if(length==prev) {
2631 length=matchStart; // First match from prev.
2632 } else {
2633 if(matchStart>length) {
2634 // Remember shortest match from prev for iteration.
// Swap, so that length holds the shorter match's start and matchStart the
// longer match's start, which is then explored by the recursive call below.
2635 int32_t temp=length;
2636 length=matchStart;
2637 matchStart=temp;
2638 }
2639 // Recurse for non-shortest match from prev.
2640 int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
2641 USET_SPAN_CONTAINED);
2642 if(spanStart<minSpanStart) {
2643 minSpanStart=spanStart;
2644 if(minSpanStart==0) {
2645 return 0;
2646 }
2647 }
2648 }
2649 } else /* spanCondition==USET_SPAN_SIMPLE */ {
2650 if(matchStart<length) {
2651 // Remember longest match from prev.
2652 length=matchStart;
2653 }
2654 }
2655 }
2656 }
2657 if(length==prev) {
2658 break; // No match from prev.
2659 }
2660 } while((prev=length)>0);
// The result is the earlier of the iterative and the recursive span starts.
2661 if(prev<minSpanStart) {
2662 return prev;
2663 } else {
2664 return minSpanStart;
2665 }
2666 }
2667 }
2668
// Reference implementation of a forward span over UTF-8 text, used to
// cross-check UnicodeSet::spanUTF8(). Code points are read with
// U8_NEXT_OR_FFFD and tested with UnicodeSet::contains(); the set's strings
// are compared bytewise (memcmp) against their cached UTF-8 forms.
//   set           - the set plus its cached strings
//   s, length     - the UTF-8 text to span; length must be non-negative here
//   spanCondition - USET_SPAN_NOT_CONTAINED, USET_SPAN_CONTAINED or USET_SPAN_SIMPLE
// Returns the byte index in s where the span ends.
2669 static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2670 USetSpanCondition spanCondition) {
2671 const UnicodeSet &realSet(set.getSet());
// Case 1: no strings. Walk code points until contains() disagrees with the
// condition; the UBool result is compared directly with the enum value.
2672 if(!set.hasStrings()) {
2673 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2674 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2675 }
2676
2677 UChar32 c;
2678 int32_t start=0, prev;
2679 while((prev=start)<length) {
2680 U8_NEXT_OR_FFFD(s, start, length, c);
2681 if(realSet.contains(c)!=spanCondition) {
2682 break;
2683 }
2684 }
2685 return prev;
// Case 2: strings present, NOT_CONTAINED. Stop at the first position where
// the code point is in the set or any set string matches starting there.
2686 } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2687 UnicodeSetWithStringsIterator iter(set);
2688 UChar32 c;
2689 int32_t start, next;
2690 for(start=next=0; start<length;) {
2691 U8_NEXT_OR_FFFD(s, next, length, c);
2692 if(realSet.contains(c)) {
2693 break;
2694 }
2695 const char *s8;
2696 int32_t length8;
2697 iter.reset();
2698 while((s8=iter.nextUTF8(length8))!=NULL) {
// length8==0 marks a string whose UTF-8 conversion yielded nothing; skip it.
2699 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2700 // spanNeedsStrings=TRUE;
2701 return start;
2702 }
2703 }
2704 start=next;
2705 }
2706 return start;
// Case 3: strings present, CONTAINED or SIMPLE. At each position, next is the
// end of the best element found so far (next==start means nothing matched).
// maxSpanLimit tracks the farthest limit reached via recursion.
2707 } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2708 UnicodeSetWithStringsIterator iter(set);
2709 UChar32 c;
2710 int32_t start, next, maxSpanLimit=0;
2711 for(start=next=0; start<length;) {
2712 U8_NEXT_OR_FFFD(s, next, length, c);
2713 if(!realSet.contains(c)) {
2714 next=start; // Do not span this single, not-contained code point.
2715 }
2716 const char *s8;
2717 int32_t length8;
2718 iter.reset();
2719 while((s8=iter.nextUTF8(length8))!=NULL) {
2720 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2721 // spanNeedsStrings=TRUE;
2722 int32_t matchLimit=start+length8;
// A match that reaches the end of the text cannot be improved upon.
2723 if(matchLimit==length) {
2724 return length;
2725 }
2726 if(spanCondition==USET_SPAN_CONTAINED) {
2727 // Iterate for the shortest match at each position.
2728 // Recurse for each but the shortest match.
2729 if(next==start) {
2730 next=matchLimit; // First match from start.
2731 } else {
2732 if(matchLimit<next) {
2733 // Remember shortest match from start for iteration.
// Swap, so that next holds the shorter limit and matchLimit the longer one,
// which is then explored by the recursive call below.
2734 int32_t temp=next;
2735 next=matchLimit;
2736 matchLimit=temp;
2737 }
2738 // Recurse for non-shortest match from start.
2739 int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
2740 USET_SPAN_CONTAINED);
2741 if((matchLimit+spanLength)>maxSpanLimit) {
2742 maxSpanLimit=matchLimit+spanLength;
2743 if(maxSpanLimit==length) {
2744 return length;
2745 }
2746 }
2747 }
2748 } else /* spanCondition==USET_SPAN_SIMPLE */ {
2749 if(matchLimit>next) {
2750 // Remember longest match from start.
2751 next=matchLimit;
2752 }
2753 }
2754 }
2755 }
2756 if(next==start) {
2757 break; // No match from start.
2758 }
2759 start=next;
2760 }
// The result is the farther of the iterative and the recursive span limits.
2761 if(start>maxSpanLimit) {
2762 return start;
2763 } else {
2764 return maxSpanLimit;
2765 }
2766 }
2767 }
2768
// Reference implementation of a backward span over UTF-8 text, used to
// cross-check UnicodeSet::spanBackUTF8(). Code points are read with
// U8_PREV_OR_FFFD and tested with UnicodeSet::contains(); the set's strings
// are compared bytewise (memcmp) against their cached UTF-8 forms.
//   set           - the set plus its cached strings
//   s, length     - the UTF-8 text to span backward from its end
//   spanCondition - USET_SPAN_NOT_CONTAINED, USET_SPAN_CONTAINED or USET_SPAN_SIMPLE
// Returns the byte index in s where the backward span starts.
// Note: the length parameter is reused as the moving cursor (U8_PREV_OR_FFFD
// decrements it); prev is the cursor position before the current step.
2769 static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2770 USetSpanCondition spanCondition) {
// The do-while loops below read at least one code point, so handle empty text first.
2771 if(length==0) {
2772 return 0;
2773 }
2774 const UnicodeSet &realSet(set.getSet());
// Case 1: no strings. Walk code points backward until contains() disagrees
// with the condition; the UBool result is compared directly with the enum value.
2775 if(!set.hasStrings()) {
2776 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2777 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2778 }
2779
2780 UChar32 c;
2781 int32_t prev=length;
2782 do {
2783 U8_PREV_OR_FFFD(s, 0, length, c);
2784 if(realSet.contains(c)!=spanCondition) {
2785 break;
2786 }
2787 } while((prev=length)>0);
2788 return prev;
// Case 2: strings present, NOT_CONTAINED. Stop at the first position (going
// backward) where the code point is in the set or any set string ends there.
2789 } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2790 UnicodeSetWithStringsIterator iter(set);
2791 UChar32 c;
2792 int32_t prev=length;
2793 do {
2794 U8_PREV_OR_FFFD(s, 0, length, c);
2795 if(realSet.contains(c)) {
2796 break;
2797 }
2798 const char *s8;
2799 int32_t length8;
2800 iter.reset();
2801 while((s8=iter.nextUTF8(length8))!=NULL) {
// length8==0 marks a string whose UTF-8 conversion yielded nothing; skip it.
2802 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2803 // spanNeedsStrings=TRUE;
2804 return prev;
2805 }
2806 }
2807 } while((prev=length)>0);
2808 return prev;
// Case 3: strings present, CONTAINED or SIMPLE. At each position, length is
// the start of the best element found so far (length==prev means nothing
// matched). minSpanStart tracks the earliest start reached via recursion.
2809 } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2810 UnicodeSetWithStringsIterator iter(set);
2811 UChar32 c;
2812 int32_t prev=length, minSpanStart=length;
2813 do {
2814 U8_PREV_OR_FFFD(s, 0, length, c);
2815 if(!realSet.contains(c)) {
2816 length=prev; // Do not span this single, not-contained code point.
2817 }
2818 const char *s8;
2819 int32_t length8;
2820 iter.reset();
2821 while((s8=iter.nextUTF8(length8))!=NULL) {
2822 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2823 // spanNeedsStrings=TRUE;
2824 int32_t matchStart=prev-length8;
// A match that reaches the start of the text cannot be improved upon.
2825 if(matchStart==0) {
2826 return 0;
2827 }
2828 if(spanCondition==USET_SPAN_CONTAINED) {
2829 // Iterate for the shortest match at each position.
2830 // Recurse for each but the shortest match.
2831 if(length==prev) {
2832 length=matchStart; // First match from prev.
2833 } else {
2834 if(matchStart>length) {
2835 // Remember shortest match from prev for iteration.
// Swap, so that length holds the shorter match's start and matchStart the
// longer match's start, which is then explored by the recursive call below.
2836 int32_t temp=length;
2837 length=matchStart;
2838 matchStart=temp;
2839 }
2840 // Recurse for non-shortest match from prev.
2841 int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
2842 USET_SPAN_CONTAINED);
2843 if(spanStart<minSpanStart) {
2844 minSpanStart=spanStart;
2845 if(minSpanStart==0) {
2846 return 0;
2847 }
2848 }
2849 }
2850 } else /* spanCondition==USET_SPAN_SIMPLE */ {
2851 if(matchStart<length) {
2852 // Remember longest match from prev.
2853 length=matchStart;
2854 }
2855 }
2856 }
2857 }
2858 if(length==prev) {
2859 break; // No match from prev.
2860 }
2861 } while((prev=length)>0);
// The result is the earlier of the iterative and the recursive span starts.
2862 if(prev<minSpanStart) {
2863 return prev;
2864 } else {
2865 return minSpanStart;
2866 }
2867 }
2868 }
2869
// Bit flags selecting which spans are performed and compared.
// Each group has two single-bit options and a mask combining them.
enum {
    // Text encodings.
    SPAN_UTF16      = 1 << 0,
    SPAN_UTF8       = 1 << 1,
    SPAN_UTFS       = SPAN_UTF16 | SPAN_UTF8,

    // The set itself and/or its complement.
    SPAN_SET        = 1 << 2,
    SPAN_COMPLEMENT = 1 << 3,
    SPAN_POLARITY   = SPAN_SET | SPAN_COMPLEMENT,

    // Span directions.
    SPAN_FWD        = 1 << 4,
    SPAN_BACK       = 1 << 5,
    SPAN_DIRS       = SPAN_FWD | SPAN_BACK,

    // Span conditions used for "contained" spans.
    SPAN_CONTAINED  = 1 << 8,
    SPAN_SIMPLE     = 1 << 9,
    SPAN_CONDITION  = SPAN_CONTAINED | SPAN_SIMPLE,

    // Everything.
    SPAN_ALL        = SPAN_UTFS | SPAN_POLARITY | SPAN_DIRS | SPAN_CONDITION
};
2890
2891 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
2892 return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
2893 }
2894
2895 static inline int32_t slen(const void *s, UBool isUTF16) {
2896 return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
2897 }
2898
2899 /*
2900 * Count spans on a string with the method according to type and set the span limits.
2901 * The set may be the complement of the original.
2902 * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
2903 * according to the expected number of spans.
2904 * Sets typeName to an empty string if there is no such type.
2905 * Returns -1 if the span option is filtered out.
2906 */
// Parameters:
//   set, isComplement - the set to span with, and whether it is the complement of the original
//   s, length, isUTF16 - the text (UChar or char units); length<0 means NUL-terminated
//   whichSpans        - SPAN_* flags filtering directions and span conditions
//   type, typeName    - 0..7 selects the method (see the name tables below); typeName receives its name
//   limits, limitsCapacity - output array for the span limits, in ascending order
//   expectCount       - expected number of spans, used only to pick the first spanBack() condition
// Returns the number of spans found, which may exceed limitsCapacity
// (only limitsCapacity limits are stored then).
2907 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
2908 const void *s, int32_t length, UBool isUTF16,
2909 uint32_t whichSpans,
2910 int type, const char *&typeName,
2911 int32_t limits[], int32_t limitsCapacity,
2912 int32_t expectCount) {
2913 const UnicodeSet &realSet(set.getSet());
2914 int32_t start, count;
2915 USetSpanCondition spanCondition, firstSpanCondition, contained;
2916 UBool isForward;
2917
// An out-of-range type signals the caller that all types have been tried.
2918 if(type<0 || 7<type) {
2919 typeName="";
2920 return 0;
2921 }
2922
// Type layout: types 0..3 go forward, 4..7 backward; within each direction the
// first pair uses the containsSpan*() reference functions of this file and the
// second pair the UnicodeSet span API; even types use USET_SPAN_CONTAINED,
// odd types ("LM") use USET_SPAN_SIMPLE.
2923 static const char *const typeNames16[]={
2924 "contains", "contains(LM)",
2925 "span", "span(LM)",
2926 "containsBack", "containsBack(LM)",
2927 "spanBack", "spanBack(LM)"
2928 };
2929
2930 static const char *const typeNames8[]={
2931 "containsUTF8", "containsUTF8(LM)",
2932 "spanUTF8", "spanUTF8(LM)",
2933 "containsBackUTF8", "containsBackUTF8(LM)", // handled by containsSpanBackUTF8() in case 4/5 below
2934 "spanBackUTF8", "spanBackUTF8(LM)"
2935 };
2936
2937 typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
2938
2939 // filter span options
2940 if(type<=3) {
2941 // span forward
2942 if((whichSpans&SPAN_FWD)==0) {
2943 return -1;
2944 }
2945 isForward=TRUE;
2946 } else {
2947 // span backward
2948 if((whichSpans&SPAN_BACK)==0) {
2949 return -1;
2950 }
2951 isForward=FALSE;
2952 }
2953 if((type&1)==0) {
2954 // use USET_SPAN_CONTAINED
2955 if((whichSpans&SPAN_CONTAINED)==0) {
2956 return -1;
2957 }
2958 contained=USET_SPAN_CONTAINED;
2959 } else {
2960 // use USET_SPAN_SIMPLE
2961 if((whichSpans&SPAN_SIMPLE)==0) {
2962 return -1;
2963 }
2964 contained=USET_SPAN_SIMPLE;
2965 }
2966
2967 // Default first span condition for going forward with an uncomplemented set.
2968 spanCondition=USET_SPAN_NOT_CONTAINED;
2969 if(isComplement) {
2970 spanCondition=invertSpanCondition(spanCondition, contained);
2971 }
2972
2973 // First span condition for span(), used to terminate the spanBack() iteration.
2974 firstSpanCondition=spanCondition;
2975
2976 // spanBack(): Its initial span condition is span()'s last span condition,
2977 // which is the opposite of span()'s first span condition
2978 // if we expect an even number of spans.
2979 // (The loop inverts spanCondition (expectCount-1) times
2980 // before the expectCount'th span() call.)
2981 // If we do not compare forward and backward directions, then we do not have an
2982 // expectCount and just start with firstSpanCondition.
2983 if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
2984 spanCondition=invertSpanCondition(spanCondition, contained);
2985 }
2986
2987 count=0;
2988 switch(type) {
// Forward, reference implementation: alternate the span condition until the end of the text.
2989 case 0:
2990 case 1:
2991 start=0;
2992 if(length<0) {
2993 length=slen(s, isUTF16);
2994 }
2995 for(;;) {
2996 start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
2997 containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
2998 if(count<limitsCapacity) {
2999 limits[count]=start;
3000 }
3001 ++count;
3002 if(start>=length) {
3003 break;
3004 }
3005 spanCondition=invertSpanCondition(spanCondition, contained);
3006 }
3007 break;
// Forward, UnicodeSet API: a negative length is passed through unchanged, and
// the loop then ends at the NUL terminator.
3008 case 2:
3009 case 3:
3010 start=0;
3011 for(;;) {
3012 start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
3013 realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
3014 if(count<limitsCapacity) {
3015 limits[count]=start;
3016 }
3017 ++count;
3018 if(length>=0 ? start>=length :
3019 isUTF16 ? ((const UChar *)s)[start]==0 :
3020 ((const char *)s)[start]==0
3021 ) {
3022 break;
3023 }
3024 spanCondition=invertSpanCondition(spanCondition, contained);
3025 }
3026 break;
// Backward, reference implementation: limits are stored from the end of the
// array toward its front, then moved to the front if the array is not full.
3027 case 4:
3028 case 5:
3029 if(length<0) {
3030 length=slen(s, isUTF16);
3031 }
3032 for(;;) {
3033 ++count;
3034 if(count<=limitsCapacity) {
3035 limits[limitsCapacity-count]=length;
3036 }
3037 length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
3038 containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
// Done when the start of the text is reached with the same condition that a
// forward span would start with.
3039 if(length==0 && spanCondition==firstSpanCondition) {
3040 break;
3041 }
3042 spanCondition=invertSpanCondition(spanCondition, contained);
3043 }
// count*4: byte count for count int32_t elements (assumes sizeof(int32_t)==4).
3044 if(count<limitsCapacity) {
3045 memmove(limits, limits+(limitsCapacity-count), count*4);
3046 }
3047 break;
// Backward, UnicodeSet API: same storage scheme as case 4/5.
3048 case 6:
3049 case 7:
3050 for(;;) {
3051 ++count;
3052 if(count<=limitsCapacity) {
3053 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
3054 }
3055 // Note: Length<0 is tested only for the first spanBack().
3056 // If we wanted to keep length<0 for all spanBack()s, we would have to
3057 // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
3058 length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
3059 realSet.spanBackUTF8((const char *)s, length, spanCondition);
3060 if(length==0 && spanCondition==firstSpanCondition) {
3061 break;
3062 }
3063 spanCondition=invertSpanCondition(spanCondition, contained);
3064 }
3065 if(count<limitsCapacity) {
3066 memmove(limits, limits+(limitsCapacity-count), count*4);
3067 }
3068 break;
// Not reachable for type 0..7; other values were rejected at the top.
3069 default:
3070 typeName="";
3071 return -1;
3072 }
3073
3074 return count;
3075 }
3076
// Indexes of the sets to be tested; an odd index means the set is a complement.
enum {
    SLOW      = 0,
    SLOW_NOT  = 1,
    FAST      = 2,
    FAST_NOT  = 3,
    SET_COUNT = 4
};

// Display names for the sets, indexed by the enum constants above.
static const char *const setNames[SET_COUNT]={
    "slow", "slow.not",
    "fast", "fast.not"
};
3092
3093 /*
3094 * Verify that we get the same results whether we look at text with contains(),
3095 * span() or spanBack(), using unfrozen or frozen versions of the set,
3096 * and using the set or its complement (switching the spanConditions accordingly).
3097 * The latter verifies that
3098 * set.span(spanCondition) == set.complement().span(!spanCondition).
3099 *
3100 * The expectLimits[] are either provided by the caller (with expectCount>=0)
3101 * or returned to the caller (with an input expectCount<0).
3102 */
3103 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3104 const void *s, int32_t length, UBool isUTF16,
3105 uint32_t whichSpans,
3106 int32_t expectLimits[], int32_t &expectCount,
3107 const char *testName, int32_t index) {
3108 int32_t limits[500];
3109 int32_t limitsCount;
3110 int i, j;
3111
3112 const char *typeName;
3113 int type;
3114
3115 for(i=0; i<SET_COUNT; ++i) {
3116 if((i&1)==0) {
3117 // Even-numbered sets are original, uncomplemented sets.
3118 if((whichSpans&SPAN_SET)==0) {
3119 continue;
3120 }
3121 } else {
3122 // Odd-numbered sets are complemented.
3123 if((whichSpans&SPAN_COMPLEMENT)==0) {
3124 continue;
3125 }
3126 }
3127 for(type=0;; ++type) {
3128 limitsCount=getSpans(*sets[i], (UBool)(i&1),
3129 s, length, isUTF16,
3130 whichSpans,
3131 type, typeName,
3132 limits, UPRV_LENGTHOF(limits), expectCount);
3133 if(typeName[0]==0) {
3134 break; // All types tried.
3135 }
3136 if(limitsCount<0) {
3137 continue; // Span option filtered out.
3138 }
3139 if(expectCount<0) {
3140 expectCount=limitsCount;
3141 if(limitsCount>UPRV_LENGTHOF(limits)) {
3142 errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3143 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)UPRV_LENGTHOF(limits));
3144 return;
3145 }
3146 memcpy(expectLimits, limits, limitsCount*4);
3147 } else if(limitsCount!=expectCount) {
3148 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3149 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
3150 } else {
3151 for(j=0; j<limitsCount; ++j) {
3152 if(limits[j]!=expectLimits[j]) {
3153 errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3154 testName, (long)index, setNames[i], typeName, (long)limitsCount,
3155 j, (long)limits[j], (long)expectLimits[j]);
3156 break;
3157 }
3158 }
3159 }
3160 }
3161 }
3162
3163 // Compare span() with containsAll()/containsNone(),
3164 // but only if we have expectLimits[] from the uncomplemented set.
3165 if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
3166 const UChar *s16=(const UChar *)s;
3167 UnicodeString string;
3168 int32_t prev=0, limit, length;
3169 for(i=0; i<expectCount; ++i) {
3170 limit=expectLimits[i];
3171 length=limit-prev;
3172 if(length>0) {
3173 string.setTo(FALSE, s16+prev, length); // read-only alias
3174 if(i&1) {
3175 if(!sets[SLOW]->getSet().containsAll(string)) {
3176 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3177 testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3178 return;
3179 }
3180 if(!sets[FAST]->getSet().containsAll(string)) {
3181 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3182 testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3183 return;
3184 }
3185 } else {
3186 if(!sets[SLOW]->getSet().containsNone(string)) {
3187 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3188 testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3189 return;
3190 }
3191 if(!sets[FAST]->getSet().containsNone(string)) {
3192 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3193 testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3194 return;
3195 }
3196 }
3197 }
3198 prev=limit;
3199 }
3200 }
3201 }
3202
3203 // Specifically test either UTF-16 or UTF-8.
3204 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3205 const void *s, int32_t length, UBool isUTF16,
3206 uint32_t whichSpans,
3207 const char *testName, int32_t index) {
3208 int32_t expectLimits[500];
3209 int32_t expectCount=-1;
3210 testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3211 }
3212
3213 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
3214 UChar c, c2;
3215
3216 if(length>=0) {
3217 while(length>0) {
3218 c=*s++;
3219 --length;
3220 if(0xd800<=c && c<0xe000) {
3221 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3222 return TRUE;
3223 }
3224 --length;
3225 }
3226 }
3227 } else {
3228 while((c=*s++)!=0) {
3229 if(0xd800<=c && c<0xe000) {
3230 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3231 return TRUE;
3232 }
3233 }
3234 }
3235 }
3236 return FALSE;
3237 }
3238
3239 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3240 // unless either UTF is turned off in whichSpans.
3241 // Testing UTF-16 and UTF-8 together requires that surrogate code points
3242 // have the same contains(c) value as U+FFFD.
3243 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3244 const UChar *s16, int32_t length16,
3245 uint32_t whichSpans,
3246 const char *testName, int32_t index) {
3247 int32_t expectLimits[500];
3248 int32_t expectCount;
3249
3250 expectCount=-1; // Get expectLimits[] from testSpan().
3251
3252 if((whichSpans&SPAN_UTF16)!=0) {
3253 testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
3254 }
3255 if((whichSpans&SPAN_UTF8)==0) {
3256 return;
3257 }
3258
3259 // Convert s16[] and expectLimits[] to UTF-8.
3260 uint8_t s8[3000];
3261 int32_t offsets[3000];
3262
3263 const UChar *s16Limit=s16+length16;
3264 char *t=(char *)s8;
3265 char *tLimit=t+sizeof(s8);
3266 int32_t *o=offsets;
3267 UErrorCode errorCode=U_ZERO_ERROR;
3268
3269 // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3270 ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
3271 if(U_FAILURE(errorCode)) {
3272 errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3273 testName, (long)index, u_errorName(errorCode));
3274 ucnv_resetFromUnicode(utf8Cnv);
3275 return;
3276 }
3277 int32_t length8=(int32_t)(t-(char *)s8);
3278
3279 // Convert expectLimits[].
3280 int32_t i, j, expect;
3281 for(i=j=0; i<expectCount; ++i) {
3282 expect=expectLimits[i];
3283 if(expect==length16) {
3284 expectLimits[i]=length8;
3285 } else {
3286 while(offsets[j]<expect) {
3287 ++j;
3288 }
3289 expectLimits[i]=j;
3290 }
3291 }
3292
3293 testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
3294 }
3295
3296 static UChar32 nextCodePoint(UChar32 c) {
3297 // Skip some large and boring ranges.
3298 switch(c) {
3299 case 0x3441:
3300 return 0x4d7f;
3301 case 0x5100:
3302 return 0x9f00;
3303 case 0xb040:
3304 return 0xd780;
3305 case 0xe041:
3306 return 0xf8fe;
3307 case 0x10100:
3308 return 0x20000;
3309 case 0x20041:
3310 return 0xe0000;
3311 case 0xe0101:
3312 return 0x10fffd;
3313 default:
3314 return c+1;
3315 }
3316 }
3317
3318 // Verify that all implementations represent the same set.
3319 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3320 // contains(U+FFFD) is inconsistent with contains(some surrogates),
3321 // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3322 // Skip the UTF-8 part of the test - if the string contains surrogates -
3323 // because it is likely to produce a different result.
3324 UBool inconsistentSurrogates=
3325 (!(sets[0]->getSet().contains(0xfffd) ?
3326 sets[0]->getSet().contains(0xd800, 0xdfff) :
3327 sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3328 sets[0]->hasStringsWithSurrogates());
3329
3330 UChar s[1000];
3331 int32_t length=0;
3332 uint32_t localWhichSpans;
3333
3334 UChar32 c, first;
3335 for(first=c=0;; c=nextCodePoint(c)) {
3336 if(c>0x10ffff || length>(UPRV_LENGTHOF(s)-U16_MAX_LENGTH)) {
3337 localWhichSpans=whichSpans;
3338 if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3339 localWhichSpans&=~SPAN_UTF8;
3340 }
3341 testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3342 if(c>0x10ffff) {
3343 break;
3344 }
3345 length=0;
3346 first=c;
3347 }
3348 U16_APPEND_UNSAFE(s, length, c);
3349 }
3350 }
3351
3352 // Test with a particular, interesting string.
3353 // Specify length and try NUL-termination.
3354 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3355 static const UChar s[]={
3356 0x61, 0x62, 0x20, // Latin, space
3357 0x3b1, 0x3b2, 0x3b3, // Greek
3358 0xd900, // lead surrogate
3359 0x3000, 0x30ab, 0x30ad, // wide space, Katakana
3360 0xdc05, // trail surrogate
3361 0xa0, 0xac00, 0xd7a3, // nbsp, Hangul
3362 0xd900, 0xdc05, // unassigned supplementary
3363 0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary
3364 0xd7a4, 0xdc05, 0xd900, 0x2028, // unassigned, surrogates in wrong order, LS
3365 0 // NUL
3366 };
3367
3368 if((whichSpans&SPAN_UTF16)==0) {
3369 return;
3370 }
3371 testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
3372 testSpan(sets, s, UPRV_LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
3373 }
3374
3375 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3376 static const char s[]={
3377 "abc" // Latin
3378
3379 /* trail byte in lead position */
3380 "\x80"
3381
3382 " " // space
3383
3384 /* truncated multi-byte sequences */
3385 "\xd0"
3386 "\xe0"
3387 "\xe1"
3388 "\xed"
3389 "\xee"
3390 "\xf0"
3391 "\xf1"
3392 "\xf4"
3393 "\xf8"
3394 "\xfc"
3395
3396 "\xCE\xB1\xCE\xB2\xCE\xB3" // Greek
3397
3398 /* trail byte in lead position */
3399 "\x80"
3400
3401 "\xe0\x80"
3402 "\xe0\xa0"
3403 "\xe1\x80"
3404 "\xed\x80"
3405 "\xed\xa0"
3406 "\xee\x80"
3407 "\xf0\x80"
3408 "\xf0\x90"
3409 "\xf1\x80"
3410 "\xf4\x80"
3411 "\xf4\x90"
3412 "\xf8\x80"
3413 "\xfc\x80"
3414
3415 "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD" // wide space, Katakana
3416
3417 /* trail byte in lead position */
3418 "\x80"
3419
3420 "\xf0\x80\x80"
3421 "\xf0\x90\x80"
3422 "\xf1\x80\x80"
3423 "\xf4\x80\x80"
3424 "\xf4\x90\x80"
3425 "\xf8\x80\x80"
3426 "\xfc\x80\x80"
3427
3428 "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3" // nbsp, Hangul
3429
3430 /* trail byte in lead position */
3431 "\x80"
3432
3433 "\xf8\x80\x80\x80"
3434 "\xfc\x80\x80\x80"
3435
3436 "\xF1\x90\x80\x85" // unassigned supplementary
3437
3438 /* trail byte in lead position */
3439 "\x80"
3440
3441 "\xfc\x80\x80\x80\x80"
3442
3443 "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE" // Han supplementary
3444
3445 /* trail byte in lead position */
3446 "\x80"
3447
3448 /* complete sequences but non-shortest forms or out of range etc. */
3449 "\xc0\x80"
3450 "\xe0\x80\x80"
3451 "\xed\xa0\x80"
3452 "\xf0\x80\x80\x80"
3453 "\xf4\x90\x80\x80"
3454 "\xf8\x80\x80\x80\x80"
3455 "\xfc\x80\x80\x80\x80\x80"
3456 "\xfe"
3457 "\xff"
3458
3459 /* trail byte in lead position */
3460 "\x80"
3461
3462 "\xED\x9E\xA4\xE2\x80\xA8" // unassigned, LS, NUL-terminated
3463 };
3464
3465 if((whichSpans&SPAN_UTF8)==0) {
3466 return;
3467 }
3468 testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
3469 testSpan(sets, s, UPRV_LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
3470 }
3471
// Multiply a list of span option sets so that each resulting entry
// carries exactly one of the alternatives a, b and c.
//
// Every existing entry is first reduced with mask. The entry itself then
// gets alternative a; if b!=0, a copy with b is stored whichSpansCount
// entries further on; if additionally c!=0, a copy with c is stored
// 2*whichSpansCount entries further on. (c is ignored when b==0.)
//
// The caller must provide room for the additional entries.
// Returns the new number of entries: 1x, 2x or 3x whichSpansCount.
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    // How many alternatives are in effect?
    int32_t alternatives;
    if(b==0) {
        alternatives=1;
    } else if(c==0) {
        alternatives=2;
    } else {
        alternatives=3;
    }

    for(int32_t k=0; k<whichSpansCount; ++k) {
        const uint32_t base=whichSpans[k]&mask;
        whichSpans[k]=base|a;
        if(alternatives>=2) {
            whichSpans[whichSpansCount+k]=base|b;
        }
        if(alternatives==3) {
            whichSpans[2*whichSpansCount+k]=base|c;
        }
    }
    return alternatives*whichSpansCount;
}
3494
// String literals of 63 and 64 repetitions of 'a' and of 'b'.
// Each is built from adjacent string literals (eight characters per piece)
// which the compiler concatenates; the 64-character forms add one character
// to the 63-character forms.
#define _63_a "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaa"
#define _64_a _63_a "a"
#define _63_b "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbb"
#define _64_b _63_b "b"
3499
3500 void UnicodeSetTest::TestSpan() {
3501 // "[...]" is a UnicodeSet pattern.
3502 // "*" performs tests on all Unicode code points and on a selection of
3503 // malformed UTF-8/16 strings.
3504 // "-options" limits the scope of testing for the current set.
3505 // By default, the test verifies that equivalent boundaries are found
3506 // for UTF-16 and UTF-8, going forward and backward,
3507 // alternating USET_SPAN_NOT_CONTAINED with
3508 // either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3509 // Single-character options:
3510 // 8 -- UTF-16 and UTF-8 boundaries may differ.
3511 // Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3512 // or the set contains strings with unpaired surrogates
3513 // which do not translate to valid UTF-8.
3514 // c -- set.span() and set.complement().span() boundaries may differ.
3515 // Cause: Set strings are not complemented.
3516 // b -- span() and spanBack() boundaries may differ.
3517 // Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3518 // and spanBack(USET_SPAN_SIMPLE) are defined to
3519 // match with non-overlapping substrings.
3520 // For example, with a set containing "ab" and "ba",
3521 // span() of "aba" yields boundaries { 0, 2, 3 }
3522 // because the initial "ab" matches from 0 to 2,
3523 // while spanBack() yields boundaries { 0, 1, 3 }
3524 // because the final "ba" matches from 1 to 3.
3525 // l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3526 // Cause: Strings in the set overlap, and a longer match may
3527 // require a sequence including non-longest substrings.
3528 // For example, with a set containing "ab", "abc" and "cd",
3529 // span(contained) of "abcd" spans the entire string
3530 // but span(longest match) only spans the first 3 characters.
3531 // Each "-options" first resets all options and then applies the specified options.
3532 // A "-" without options resets the options.
3533 // The options are also reset for each new set.
3534 // Other strings will be spanned.
3535 static const char *const testdata[]={
3536 "[:ID_Continue:]",
3537 "*",
3538 "[:White_Space:]",
3539 "*",
3540 "[]",
3541 "*",
3542 "[\\u0000-\\U0010FFFF]",
3543 "*",
3544 "[\\u0000\\u0080\\u0800\\U00010000]",
3545 "*",
3546 "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3547 "*",
3548 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3549 "-c",
3550 "*",
3551 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3552 "-c",
3553 "*",
3554
3555 // Overlapping strings cause overlapping attempts to match.
3556 "[x{xy}{xya}{axy}{ax}]",
3557 "-cl",
3558
3559 // More repetitions of "xya" would take too long with the recursive
3560 // reference implementation.
3561 // containsAll()=FALSE
3562 // test_string 0x14
3563 "xx"
3564 "xyaxyaxyaxya" // set.complement().span(longest match) will stop here.
3565 "xx" // set.complement().span(contained) will stop between the two 'x'es.
3566 "xyaxyaxyaxya"
3567 "xx"
3568 "xyaxyaxyaxya" // span() ends here.
3569 "aaa",
3570
3571 // containsAll()=TRUE
3572 // test_string 0x15
3573 "xx"
3574 "xyaxyaxyaxya"
3575 "xx"
3576 "xyaxyaxyaxya"
3577 "xx"
3578 "xyaxyaxyaxy",
3579
3580 "-bc",
3581 // test_string 0x17
3582 "byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 }
3583 "-c",
3584 "byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 }
3585 "byayax", // span() -> { 4, 6 } complement.span() -> { 6 }
3586 "-",
3587 "byaya", // span() -> { 5 }
3588 "byay", // span() -> { 4 }
3589 "bya", // span() -> { 3 }
3590
3591 // span(longest match) will not span the whole string.
3592 "[a{ab}{bc}]",
3593 "-cl",
3594 // test_string 0x21
3595 "abc",
3596
3597 "[a{ab}{abc}{cd}]",
3598 "-cl",
3599 "acdabcdabccd",
3600
3601 // spanBack(longest match) will not span the whole string.
3602 "[c{ab}{bc}]",
3603 "-cl",
3604 "abc",
3605
3606 "[d{cd}{bcd}{ab}]",
3607 "-cl",
3608 "abbcdabcdabd",
3609
3610 // Test with non-ASCII set strings - test proper handling of surrogate pairs
3611 // and UTF-8 trail bytes.
3612 // Copies of above test sets and strings, but transliterated to have
3613 // different code points with similar trail units.
3614 // Previous: a b c d
3615 // Unicode: 042B 30AB 200AB 204AB
3616 // UTF-16: 042B 30AB D840 DCAB D841 DCAB
3617 // UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB
3618 "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3619 "-cl",
3620 "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3621
3622 "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3623 "-cl",
3624 "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3625
3626 // Stress bookkeeping and recursion.
3627 // The following strings are barely doable with the recursive
3628 // reference implementation.
3629 // The not-contained character at the end prevents an early exit from the span().
3630 "[b{bb}]",
3631 "-c",
3632 // test_string 0x33
3633 "bbbbbbbbbbbbbbbbbbbbbbbb-",
3634 // On complement sets, span() and spanBack() get different results
3635 // because b is not in the complement set and there is an odd number of b's
3636 // in the test string.
3637 "-bc",
3638 "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3639
3640 // Test with set strings with an initial or final code point span
3641 // longer than 254.
3642 "[a{" _64_a _64_a _64_a _64_a "b}"
3643 "{a" _64_b _64_b _64_b _64_b "}]",
3644 "-c",
3645 _64_a _64_a _64_a _63_a "b",
3646 _64_a _64_a _64_a _64_a "b",
3647 _64_a _64_a _64_a _64_a "aaaabbbb",
3648 "a" _64_b _64_b _64_b _63_b,
3649 "a" _64_b _64_b _64_b _64_b,
3650 "aaaabbbb" _64_b _64_b _64_b _64_b,
3651
3652 // Test with strings containing unpaired surrogates.
3653 // They are not representable in UTF-8, and a leading trail surrogate
3654 // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3655 // U+20001 == \\uD840\\uDC01
3656 // U+20400 == \\uD841\\uDC00
3657 "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3658 "-8cl",
3659 "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3660 };
3661 uint32_t whichSpans[96]={ SPAN_ALL };
3662 int32_t whichSpansCount=1;
3663
3664 UnicodeSet *sets[SET_COUNT]={ NULL };
3665 const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
3666
3667 char testName[1024];
3668 char *testNameLimit=testName;
3669
3670 int32_t i, j;
3671 for(i=0; i<UPRV_LENGTHOF(testdata); ++i) {
3672 const char *s=testdata[i];
3673 if(s[0]=='[') {
3674 // Create new test sets from this pattern.
3675 for(j=0; j<SET_COUNT; ++j) {
3676 delete sets_with_str[j];
3677 delete sets[j];
3678 }
3679 UErrorCode errorCode=U_ZERO_ERROR;
3680 sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3681 if(U_FAILURE(errorCode)) {
3682 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
3683 break;
3684 }
3685 sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3686 sets[SLOW_NOT]->complement();
3687 // Intermediate set: Test cloning of a frozen set.
3688 UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3689 fast->freeze();
3690 sets[FAST]=(UnicodeSet *)fast->clone();
3691 delete fast;
3692 UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3693 fastNot->freeze();
3694 sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
3695 delete fastNot;
3696
3697 for(j=0; j<SET_COUNT; ++j) {
3698 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3699 }
3700
3701 strcpy(testName, s);
3702 testNameLimit=strchr(testName, 0);
3703 *testNameLimit++=':';
3704 *testNameLimit=0;
3705
3706 whichSpans[0]=SPAN_ALL;
3707 whichSpansCount=1;
3708 } else if(s[0]=='-') {
3709 whichSpans[0]=SPAN_ALL;
3710 whichSpansCount=1;
3711
3712 while(*++s!=0) {
3713 switch(*s) {
3714 case 'c':
3715 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3716 ~SPAN_POLARITY,
3717 SPAN_SET,
3718 SPAN_COMPLEMENT,
3719 0);
3720 break;
3721 case 'b':
3722 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3723 ~SPAN_DIRS,
3724 SPAN_FWD,
3725 SPAN_BACK,
3726 0);
3727 break;
3728 case 'l':
3729 // test USET_SPAN_CONTAINED FWD & BACK, and separately
3730 // USET_SPAN_SIMPLE only FWD, and separately
3731 // USET_SPAN_SIMPLE only BACK
3732 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3733 ~(SPAN_DIRS|SPAN_CONDITION),
3734 SPAN_DIRS|SPAN_CONTAINED,
3735 SPAN_FWD|SPAN_SIMPLE,
3736 SPAN_BACK|SPAN_SIMPLE);
3737 break;
3738 case '8':
3739 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3740 ~SPAN_UTFS,
3741 SPAN_UTF16,
3742 SPAN_UTF8,
3743 0);
3744 break;
3745 default:
3746 errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3747 break;
3748 }
3749 }
3750 } else if(0==strcmp(s, "*")) {
3751 strcpy(testNameLimit, "bad_string");
3752 for(j=0; j<whichSpansCount; ++j) {
3753 if(whichSpansCount>1) {
3754 sprintf(testNameLimit+10 /* strlen("bad_string") */,
3755 "%%0x%3x",
3756 whichSpans[j]);
3757 }
3758 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3759 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3760 }
3761
3762 strcpy(testNameLimit, "contents");
3763 for(j=0; j<whichSpansCount; ++j) {
3764 if(whichSpansCount>1) {
3765 sprintf(testNameLimit+8 /* strlen("contents") */,
3766 "%%0x%3x",
3767 whichSpans[j]);
3768 }
3769 testSpanContents(sets_with_str, whichSpans[j], testName);
3770 }
3771 } else {
3772 UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3773 strcpy(testNameLimit, "test_string");
3774 for(j=0; j<whichSpansCount; ++j) {
3775 if(whichSpansCount>1) {
3776 sprintf(testNameLimit+11 /* strlen("test_string") */,
3777 "%%0x%3x",
3778 whichSpans[j]);
3779 }
3780 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3781 }
3782 }
3783 }
3784 for(j=0; j<SET_COUNT; ++j) {
3785 delete sets_with_str[j];
3786 delete sets[j];
3787 }
3788 }
3789
3790 // Test select patterns and strings, and test USET_SPAN_SIMPLE.
3791 void UnicodeSetTest::TestStringSpan() {
3792 static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3793 static const char *const string=
3794 "xx"
3795 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3796 "xx"
3797 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3798 "xx"
3799 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3800 "aaaa";
3801
3802 UErrorCode errorCode=U_ZERO_ERROR;
3803 UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3804 UnicodeSet set(pattern16, errorCode);
3805 if(U_FAILURE(errorCode)) {
3806 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3807 return;
3808 }
3809
3810 UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3811
3812 if(set.containsAll(string16)) {
3813 errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
3814 }
3815
3816 // Remove trailing "aaaa".
3817 string16.truncate(string16.length()-4);
3818 if(!set.containsAll(string16)) {
3819 errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
3820 }
3821
3822 string16=UNICODE_STRING_SIMPLE("byayaxya");
3823 const UChar *s16=string16.getBuffer();
3824 int32_t length16=string16.length();
3825 (void)length16; // Suppress set but not used warning.
3826 if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3827 set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3828 set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3829 set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3830 set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3831 set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3832 ) {
3833 errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3834 }
3835
3836 pattern="[a{ab}{abc}{cd}]";
3837 pattern16=UnicodeString(pattern, -1, US_INV);
3838 set.applyPattern(pattern16, errorCode);
3839 if(U_FAILURE(errorCode)) {
3840 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3841 return;
3842 }
3843 string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
3844 s16=string16.getBuffer();
3845 length16=string16.length();
3846 if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3847 set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3848 set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3849 ) {
3850 errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3851 }
3852
3853 pattern="[d{cd}{bcd}{ab}]";
3854 pattern16=UnicodeString(pattern, -1, US_INV);
3855 set.applyPattern(pattern16, errorCode).freeze();
3856 if(U_FAILURE(errorCode)) {
3857 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3858 return;
3859 }
3860 string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
3861 s16=string16.getBuffer();
3862 length16=string16.length();
3863 if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3864 set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3865 set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3866 ) {
3867 errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3868 }
3869 }
3870
3871 /**
3872 * Including collationroot.h fails here with
3873 1>c:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\include\driverspecs.h(142): error C2008: '$' : unexpected in macro definition
3874 * .. so, we skip this test on Windows.
3875 *
3876 * the cause is that intltest builds with /Za which disables language extensions - which means
3877 * windows header files can't be used.
3878 */
3879 #if !UCONFIG_NO_COLLATION && !U_PLATFORM_HAS_WIN32_API
3880 #include "collationroot.h"
3881 #include "collationtailoring.h"
3882 #endif
3883
3884 void UnicodeSetTest::TestUCAUnsafeBackwards() {
3885 #if U_PLATFORM_HAS_WIN32_API
3886 infoln("Skipping TestUCAUnsafeBackwards() - can't include collationroot.h on Windows without language extensions!");
3887 #elif !UCONFIG_NO_COLLATION
3888 UErrorCode errorCode = U_ZERO_ERROR;
3889
3890 // Get the unsafeBackwardsSet
3891 const CollationCacheEntry *rootEntry = CollationRoot::getRootCacheEntry(errorCode);
3892 if(U_FAILURE(errorCode)) {
3893 dataerrln("FAIL: %s getting root cache entry", u_errorName(errorCode));
3894 return;
3895 }
3896 //const UVersionInfo &version = rootEntry->tailoring->version;
3897 const UnicodeSet *unsafeBackwardSet = rootEntry->tailoring->unsafeBackwardSet;
3898
3899 checkSerializeRoundTrip(*unsafeBackwardSet, errorCode);
3900
3901 if(!logKnownIssue("11891","UnicodeSet fails to round trip on CollationRoot...unsafeBackwards set")) {
3902 // simple test case
3903 // TODO(ticket #11891): Simplify this test function to this simple case. Rename it appropriately.
3904 // TODO(ticket #11891): Port test to Java. Is this a bug there, too?
3905 UnicodeSet surrogates;
3906 surrogates.add(0xd83a); // a lead surrogate
3907 surrogates.add(0xdc00, 0xdfff); // a range of trail surrogates
3908 UnicodeString pat;
3909 surrogates.toPattern(pat, FALSE); // bad: [ 0xd83a, 0xdc00, 0x2d, 0xdfff ]
3910 // TODO: Probably fix either UnicodeSet::_generatePattern() or _appendToPat()
3911 // so that at least one type of surrogate code points are escaped,
3912 // or (minimally) so that adjacent lead+trail surrogate code points are escaped.
3913 errorCode = U_ZERO_ERROR;
3914 UnicodeSet s2;
3915 s2.applyPattern(pat, errorCode); // looks like invalid range [ 0x1e800, 0x2d, 0xdfff ]
3916 if(U_FAILURE(errorCode)) {
3917 errln("FAIL: surrogates to/from pattern - %s", u_errorName(errorCode));
3918 } else {
3919 checkEqual(surrogates, s2, "surrogates to/from pattern");
3920 }
3921 // This occurs in the UCA unsafe-backwards set.
3922 checkRoundTrip(*unsafeBackwardSet);
3923 }
3924 #endif
3925 }