]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/intltest/usettest.cpp
ICU-511.34.tar.gz
[apple/icu.git] / icuSources / test / intltest / usettest.cpp
CommitLineData
b75a7d8f 1/*
46f4442e 2********************************************************************************
51004dcb 3* Copyright (C) 1999-2012 International Business Machines Corporation and
b75a7d8f 4* others. All Rights Reserved.
46f4442e 5********************************************************************************
b75a7d8f
A
6* Date Name Description
7* 10/20/99 alan Creation.
8* 03/22/2000 Madhu Added additional tests
46f4442e 9********************************************************************************
b75a7d8f
A
10*/
11
46f4442e
A
12#include <stdio.h>
13
14#include <string.h>
b75a7d8f
A
15#include "unicode/utypes.h"
16#include "usettest.h"
46f4442e 17#include "unicode/ucnv.h"
b75a7d8f
A
18#include "unicode/uniset.h"
19#include "unicode/uchar.h"
20#include "unicode/usetiter.h"
21#include "unicode/ustring.h"
374ca955
A
22#include "unicode/parsepos.h"
23#include "unicode/symtable.h"
73c04bcf 24#include "unicode/uversion.h"
374ca955 25#include "hash.h"
b75a7d8f 26
/**
 * Number of elements in a statically sized array, as an int32_t.
 * The whole expansion is parenthesized so that it behaves as a single
 * primary expression wherever it is used (e.g. after a unary `sizeof`).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
73c04bcf
A
28
/**
 * Reports a failure through dataerrln() - with the source file, line and
 * ICU error name - when `status` holds a failure code.
 */
#define TEST_ASSERT_SUCCESS(status) { \
    if (U_FAILURE(status)) { \
        dataerrln("fail in file \"%s\", line %d: \"%s\"", \
                  __FILE__, __LINE__, u_errorName(status)); \
    } \
}

/**
 * Reports a failure through dataerrln() - with the source file and line -
 * when `expr` evaluates to false.
 */
#define TEST_ASSERT(expr) { \
    if (!(expr)) { \
        dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); \
    } \
}
73c04bcf 35
b75a7d8f
A
36UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
37 UnicodeString pat;
38 set.toPattern(pat);
39 return left + UnicodeSetTest::escape(pat);
40}
41
/**
 * Expands to one `case` label of the switch in runIndexedTest():
 * stores the test's name and, when `exec` is set, logs a header line and
 * runs the test method. The caller supplies the trailing semicolon.
 */
#define CASE(id,test)           \
    case id:                    \
        name = #test;           \
        if (exec) {             \
            logln(#test "---"); \
            logln();            \
            test();             \
        }                       \
        break
50
46f4442e
A
51UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
52}
53
54UConverter *UnicodeSetTest::openUTF8Converter() {
55 if(utf8Cnv==NULL) {
56 UErrorCode errorCode=U_ZERO_ERROR;
57 utf8Cnv=ucnv_open("UTF-8", &errorCode);
58 }
59 return utf8Cnv;
60}
61
62UnicodeSetTest::~UnicodeSetTest() {
63 ucnv_close(utf8Cnv);
64}
65
b75a7d8f
A
66void
67UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
68 const char* &name, char* /*par*/) {
69 // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
70 switch (index) {
71 CASE(0,TestPatterns);
72 CASE(1,TestAddRemove);
73 CASE(2,TestCategories);
74 CASE(3,TestCloneEqualHash);
75 CASE(4,TestMinimalRep);
76 CASE(5,TestAPI);
77 CASE(6,TestScriptSet);
78 CASE(7,TestPropertySet);
79 CASE(8,TestClone);
80 CASE(9,TestExhaustive);
81 CASE(10,TestToPattern);
82 CASE(11,TestIndexOf);
83 CASE(12,TestStrings);
374ca955
A
84 CASE(13,Testj2268);
85 CASE(14,TestCloseOver);
86 CASE(15,TestEscapePattern);
87 CASE(16,TestInvalidCodePoint);
88 CASE(17,TestSymbolTable);
89 CASE(18,TestSurrogate);
73c04bcf
A
90 CASE(19,TestPosixClasses);
91 CASE(20,TestIteration);
46f4442e
A
92 CASE(21,TestFreezable);
93 CASE(22,TestSpan);
94 CASE(23,TestStringSpan);
b75a7d8f
A
95 default: name = ""; break;
96 }
97}
98
374ca955
A
/**
 * Sentinel entry used inside the expected-string lists passed to
 * expectToPattern(). NOTE(review): it appears to separate strings expected
 * to be in the set from strings expected to be absent - confirm against
 * expectToPattern().
 */
static const char NOT[] = "%%%%";
100
b75a7d8f
A
101/**
102 * UVector was improperly copying contents
103 * This code will crash this is still true
104 */
105void UnicodeSetTest::Testj2268() {
106 UnicodeSet t;
107 t.add(UnicodeString("abc"));
108 UnicodeSet test(t);
109 UnicodeString ustrPat;
110 test.toPattern(ustrPat, TRUE);
111}
112
113/**
374ca955 114 * Test toPattern().
b75a7d8f
A
115 */
116void UnicodeSetTest::TestToPattern() {
374ca955 117 UErrorCode ec = U_ZERO_ERROR;
b75a7d8f 118
374ca955
A
119 // Test that toPattern() round trips with syntax characters and
120 // whitespace.
121 {
122 static const char* OTHER_TOPATTERN_TESTS[] = {
123 "[[:latin:]&[:greek:]]",
124 "[[:latin:]-[:greek:]]",
125 "[:nonspacing mark:]",
126 NULL
127 };
128
129 for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
130 ec = U_ZERO_ERROR;
131 UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
132 if (U_FAILURE(ec)) {
729e4ab9 133 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
b75a7d8f
A
134 continue;
135 }
374ca955
A
136 checkPat(OTHER_TOPATTERN_TESTS[j], s);
137 }
138
139 for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
140 if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
141
142 // check various combinations to make sure they all work.
143 if (i != 0 && !toPatternAux(i, i)){
144 continue;
145 }
146 if (!toPatternAux(0, i)){
147 continue;
148 }
149 if (!toPatternAux(i, 0xFFFF)){
150 continue;
151 }
b75a7d8f
A
152 }
153 }
154 }
374ca955
A
155
156 // Test pattern behavior of multicharacter strings.
157 {
158 ec = U_ZERO_ERROR;
159 UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
160
161 // This loop isn't a loop. It's here to make the compiler happy.
162 // If you're curious, try removing it and changing the 'break'
163 // statements (except for the last) to goto's.
164 for (;;) {
165 if (U_FAILURE(ec)) break;
166 const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
167 expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
168
169 s->add("ac");
170 const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
171 expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
172
46f4442e 173 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
374ca955
A
174 if (U_FAILURE(ec)) break;
175 const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
46f4442e 176 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
374ca955
A
177
178 s->add("[]");
179 const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
46f4442e 180 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
374ca955 181
46f4442e 182 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
374ca955
A
183 if (U_FAILURE(ec)) break;
184 const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
46f4442e 185 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
374ca955
A
186
187 // j2189
188 s->clear();
189 s->add(UnicodeString("abc", ""));
190 s->add(UnicodeString("abc", ""));
191 const char* exp6[] = {"abc", NOT, "ab", NULL};
192 expectToPattern(*s, "[{abc}]", exp6);
193
194 break;
195 }
196
197 if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
198 delete s;
199 }
200
201 // JB#3400: For 2 character ranges prefer [ab] to [a-b]
202 UnicodeSet s;
203 s.add((UChar)97, (UChar)98); // 'a', 'b'
204 expectToPattern(s, "[ab]", NULL);
b75a7d8f
A
205}
206
207UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
208
209 // use Integer.toString because Utility.hex doesn't handle ints
210 UnicodeString pat = "";
211 // TODO do these in hex
212 //String source = "0x" + Integer.toString(start,16).toUpperCase();
213 //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
214 UnicodeString source;
215 source = source + (uint32_t)start;
216 if (start != end)
217 source = source + ".." + (uint32_t)end;
218 UnicodeSet testSet;
219 testSet.add(start, end);
220 return checkPat(source, testSet);
221}
222
223UBool UnicodeSetTest::checkPat(const UnicodeString& source,
224 const UnicodeSet& testSet) {
225 // What we want to make sure of is that a pattern generated
226 // by toPattern(), with or without escaped unprintables, can
227 // be passed back into the UnicodeSet constructor.
228 UnicodeString pat0;
229
230 testSet.toPattern(pat0, TRUE);
231
232 if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
233
234 //String pat1 = unescapeLeniently(pat0);
235 //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
236
237 UnicodeString pat2;
238 testSet.toPattern(pat2, FALSE);
239 if (!checkPat(source, testSet, pat2)) return FALSE;
240
241 //String pat3 = unescapeLeniently(pat2);
242 // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
243
244 //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
245 logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
246 return TRUE;
247}
248
249UBool UnicodeSetTest::checkPat(const UnicodeString& source,
250 const UnicodeSet& testSet,
251 const UnicodeString& pat) {
252 UErrorCode ec = U_ZERO_ERROR;
253 UnicodeSet testSet2(pat, ec);
254 if (testSet2 != testSet) {
255 errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
256 return FALSE;
257 }
258 return TRUE;
259}
260
261void
262UnicodeSetTest::TestPatterns(void) {
263 UnicodeSet set;
264 expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km");
265 expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz");
266 expectPattern(set, UnicodeString("[a\\-z]", ""), "--aazz");
267 expectPattern(set, UnicodeString("[-az]", ""), "--aazz");
268 expectPattern(set, UnicodeString("[az-]", ""), "--aazz");
269 expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
270
271 // Throw in a test of complement
272 set.complement();
273 UnicodeString exp;
274 exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
275 expectPairs(set, exp);
276}
277
278void
279UnicodeSetTest::TestCategories(void) {
280 UErrorCode status = U_ZERO_ERROR;
281 const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
282 UnicodeSet set(pat, status);
283 if (U_FAILURE(status)) {
729e4ab9
A
284 dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
285 return;
b75a7d8f
A
286 } else {
287 expectContainment(set, pat, "ABC", "abc");
288 }
289
290 UChar32 i;
291 int32_t failures = 0;
292 // Make sure generation of L doesn't pollute cached Lu set
293 // First generate L, then Lu
294 set.applyPattern("[:L:]", status);
295 if (U_FAILURE(status)) { errln("FAIL"); return; }
296 for (i=0; i<0x200; ++i) {
297 UBool l = u_isalpha((UChar)i);
298 if (l != set.contains(i)) {
299 errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
300 set.contains(i));
301 if (++failures == 10) break;
302 }
303 }
304
305 set.applyPattern("[:Lu:]", status);
306 if (U_FAILURE(status)) { errln("FAIL"); return; }
307 for (i=0; i<0x200; ++i) {
308 UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
309 if (lu != set.contains(i)) {
310 errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
311 set.contains(i));
312 if (++failures == 20) break;
313 }
314 }
315}
316void
317UnicodeSetTest::TestCloneEqualHash(void) {
318 UErrorCode status = U_ZERO_ERROR;
319 // set1 and set2 used to be built with the obsolete constructor taking
320 // UCharCategory values; replaced with pattern constructors
321 // markus 20030502
46f4442e
A
322 UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); // :Ll: Letter, lowercase
323 UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); // Letter, lowercase
b75a7d8f 324 if (U_FAILURE(status)){
729e4ab9 325 dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
b75a7d8f
A
326 return;
327 }
46f4442e
A
328 UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status); //Number, Decimal digit
329 UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status); //Number, Decimal digit
b75a7d8f
A
330 if (U_FAILURE(status)){
331 errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
332 return;
333 }
334
335 if (*set1 != *set1a) {
336 errln("FAIL: category constructor for Ll broken");
337 }
338 if (*set2 != *set2a) {
339 errln("FAIL: category constructor for Nd broken");
340 }
341 delete set1a;
342 delete set2a;
343
344 logln("Testing copy construction");
345 UnicodeSet *set1copy=new UnicodeSet(*set1);
346 if(*set1 != *set1copy || *set1 == *set2 ||
347 getPairs(*set1) != getPairs(*set1copy) ||
348 set1->hashCode() != set1copy->hashCode()){
349 errln("FAIL : Error in copy construction");
350 return;
351 }
352
353 logln("Testing =operator");
354 UnicodeSet set1equal=*set1;
355 UnicodeSet set2equal=*set2;
356 if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
357 set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
358 errln("FAIL: Error in =operator");
359 }
360
361 logln("Testing clone()");
362 UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
363 UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
364 if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
365 *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
366 *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
367 errln("FAIL: Error in clone");
368 }
369
370 logln("Testing hashcode");
371 if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
372 set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
373 set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
374 set1->hashCode() == set2->hashCode() || set1copy->hashCode() == set2->hashCode() ||
375 set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
376 errln("FAIL: Error in hashCode()");
377 }
378
379 delete set1;
380 delete set1copy;
381 delete set2;
382 delete set1clone;
383 delete set2clone;
384
385
386}
387void
388UnicodeSetTest::TestAddRemove(void) {
389 UnicodeSet set; // Construct empty set
390 doAssert(set.isEmpty() == TRUE, "set should be empty");
391 doAssert(set.size() == 0, "size should be 0");
374ca955
A
392 set.complement();
393 doAssert(set.size() == 0x110000, "size should be 0x110000");
394 set.clear();
b75a7d8f
A
395 set.add(0x0061, 0x007a);
396 expectPairs(set, "az");
397 doAssert(set.isEmpty() == FALSE, "set should not be empty");
398 doAssert(set.size() != 0, "size should not be equal to 0");
399 doAssert(set.size() == 26, "size should be equal to 26");
400 set.remove(0x006d, 0x0070);
401 expectPairs(set, "alqz");
402 doAssert(set.size() == 22, "size should be equal to 22");
403 set.remove(0x0065, 0x0067);
404 expectPairs(set, "adhlqz");
405 doAssert(set.size() == 19, "size should be equal to 19");
406 set.remove(0x0064, 0x0069);
407 expectPairs(set, "acjlqz");
408 doAssert(set.size() == 16, "size should be equal to 16");
409 set.remove(0x0063, 0x0072);
410 expectPairs(set, "absz");
411 doAssert(set.size() == 10, "size should be equal to 10");
412 set.add(0x0066, 0x0071);
413 expectPairs(set, "abfqsz");
414 doAssert(set.size() == 22, "size should be equal to 22");
415 set.remove(0x0061, 0x0067);
416 expectPairs(set, "hqsz");
417 set.remove(0x0061, 0x007a);
418 expectPairs(set, "");
419 doAssert(set.isEmpty() == TRUE, "set should be empty");
420 doAssert(set.size() == 0, "size should be 0");
421 set.add(0x0061);
422 doAssert(set.isEmpty() == FALSE, "set should not be empty");
423 doAssert(set.size() == 1, "size should not be equal to 1");
424 set.add(0x0062);
425 set.add(0x0063);
426 expectPairs(set, "ac");
427 doAssert(set.size() == 3, "size should not be equal to 3");
428 set.add(0x0070);
429 set.add(0x0071);
430 expectPairs(set, "acpq");
431 doAssert(set.size() == 5, "size should not be equal to 5");
432 set.clear();
433 expectPairs(set, "");
434 doAssert(set.isEmpty() == TRUE, "set should be empty");
435 doAssert(set.size() == 0, "size should be 0");
436
437 // Try removing an entire set from another set
438 expectPattern(set, "[c-x]", "cx");
439 UnicodeSet set2;
440 expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
441 set.removeAll(set2);
442 expectPairs(set, "deluxx");
443
444 // Try adding an entire set to another set
445 expectPattern(set, "[jackiemclean]", "aacceein");
446 expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
447 set.addAll(set2);
448 expectPairs(set, "aacehort");
449 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
450
451 // Try retaining an set of elements contained in another set (intersection)
452 UnicodeSet set3;
453 expectPattern(set3, "[a-c]", "ac");
454 doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
455 set3.remove(0x0062);
456 expectPairs(set3, "aacc");
457 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
458 set.retainAll(set3);
459 expectPairs(set, "aacc");
460 doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
461 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
462 set.clear();
463 doAssert(set.size() != set3.size(), "set.size() != set3.size()");
464
465 // Test commutativity
466 expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
467 expectPattern(set2, "[jackiemclean]", "aacceein");
468 set.addAll(set2);
469 expectPairs(set, "aacehort");
470 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
471
472
473
474
475}
476
477/**
478 * Make sure minimal representation is maintained.
479 */
480void UnicodeSetTest::TestMinimalRep() {
481 UErrorCode status = U_ZERO_ERROR;
482 // This is pretty thoroughly tested by checkCanonicalRep()
483 // run against the exhaustive operation results. Use the code
484 // here for debugging specific spot problems.
485
486 // 1 overlap against 2
487 UnicodeSet set("[h-km-q]", status);
488 if (U_FAILURE(status)) { errln("FAIL"); return; }
489 UnicodeSet set2("[i-o]", status);
490 if (U_FAILURE(status)) { errln("FAIL"); return; }
491 set.addAll(set2);
492 expectPairs(set, "hq");
493 // right
494 set.applyPattern("[a-m]", status);
495 if (U_FAILURE(status)) { errln("FAIL"); return; }
496 set2.applyPattern("[e-o]", status);
497 if (U_FAILURE(status)) { errln("FAIL"); return; }
498 set.addAll(set2);
499 expectPairs(set, "ao");
500 // left
501 set.applyPattern("[e-o]", status);
502 if (U_FAILURE(status)) { errln("FAIL"); return; }
503 set2.applyPattern("[a-m]", status);
504 if (U_FAILURE(status)) { errln("FAIL"); return; }
505 set.addAll(set2);
506 expectPairs(set, "ao");
507 // 1 overlap against 3
508 set.applyPattern("[a-eg-mo-w]", status);
509 if (U_FAILURE(status)) { errln("FAIL"); return; }
510 set2.applyPattern("[d-q]", status);
511 if (U_FAILURE(status)) { errln("FAIL"); return; }
512 set.addAll(set2);
513 expectPairs(set, "aw");
514}
515
516void UnicodeSetTest::TestAPI() {
517 UErrorCode status = U_ZERO_ERROR;
518 // default ct
519 UnicodeSet set;
520 if (!set.isEmpty() || set.getRangeCount() != 0) {
521 errln((UnicodeString)"FAIL, set should be empty but isn't: " +
522 set);
523 }
524
525 // clear(), isEmpty()
526 set.add(0x0061);
527 if (set.isEmpty()) {
528 errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
529 set);
530 }
531 set.clear();
532 if (!set.isEmpty()) {
533 errln((UnicodeString)"FAIL, set should be empty but isn't: " +
534 set);
535 }
536
537 // size()
538 set.clear();
539 if (set.size() != 0) {
540 errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
541 ": " + set);
542 }
543 set.add(0x0061);
544 if (set.size() != 1) {
545 errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
546 ": " + set);
547 }
548 set.add(0x0031, 0x0039);
549 if (set.size() != 10) {
550 errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
551 ": " + set);
552 }
553
554 // contains(first, last)
555 set.clear();
556 set.applyPattern("[A-Y 1-8 b-d l-y]", status);
557 if (U_FAILURE(status)) { errln("FAIL"); return; }
558 for (int32_t i = 0; i<set.getRangeCount(); ++i) {
559 UChar32 a = set.getRangeStart(i);
560 UChar32 b = set.getRangeEnd(i);
561 if (!set.contains(a, b)) {
562 errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
563 " but doesn't: " + set);
564 }
565 if (set.contains((UChar32)(a-1), b)) {
566 errln((UnicodeString)"FAIL, shouldn't contain " +
567 (unsigned short)(a-1) + '-' + (unsigned short)b +
568 " but does: " + set);
569 }
570 if (set.contains(a, (UChar32)(b+1))) {
571 errln((UnicodeString)"FAIL, shouldn't contain " +
572 (unsigned short)a + '-' + (unsigned short)(b+1) +
573 " but does: " + set);
574 }
575 }
576
577 // Ported InversionList test.
578 UnicodeSet a((UChar32)3,(UChar32)10);
579 UnicodeSet b((UChar32)7,(UChar32)15);
580 UnicodeSet c;
581
582 logln((UnicodeString)"a [3-10]: " + a);
583 logln((UnicodeString)"b [7-15]: " + b);
374ca955
A
584 c = a;
585 c.addAll(b);
b75a7d8f
A
586 UnicodeSet exp((UChar32)3,(UChar32)15);
587 if (c == exp) {
588 logln((UnicodeString)"c.set(a).add(b): " + c);
589 } else {
590 errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
591 }
592 c.complement();
593 exp.set((UChar32)0, (UChar32)2);
594 exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
595 if (c == exp) {
596 logln((UnicodeString)"c.complement(): " + c);
597 } else {
598 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
599 }
600 c.complement();
601 exp.set((UChar32)3, (UChar32)15);
602 if (c == exp) {
603 logln((UnicodeString)"c.complement(): " + c);
604 } else {
605 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
606 }
374ca955
A
607 c = a;
608 c.complementAll(b);
b75a7d8f
A
609 exp.set((UChar32)3,(UChar32)6);
610 exp.add((UChar32)11,(UChar32) 15);
611 if (c == exp) {
612 logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
613 } else {
614 errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
615 }
616
617 exp = c;
618 bitsToSet(setToBits(c), c);
619 if (c == exp) {
620 logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
621 } else {
622 errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
623 }
624
625 // Additional tests for coverage JB#2118
626 //UnicodeSet::complement(class UnicodeString const &)
627 //UnicodeSet::complementAll(class UnicodeString const &)
628 //UnicodeSet::containsNone(class UnicodeSet const &)
629 //UnicodeSet::containsNone(long,long)
630 //UnicodeSet::containsSome(class UnicodeSet const &)
631 //UnicodeSet::containsSome(long,long)
632 //UnicodeSet::removeAll(class UnicodeString const &)
633 //UnicodeSet::retain(long)
634 //UnicodeSet::retainAll(class UnicodeString const &)
635 //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
636 //UnicodeSetIterator::getString(void)
637 set.clear();
638 set.complement("ab");
639 exp.applyPattern("[{ab}]", status);
640 if (U_FAILURE(status)) { errln("FAIL"); return; }
641 if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
642
643 UnicodeSetIterator iset(set);
644 if (!iset.next() || !iset.isString()) {
645 errln("FAIL: UnicodeSetIterator::next/isString");
646 } else if (iset.getString() != "ab") {
647 errln("FAIL: UnicodeSetIterator::getString");
648 }
649
650 set.add((UChar32)0x61, (UChar32)0x7A);
651 set.complementAll("alan");
652 exp.applyPattern("[{ab}b-kmo-z]", status);
653 if (U_FAILURE(status)) { errln("FAIL"); return; }
654 if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
655
656 exp.applyPattern("[a-z]", status);
657 if (U_FAILURE(status)) { errln("FAIL"); return; }
658 if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
659 if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
660 exp.applyPattern("[aln]", status);
661 if (U_FAILURE(status)) { errln("FAIL"); return; }
662 if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
663 if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
664
665 if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
666 errln("FAIL: containsNone(UChar32, UChar32)");
667 }
668 if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
669 errln("FAIL: containsSome(UChar32, UChar32)");
670 }
671 if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
672 errln("FAIL: containsNone(UChar32, UChar32)");
673 }
674 if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
675 errln("FAIL: containsSome(UChar32, UChar32)");
676 }
677
678 set.removeAll("liu");
679 exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
680 if (U_FAILURE(status)) { errln("FAIL"); return; }
681 if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
682
683 set.retainAll("star");
684 exp.applyPattern("[rst]", status);
685 if (U_FAILURE(status)) { errln("FAIL"); return; }
686 if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
687
688 set.retain((UChar32)0x73);
689 exp.applyPattern("[s]", status);
690 if (U_FAILURE(status)) { errln("FAIL"); return; }
691 if (set != exp) { errln("FAIL: retain('s')"); return; }
692
693 uint16_t buf[32];
694 int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status);
695 if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
696 if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
697 errln("FAIL: serialize");
698 return;
699 }
729e4ab9
A
700
701 // Conversions to and from USet
702 UnicodeSet *uniset = &set;
703 USet *uset = uniset->toUSet();
704 TEST_ASSERT((void *)uset == (void *)uniset);
705 UnicodeSet *setx = UnicodeSet::fromUSet(uset);
706 TEST_ASSERT((void *)setx == (void *)uset);
707 const UnicodeSet *constSet = uniset;
708 const USet *constUSet = constSet->toUSet();
709 TEST_ASSERT((void *)constUSet == (void *)constSet);
710 const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
711 TEST_ASSERT((void *)constSetx == (void *)constUSet);
712
713 // span(UnicodeString) and spanBack(UnicodeString) convenience methods
714 UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
715 UnicodeSet ac(0x61, 0x63);
716 ac.remove(0x62).freeze();
717 if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
718 ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
719 ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
720 ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
721 ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
722 ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
723 ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
724 ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
725 ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
726 ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
727 ) {
728 errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
729 }
730 if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
731 ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
732 ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
733 ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
734 ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
735 ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
736 ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
737 ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
738 ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
739 ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
740 ) {
741 errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
742 }
b75a7d8f
A
743}
744
73c04bcf
A
745void UnicodeSetTest::TestIteration() {
746 UErrorCode ec = U_ZERO_ERROR;
747 int i = 0;
748 int outerLoop;
749
750 // 6 code points, 3 ranges, 2 strings, 8 total elements
751 // Iteration will access them in sorted order - a, b, c, y, z, U0001abcd, "str1", "str2"
46f4442e 752 UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
73c04bcf
A
753 TEST_ASSERT_SUCCESS(ec);
754 UnicodeSetIterator it(set);
755
756 for (outerLoop=0; outerLoop<3; outerLoop++) {
757 // Run the test multiple times, to check that iterator.reset() is working.
758 for (i=0; i<10; i++) {
759 UBool nextv = it.next();
760 UBool isString = it.isString();
761 int32_t codePoint = it.getCodepoint();
762 //int32_t codePointEnd = it.getCodepointEnd();
763 UnicodeString s = it.getString();
764 switch (i) {
765 case 0:
766 TEST_ASSERT(nextv == TRUE);
767 TEST_ASSERT(isString == FALSE);
768 TEST_ASSERT(codePoint==0x61);
769 TEST_ASSERT(s == "a");
770 break;
771 case 1:
772 TEST_ASSERT(nextv == TRUE);
773 TEST_ASSERT(isString == FALSE);
774 TEST_ASSERT(codePoint==0x62);
775 TEST_ASSERT(s == "b");
776 break;
777 case 2:
778 TEST_ASSERT(nextv == TRUE);
779 TEST_ASSERT(isString == FALSE);
780 TEST_ASSERT(codePoint==0x63);
781 TEST_ASSERT(s == "c");
782 break;
783 case 3:
784 TEST_ASSERT(nextv == TRUE);
785 TEST_ASSERT(isString == FALSE);
786 TEST_ASSERT(codePoint==0x79);
787 TEST_ASSERT(s == "y");
788 break;
789 case 4:
790 TEST_ASSERT(nextv == TRUE);
791 TEST_ASSERT(isString == FALSE);
792 TEST_ASSERT(codePoint==0x7a);
793 TEST_ASSERT(s == "z");
794 break;
795 case 5:
796 TEST_ASSERT(nextv == TRUE);
797 TEST_ASSERT(isString == FALSE);
798 TEST_ASSERT(codePoint==0x1abcd);
799 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
800 break;
801 case 6:
802 TEST_ASSERT(nextv == TRUE);
803 TEST_ASSERT(isString == TRUE);
804 TEST_ASSERT(s == "str1");
805 break;
806 case 7:
807 TEST_ASSERT(nextv == TRUE);
808 TEST_ASSERT(isString == TRUE);
809 TEST_ASSERT(s == "str2");
810 break;
811 case 8:
812 TEST_ASSERT(nextv == FALSE);
813 break;
814 case 9:
815 TEST_ASSERT(nextv == FALSE);
816 break;
817 }
818 }
819 it.reset(); // prepare to run the iteration again.
820 }
821}
822
823
824
825
b75a7d8f
A
826void UnicodeSetTest::TestStrings() {
827 UErrorCode ec = U_ZERO_ERROR;
828
829 UnicodeSet* testList[] = {
830 UnicodeSet::createFromAll("abc"),
831 new UnicodeSet("[a-c]", ec),
832
833 &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
834 new UnicodeSet("[{ll}{ch}a-z]", ec),
835
836 UnicodeSet::createFrom("ab}c"),
837 new UnicodeSet("[{ab\\}c}]", ec),
838
839 &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
840 new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
841
842 NULL
843 };
844
845 if (U_FAILURE(ec)) {
846 errln("FAIL: couldn't construct test sets");
847 }
848
849 for (int32_t i = 0; testList[i] != NULL; i+=2) {
850 if (U_SUCCESS(ec)) {
851 UnicodeString pat0, pat1;
852 testList[i]->toPattern(pat0, TRUE);
853 testList[i+1]->toPattern(pat1, TRUE);
854 if (*testList[i] == *testList[i+1]) {
855 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
856 } else {
857 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
858 }
859 }
860 delete testList[i];
861 delete testList[i+1];
862 }
863}
864
b75a7d8f
A
865/**
866 * Test the [:Latin:] syntax.
867 */
868void UnicodeSetTest::TestScriptSet() {
46f4442e 869 expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
b75a7d8f 870
46f4442e 871 expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
b75a7d8f
A
872
873 /* Jitterbug 1423 */
46f4442e 874 expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
b75a7d8f
A
875
876}
877
878/**
879 * Test the [:Latin:] syntax.
880 */
881void UnicodeSetTest::TestPropertySet() {
46f4442e 882 static const char* const DATA[] = {
b75a7d8f
A
883 // Pattern, Chars IN, Chars NOT in
884
885 "[:Latin:]",
886 "aA",
887 "\\u0391\\u03B1",
888
889 "[\\p{Greek}]",
890 "\\u0391\\u03B1",
891 "aA",
892
893 "\\P{ GENERAL Category = upper case letter }",
894 "abc",
895 "ABC",
896
729e4ab9 897#if !UCONFIG_NO_NORMALIZATION
b75a7d8f
A
898 // Combining class: @since ICU 2.2
899 // Check both symbolic and numeric
900 "\\p{ccc=Nukta}",
901 "\\u0ABC",
902 "abc",
903
904 "\\p{Canonical Combining Class = 11}",
905 "\\u05B1",
906 "\\u05B2",
907
908 "[:c c c = iota subscript :]",
909 "\\u0345",
910 "xyz",
729e4ab9 911#endif
b75a7d8f
A
912
913 // Bidi class: @since ICU 2.2
914 "\\p{bidiclass=lefttoright}",
915 "abc",
916 "\\u0671\\u0672",
917
918 // Binary properties: @since ICU 2.2
919 "\\p{ideographic}",
920 "\\u4E0A",
921 "x",
922
923 "[:math=false:]",
374ca955
A
924 "q)*(",
925 // weiv: )(and * were removed from math in Unicode 4.0.1
926 //"(*+)",
927 "+<>^",
b75a7d8f
A
928
929 // JB#1767 \N{}, \p{ASCII}
930 "[:Ascii:]",
931 "abc\\u0000\\u007F",
932 "\\u0080\\u4E00",
933
934 "[\\N{ latin small letter a }[:name= latin small letter z:]]",
935 "az",
936 "qrs",
937
938 // JB#2015
939 "[:any:]",
940 "a\\U0010FFFF",
941 "",
942
943 "[:nv=0.5:]",
944 "\\u00BD\\u0F2A",
945 "\\u00BC",
946
947 // JB#2653: Age
948 "[:Age=1.1:]",
949 "\\u03D6", // 1.1
950 "\\u03D8\\u03D9", // 3.2
951
952 "[:Age=3.1:]",
953 "\\u1800\\u3400\\U0002f800",
954 "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
955
956 // JB#2350: Case_Sensitive
957 "[:Case Sensitive:]",
958 "A\\u1FFC\\U00010410",
959 ";\\u00B4\\U00010500",
960
961 // JB#2832: C99-compatibility props
962 "[:blank:]",
963 " \\u0009",
964 "1-9A-Z",
965
966 "[:graph:]",
967 "19AZ",
968 " \\u0003\\u0007\\u0009\\u000A\\u000D",
969
970 "[:punct:]",
971 "!@#%&*()[]{}-_\\/;:,.?'\"",
972 "09azAZ",
973
974 "[:xdigit:]",
975 "09afAF",
976 "gG!",
977
978 // Regex compatibility test
979 "[-b]", // leading '-' is literal
980 "-b",
981 "ac",
982
983 "[^-b]", // leading '-' is literal
984 "ac",
985 "-b",
986
987 "[b-]", // trailing '-' is literal
988 "-b",
989 "ac",
990
991 "[^b-]", // trailing '-' is literal
992 "ac",
374ca955
A
993 "-b",
994
995 "[a-b-]", // trailing '-' is literal
996 "ab-",
997 "c=",
998
999 "[[a-q]&[p-z]-]", // trailing '-' is literal
1000 "pq-",
1001 "or=",
1002
1003 "[\\s|\\)|:|$|\\>]", // from regex tests
1004 "s|):$>",
1005 "abc",
1006
1007 "[\\uDC00cd]", // JB#2906: isolated trail at start
1008 "cd\\uDC00",
1009 "ab\\uD800\\U00010000",
1010
1011 "[ab\\uD800]", // JB#2906: isolated trail at start
1012 "ab\\uD800",
1013 "cd\\uDC00\\U00010000",
1014
1015 "[ab\\uD800cd]", // JB#2906: isolated lead in middle
1016 "abcd\\uD800",
1017 "ef\\uDC00\\U00010000",
1018
1019 "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
1020 "abcd\\uDC00",
1021 "ef\\uD800\\U00010000",
1022
729e4ab9 1023#if !UCONFIG_NO_NORMALIZATION
374ca955
A
1024 "[:^lccc=0:]", // Lead canonical class
1025 "\\u0300\\u0301",
1026 "abcd\\u00c0\\u00c5",
1027
1028 "[:^tccc=0:]", // Trail canonical class
1029 "\\u0300\\u0301\\u00c0\\u00c5",
1030 "abcd",
1031
1032 "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
1033 "\\u0300\\u0301\\u00c0\\u00c5",
1034 "abcd",
1035
1036 "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
1037 "",
1038 "abcd\\u0300\\u0301\\u00c0\\u00c5",
1039
1040 "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
1041 "\\u0F73\\u0F75\\u0F81",
1042 "abcd\\u0300\\u0301\\u00c0\\u00c5",
729e4ab9 1043#endif /* !UCONFIG_NO_NORMALIZATION */
374ca955 1044
73c04bcf
A
1045 "[:Assigned:]",
1046 "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
729e4ab9
A
1047 "\\u0888\\uFDD3\\uFFFE\\U00050005",
1048
1049 // Script_Extensions, new in Unicode 6.0
1050 "[:scx=Arab:]",
1051 "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
51004dcb 1052 "\\u061D\\uFDEF\\uFDFE",
729e4ab9
A
1053
1054 // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
1055 // so scx-sc is missing U+FDF2.
1056 "[[:Script_Extensions=Arabic:]-[:Arab:]]",
1057 "\\u0640\\u064B\\u0650\\u0655\\uFDFD",
1058 "\\uFDF2"
b75a7d8f
A
1059 };
1060
1061 static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);
1062
1063 for (int32_t i=0; i<DATA_LEN; i+=3) {
46f4442e 1064 expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
b75a7d8f
A
1065 CharsToUnicodeString(DATA[i+2]));
1066 }
1067}
1068
73c04bcf
A
1069/**
1070 * Test that Posix style character classes [:digit:], etc.
1071 * have the Unicode definitions from TR 18.
1072 */
1073void UnicodeSetTest::TestPosixClasses() {
1074 {
1075 UErrorCode status = U_ZERO_ERROR;
1076 UnicodeSet s1("[:alpha:]", status);
46f4442e 1077 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
73c04bcf
A
1078 TEST_ASSERT_SUCCESS(status);
1079 TEST_ASSERT(s1==s2);
1080 }
1081 {
1082 UErrorCode status = U_ZERO_ERROR;
1083 UnicodeSet s1("[:lower:]", status);
46f4442e 1084 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
73c04bcf
A
1085 TEST_ASSERT_SUCCESS(status);
1086 TEST_ASSERT(s1==s2);
1087 }
1088 {
1089 UErrorCode status = U_ZERO_ERROR;
1090 UnicodeSet s1("[:upper:]", status);
46f4442e 1091 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
73c04bcf
A
1092 TEST_ASSERT_SUCCESS(status);
1093 TEST_ASSERT(s1==s2);
1094 }
1095 {
1096 UErrorCode status = U_ZERO_ERROR;
1097 UnicodeSet s1("[:punct:]", status);
46f4442e 1098 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
73c04bcf
A
1099 TEST_ASSERT_SUCCESS(status);
1100 TEST_ASSERT(s1==s2);
1101 }
1102 {
1103 UErrorCode status = U_ZERO_ERROR;
1104 UnicodeSet s1("[:digit:]", status);
46f4442e 1105 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
73c04bcf
A
1106 TEST_ASSERT_SUCCESS(status);
1107 TEST_ASSERT(s1==s2);
1108 }
1109 {
1110 UErrorCode status = U_ZERO_ERROR;
1111 UnicodeSet s1("[:xdigit:]", status);
46f4442e 1112 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
73c04bcf
A
1113 TEST_ASSERT_SUCCESS(status);
1114 TEST_ASSERT(s1==s2);
1115 }
1116 {
1117 UErrorCode status = U_ZERO_ERROR;
1118 UnicodeSet s1("[:alnum:]", status);
46f4442e 1119 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
73c04bcf
A
1120 TEST_ASSERT_SUCCESS(status);
1121 TEST_ASSERT(s1==s2);
1122 }
1123 {
1124 UErrorCode status = U_ZERO_ERROR;
1125 UnicodeSet s1("[:space:]", status);
46f4442e 1126 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
73c04bcf
A
1127 TEST_ASSERT_SUCCESS(status);
1128 TEST_ASSERT(s1==s2);
1129 }
1130 {
1131 UErrorCode status = U_ZERO_ERROR;
1132 UnicodeSet s1("[:blank:]", status);
1133 TEST_ASSERT_SUCCESS(status);
46f4442e 1134 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
73c04bcf
A
1135 status);
1136 TEST_ASSERT_SUCCESS(status);
1137 TEST_ASSERT(s1==s2);
1138 }
1139 {
1140 UErrorCode status = U_ZERO_ERROR;
1141 UnicodeSet s1("[:cntrl:]", status);
1142 TEST_ASSERT_SUCCESS(status);
46f4442e 1143 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
73c04bcf
A
1144 TEST_ASSERT_SUCCESS(status);
1145 TEST_ASSERT(s1==s2);
1146 }
1147 {
1148 UErrorCode status = U_ZERO_ERROR;
1149 UnicodeSet s1("[:graph:]", status);
1150 TEST_ASSERT_SUCCESS(status);
46f4442e 1151 UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
73c04bcf
A
1152 TEST_ASSERT_SUCCESS(status);
1153 TEST_ASSERT(s1==s2);
1154 }
1155 {
1156 UErrorCode status = U_ZERO_ERROR;
1157 UnicodeSet s1("[:print:]", status);
1158 TEST_ASSERT_SUCCESS(status);
46f4442e 1159 UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
73c04bcf
A
1160 TEST_ASSERT_SUCCESS(status);
1161 TEST_ASSERT(s1==s2);
1162 }
1163}
b75a7d8f
A
1164/**
1165 * Test cloning of UnicodeSet. For C++, we test the copy constructor.
1166 */
1167void UnicodeSetTest::TestClone() {
1168 UErrorCode ec = U_ZERO_ERROR;
1169 UnicodeSet s("[abcxyz]", ec);
1170 UnicodeSet t(s);
1171 expectContainment(t, "abc", "def");
1172}
1173
1174/**
1175 * Test the indexOf() and charAt() methods.
1176 */
1177void UnicodeSetTest::TestIndexOf() {
1178 UErrorCode ec = U_ZERO_ERROR;
1179 UnicodeSet set("[a-cx-y3578]", ec);
1180 if (U_FAILURE(ec)) {
1181 errln("FAIL: UnicodeSet constructor");
1182 return;
1183 }
1184 for (int32_t i=0; i<set.size(); ++i) {
1185 UChar32 c = set.charAt(i);
1186 if (set.indexOf(c) != i) {
1187 errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1188 i, c, set.indexOf(c));
1189 }
1190 }
1191 UChar32 c = set.charAt(set.size());
1192 if (c != -1) {
1193 errln("FAIL: charAt(<out of range>) = %X", c);
1194 }
1195 int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
1196 if (j != -1) {
1197 errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1198 }
1199}
1200
1201/**
1202 * Test closure API.
1203 */
1204void UnicodeSetTest::TestCloseOver() {
1205 UErrorCode ec = U_ZERO_ERROR;
1206
73c04bcf 1207 char CASE[] = {(char)USET_CASE_INSENSITIVE};
374ca955 1208 char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
b75a7d8f
A
1209 const char* DATA[] = {
1210 // selector, input, output
1211 CASE,
1212 "[aq\\u00DF{Bc}{bC}{Fi}]",
46f4442e 1213 "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
b75a7d8f
A
1214
1215 CASE,
1216 "[\\u01F1]", // 'DZ'
1217 "[\\u01F1\\u01F2\\u01F3]",
1218
1219 CASE,
1220 "[\\u1FB4]",
1221 "[\\u1FB4{\\u03AC\\u03B9}]",
1222
1223 CASE,
1224 "[{F\\uFB01}]",
1225 "[\\uFB03{ffi}]",
1226
1227 CASE, // make sure binary search finds limits
1228 "[a\\uFF3A]",
1229 "[aA\\uFF3A\\uFF5A]",
1230
1231 CASE,
1232 "[a-z]","[A-Za-z\\u017F\\u212A]",
1233 CASE,
1234 "[abc]","[A-Ca-c]",
1235 CASE,
1236 "[ABC]","[A-Ca-c]",
1237
73c04bcf
A
1238 CASE, "[i]", "[iI]",
1239
1240 CASE, "[\\u0130]", "[\\u0130{i\\u0307}]", // dotted I
1241 CASE, "[{i\\u0307}]", "[\\u0130{i\\u0307}]", // i with dot
1242
1243 CASE, "[\\u0131]", "[\\u0131]", // dotless i
1244
1245 CASE, "[\\u0390]", "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1246
1247 CASE, "[\\u03c2]", "[\\u03a3\\u03c2\\u03c3]", // sigmas
1248
1249 CASE, "[\\u03f2]", "[\\u03f2\\u03f9]", // lunate sigmas
1250
1251 CASE, "[\\u03f7]", "[\\u03f7\\u03f8]",
1252
1253 CASE, "[\\u1fe3]", "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1254
1255 CASE, "[\\ufb05]", "[\\ufb05\\ufb06{st}]",
1256 CASE, "[{st}]", "[\\ufb05\\ufb06{st}]",
1257
1258 CASE, "[\\U0001044F]", "[\\U00010427\\U0001044F]",
1259
1260 CASE, "[{a\\u02BE}]", "[\\u1E9A{a\\u02BE}]", // first in sorted table
1261
1262 CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1263
729e4ab9 1264#if !UCONFIG_NO_FILE_IO
374ca955
A
1265 CASE_MAPPINGS,
1266 "[aq\\u00DF{Bc}{bC}{Fi}]",
1267 "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
729e4ab9 1268#endif
374ca955
A
1269
1270 CASE_MAPPINGS,
1271 "[\\u01F1]", // 'DZ'
1272 "[\\u01F1\\u01F2\\u01F3]",
1273
1274 CASE_MAPPINGS,
1275 "[a-z]",
1276 "[A-Za-z]",
1277
b75a7d8f
A
1278 NULL
1279 };
1280
1281 UnicodeSet s;
1282 UnicodeSet t;
73c04bcf 1283 UnicodeString buf;
b75a7d8f
A
1284 for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1285 int32_t selector = DATA[i][0];
46f4442e
A
1286 UnicodeString pat(DATA[i+1], -1, US_INV);
1287 UnicodeString exp(DATA[i+2], -1, US_INV);
b75a7d8f
A
1288 s.applyPattern(pat, ec);
1289 s.closeOver(selector);
1290 t.applyPattern(exp, ec);
1291 if (U_FAILURE(ec)) {
1292 errln("FAIL: applyPattern failed");
1293 continue;
1294 }
1295 if (s == t) {
1296 logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1297 } else {
729e4ab9 1298 dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
b75a7d8f
A
1299 s.toPattern(buf, TRUE) + ", expected " + exp);
1300 }
1301 }
1302
73c04bcf
A
1303#if 0
1304 /*
1305 * Unused test code.
1306 * This was used to compare the old implementation (using USET_CASE)
1307 * with the new one (using 0x100 temporarily)
1308 * while transitioning from hardcoded case closure tables in uniset.cpp
1309 * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1310 * and using ucase.c functions for closure.
1311 * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1312 *
1313 * Note: The old and new implementation never fully matched because
1314 * the old implementation turned out to not map U+0130 and U+0131 correctly
1315 * (dotted I and dotless i) and because the old implementation's data tables
1316 * were outdated compared to Unicode 4.0.1 at the time of the change to the
1317 * new implementation. (So sigmas and some other characters were not handled
1318 * according to the newer Unicode version.)
1319 */
1320 UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1321 UnicodeSetIterator si(sens);
1322 UnicodeString str, buf2;
1323 const UnicodeString *pStr;
1324 UChar32 c;
1325 while(si.next()) {
1326 if(!si.isString()) {
1327 c=si.getCodepoint();
1328 s.clear();
1329 s.add(c);
1330
1331 str.setTo(c);
1332 str.foldCase();
1333 sens2.add(str);
1334
1335 t=s;
1336 s.closeOver(USET_CASE);
1337 t.closeOver(0x100);
1338 if(s!=t) {
1339 errln("FAIL: closeOver(U+%04x) differs: ", c);
1340 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1341 }
1342 }
1343 }
1344 // remove all code points
1345 // should contain all full case folding mapping strings
1346 sens2.remove(0, 0x10ffff);
1347 si.reset(sens2);
1348 while(si.next()) {
1349 if(si.isString()) {
1350 pStr=&si.getString();
1351 s.clear();
1352 s.add(*pStr);
1353 t=s2=s;
1354 s.closeOver(USET_CASE);
1355 t.closeOver(0x100);
1356 if(s!=t) {
1357 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1358 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1359 }
1360 }
1361 }
1362#endif
1363
b75a7d8f 1364 // Test the pattern API
374ca955 1365 s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
b75a7d8f
A
1366 if (U_FAILURE(ec)) {
1367 errln("FAIL: applyPattern failed");
1368 } else {
1369 expectContainment(s, "abcABC", "defDEF");
1370 }
374ca955 1371 UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
b75a7d8f
A
1372 if (U_FAILURE(ec)) {
1373 errln("FAIL: constructor failed");
1374 } else {
1375 expectContainment(v, "defDEF", "abcABC");
1376 }
374ca955
A
1377 UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1378 if (U_FAILURE(ec)) {
1379 errln("FAIL: construct w/case mappings failed");
1380 } else {
1381 expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1382 }
b75a7d8f
A
1383}
1384
1385void UnicodeSetTest::TestEscapePattern() {
1386 const char pattern[] =
374ca955 1387 "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
b75a7d8f 1388 const char exp[] =
374ca955 1389 "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
b75a7d8f 1390 // We test this with two passes; in the second pass we
4388f060 1391 // pre-unescape the pattern. Since U+200E is Pattern_White_Space,
374ca955 1392 // this fails -- which is what we expect.
b75a7d8f
A
1393 for (int32_t pass=1; pass<=2; ++pass) {
1394 UErrorCode ec = U_ZERO_ERROR;
46f4442e 1395 UnicodeString pat(pattern, -1, US_INV);
b75a7d8f
A
1396 if (pass==2) {
1397 pat = pat.unescape();
1398 }
1399 // Pattern is only good for pass 1
1400 UBool isPatternValid = (pass==1);
1401
1402 UnicodeSet set(pat, ec);
1403 if (U_SUCCESS(ec) != isPatternValid){
1404 errln((UnicodeString)"FAIL: applyPattern(" +
1405 escape(pat) + ") => " +
1406 u_errorName(ec));
1407 continue;
1408 }
1409 if (U_FAILURE(ec)) {
1410 continue;
1411 }
1412 if (set.contains((UChar)0x0644)){
1413 errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1414 }
1415
1416 UnicodeString newpat;
1417 set.toPattern(newpat, TRUE);
46f4442e 1418 if (newpat == UnicodeString(exp, -1, US_INV)) {
b75a7d8f
A
1419 logln(escape(pat) + " => " + newpat);
1420 } else {
1421 errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1422 }
1423
1424 for (int32_t i=0; i<set.getRangeCount(); ++i) {
1425 UnicodeString str("Range ");
1426 str.append((UChar)(0x30 + i))
1427 .append(": ")
1428 .append((UChar32)set.getRangeStart(i))
1429 .append(" - ")
1430 .append((UChar32)set.getRangeEnd(i));
1431 str = str + " (" + set.getRangeStart(i) + " - " +
1432 set.getRangeEnd(i) + ")";
1433 if (set.getRangeStart(i) < 0) {
1434 errln((UnicodeString)"FAIL: " + escape(str));
1435 } else {
1436 logln(escape(str));
1437 }
1438 }
1439 }
1440}
1441
1442void UnicodeSetTest::expectRange(const UnicodeString& label,
1443 const UnicodeSet& set,
1444 UChar32 start, UChar32 end) {
1445 UnicodeSet exp(start, end);
1446 UnicodeString pat;
1447 if (set == exp) {
1448 logln(label + " => " + set.toPattern(pat, TRUE));
1449 } else {
1450 UnicodeString xpat;
1451 errln((UnicodeString)"FAIL: " + label + " => " +
1452 set.toPattern(pat, TRUE) +
1453 ", expected " + exp.toPattern(xpat, TRUE));
1454 }
1455}
1456
1457void UnicodeSetTest::TestInvalidCodePoint() {
1458
1459 const UChar32 DATA[] = {
1460 // Test range Expected range
1461 0, 0x10FFFF, 0, 0x10FFFF,
1462 (UChar32)-1, 8, 0, 8,
1463 8, 0x110000, 8, 0x10FFFF
1464 };
1465 const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]);
1466
1467 UnicodeString pat;
1468 int32_t i;
1469
1470 for (i=0; i<DATA_LENGTH; i+=4) {
1471 UChar32 start = DATA[i];
1472 UChar32 end = DATA[i+1];
1473 UChar32 xstart = DATA[i+2];
1474 UChar32 xend = DATA[i+3];
1475
1476 // Try various API using the test code points
1477
1478 UnicodeSet set(start, end);
1479 expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1480 set, xstart, xend);
1481
1482 set.clear();
1483 set.set(start, end);
1484 expectRange((UnicodeString)"set(" + start + "," + end + ")",
1485 set, xstart, xend);
1486
1487 UBool b = set.contains(start);
1488 b = set.contains(start, end);
1489 b = set.containsNone(start, end);
1490 b = set.containsSome(start, end);
1491
374ca955 1492 /*int32_t index = set.indexOf(start);*/
b75a7d8f
A
1493
1494 set.clear();
1495 set.add(start);
1496 set.add(start, end);
1497 expectRange((UnicodeString)"add(" + start + "," + end + ")",
1498 set, xstart, xend);
1499
1500 set.set(0, 0x10FFFF);
1501 set.retain(start, end);
1502 expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1503 set, xstart, xend);
1504 set.retain(start);
1505
1506 set.set(0, 0x10FFFF);
1507 set.remove(start);
1508 set.remove(start, end);
1509 set.complement();
1510 expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1511 set, xstart, xend);
1512
1513 set.set(0, 0x10FFFF);
1514 set.complement(start, end);
1515 set.complement();
1516 expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1517 set, xstart, xend);
1518 set.complement(start);
1519 }
1520
1521 const UChar32 DATA2[] = {
1522 0,
1523 0x10FFFF,
1524 (UChar32)-1,
1525 0x110000
1526 };
1527 const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]);
1528
1529 for (i=0; i<DATA2_LENGTH; ++i) {
1530 UChar32 c = DATA2[i], end = 0x10FFFF;
1531 UBool valid = (c >= 0 && c <= 0x10FFFF);
1532
1533 UnicodeSet set(0, 0x10FFFF);
1534
1535 // For single-codepoint contains, invalid codepoints are NOT contained
1536 UBool b = set.contains(c);
1537 if (b == valid) {
1538 logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1539 ") = " + b);
1540 } else {
1541 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1542 ") = " + b);
1543 }
1544
1545 // For codepoint range contains, containsNone, and containsSome,
1546 // invalid or empty (start > end) ranges have UNDEFINED behavior.
1547 b = set.contains(c, end);
1548 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1549 "," + end + ") = " + b);
1550
1551 b = set.containsNone(c, end);
1552 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1553 "," + end + ") = " + b);
1554
1555 b = set.containsSome(c, end);
1556 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1557 "," + end + ") = " + b);
1558
1559 int32_t index = set.indexOf(c);
1560 if ((index >= 0) == valid) {
1561 logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1562 ") = " + index);
1563 } else {
1564 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1565 ") = " + index);
1566 }
1567 }
1568}
1569
374ca955
A
1570// Used by TestSymbolTable
1571class TokenSymbolTable : public SymbolTable {
1572public:
1573 Hashtable contents;
1574
1575 TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
4388f060 1576 contents.setValueDeleter(uprv_deleteUObject);
374ca955
A
1577 }
1578
1579 ~TokenSymbolTable() {}
1580
1581 /**
1582 * (Non-SymbolTable API) Add the given variable and value to
1583 * the table. Variable should NOT contain leading '$'.
1584 */
1585 void add(const UnicodeString& var, const UnicodeString& value,
1586 UErrorCode& ec) {
1587 if (U_SUCCESS(ec)) {
1588 contents.put(var, new UnicodeString(value), ec);
1589 }
1590 }
1591
1592 /**
1593 * SymbolTable API
1594 */
1595 virtual const UnicodeString* lookup(const UnicodeString& s) const {
1596 return (const UnicodeString*) contents.get(s);
1597 }
1598
1599 /**
1600 * SymbolTable API
1601 */
1602 virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1603 return NULL;
1604 }
1605
1606 /**
1607 * SymbolTable API
1608 */
1609 virtual UnicodeString parseReference(const UnicodeString& text,
1610 ParsePosition& pos, int32_t limit) const {
1611 int32_t start = pos.getIndex();
1612 int32_t i = start;
1613 UnicodeString result;
1614 while (i < limit) {
1615 UChar c = text.charAt(i);
1616 if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1617 break;
1618 }
1619 ++i;
1620 }
1621 if (i == start) { // No valid name chars
1622 return result; // Indicate failure with empty string
1623 }
1624 pos.setIndex(i);
1625 text.extractBetween(start, i, result);
1626 return result;
1627 }
1628};
1629
1630void UnicodeSetTest::TestSymbolTable() {
1631 // Multiple test cases can be set up here. Each test case
1632 // is terminated by null:
1633 // var, value, var, value,..., input pat., exp. output pat., null
1634 const char* DATA[] = {
1635 "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1636 "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1637 "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1638 NULL
1639 };
1640
1641 for (int32_t i=0; DATA[i]!=NULL; ++i) {
1642 UErrorCode ec = U_ZERO_ERROR;
1643 TokenSymbolTable sym(ec);
1644 if (U_FAILURE(ec)) {
1645 errln("FAIL: couldn't construct TokenSymbolTable");
1646 continue;
1647 }
1648
1649 // Set up variables
1650 while (DATA[i+2] != NULL) {
46f4442e 1651 sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
374ca955
A
1652 if (U_FAILURE(ec)) {
1653 errln("FAIL: couldn't add to TokenSymbolTable");
1654 continue;
1655 }
1656 i += 2;
1657 }
1658
1659 // Input pattern and expected output pattern
46f4442e 1660 UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
374ca955
A
1661 i += 2;
1662
1663 ParsePosition pos(0);
1664 UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1665 if (U_FAILURE(ec)) {
1666 errln("FAIL: couldn't construct UnicodeSet");
1667 continue;
1668 }
1669
1670 // results
1671 if (pos.getIndex() != inpat.length()) {
1672 errln((UnicodeString)"Failed to read to end of string \""
1673 + inpat + "\": read to "
1674 + pos.getIndex() + ", length is "
1675 + inpat.length());
1676 }
1677
1678 UnicodeSet us2(exppat, ec);
1679 if (U_FAILURE(ec)) {
1680 errln("FAIL: couldn't construct expected UnicodeSet");
1681 continue;
1682 }
1683
1684 UnicodeString a, b;
1685 if (us != us2) {
1686 errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1687 ", expected " + us2.toPattern(b, TRUE));
1688 } else {
1689 logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1690 }
1691 }
1692}
1693
1694void UnicodeSetTest::TestSurrogate() {
1695 const char* DATA[] = {
1696 // These should all behave identically
1697 "[abc\\uD800\\uDC00]",
1698 // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1699 "[abc\\U00010000]",
1700 0
1701 };
1702 for (int i=0; DATA[i] != 0; ++i) {
1703 UErrorCode ec = U_ZERO_ERROR;
46f4442e 1704 logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
729e4ab9
A
1705 UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
1706 UnicodeSet set(str, ec);
374ca955
A
1707 if (U_FAILURE(ec)) {
1708 errln("FAIL: UnicodeSet constructor");
1709 continue;
1710 }
1711 expectContainment(set,
1712 CharsToUnicodeString("abc\\U00010000"),
1713 CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1714 if (set.size() != 4) {
46f4442e 1715 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
374ca955
A
1716 set.size() + ", expected 4");
1717 }
1718 }
1719}
1720
b75a7d8f
A
1721void UnicodeSetTest::TestExhaustive() {
1722 // exhaustive tests. Simulate UnicodeSets with integers.
1723 // That gives us very solid tests (except for large memory tests).
1724
1725 int32_t limit = 128;
1726
1727 UnicodeSet x, y, z, aa;
1728
1729 for (int32_t i = 0; i < limit; ++i) {
1730 bitsToSet(i, x);
1731 logln((UnicodeString)"Testing " + i + ", " + x);
1732 _testComplement(i, x, y);
1733
1734 // AS LONG AS WE ARE HERE, check roundtrip
1735 checkRoundTrip(bitsToSet(i, aa));
1736
1737 for (int32_t j = 0; j < limit; ++j) {
1738 _testAdd(i,j, x,y,z);
1739 _testXor(i,j, x,y,z);
1740 _testRetain(i,j, x,y,z);
1741 _testRemove(i,j, x,y,z);
1742 }
1743 }
1744}
1745
1746void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1747 bitsToSet(a, x);
1748 z = x;
1749 z.complement();
1750 int32_t c = setToBits(z);
1751 if (c != (~a)) {
1752 errln((UnicodeString)"FAILED: add: ~" + x + " != " + z);
1753 errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1754 }
1755 checkCanonicalRep(z, (UnicodeString)"complement " + a);
1756}
1757
1758void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1759 bitsToSet(a, x);
1760 bitsToSet(b, y);
1761 z = x;
1762 z.addAll(y);
1763 int32_t c = setToBits(z);
1764 if (c != (a | b)) {
1765 errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1766 errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1767 }
1768 checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1769}
1770
1771void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1772 bitsToSet(a, x);
1773 bitsToSet(b, y);
1774 z = x;
1775 z.retainAll(y);
1776 int32_t c = setToBits(z);
1777 if (c != (a & b)) {
1778 errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1779 errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1780 }
1781 checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1782}
1783
1784void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1785 bitsToSet(a, x);
1786 bitsToSet(b, y);
1787 z = x;
1788 z.removeAll(y);
1789 int32_t c = setToBits(z);
1790 if (c != (a &~ b)) {
1791 errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1792 errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1793 }
1794 checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1795}
1796
1797void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1798 bitsToSet(a, x);
1799 bitsToSet(b, y);
1800 z = x;
1801 z.complementAll(y);
1802 int32_t c = setToBits(z);
1803 if (c != (a ^ b)) {
1804 errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1805 errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1806 }
1807 checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1808}
1809
1810/**
1811 * Check that ranges are monotonically increasing and non-
1812 * overlapping.
1813 */
1814void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1815 int32_t n = set.getRangeCount();
1816 if (n < 0) {
1817 errln((UnicodeString)"FAIL result of " + msg +
1818 ": range count should be >= 0 but is " +
1819 n /*+ " for " + set.toPattern())*/);
1820 return;
1821 }
1822 UChar32 last = 0;
1823 for (int32_t i=0; i<n; ++i) {
1824 UChar32 start = set.getRangeStart(i);
1825 UChar32 end = set.getRangeEnd(i);
1826 if (start > end) {
1827 errln((UnicodeString)"FAIL result of " + msg +
1828 ": range " + (i+1) +
1829 " start > end: " + (int)start + ", " + (int)end +
1830 " for " + set);
1831 }
1832 if (i > 0 && start <= last) {
1833 errln((UnicodeString)"FAIL result of " + msg +
1834 ": range " + (i+1) +
1835 " overlaps previous range: " + (int)start + ", " + (int)end +
1836 " for " + set);
1837 }
1838 last = end;
1839 }
1840}
1841
1842/**
1843 * Convert a bitmask to a UnicodeSet.
1844 */
1845UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1846 result.clear();
1847 for (UChar32 i = 0; i < 32; ++i) {
1848 if ((a & (1<<i)) != 0) {
1849 result.add(i);
1850 }
1851 }
1852 return result;
1853}
1854
1855/**
1856 * Convert a UnicodeSet to a bitmask. Only the characters
1857 * U+0000 to U+0020 are represented in the bitmask.
1858 */
1859int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1860 int32_t result = 0;
1861 for (int32_t i = 0; i < 32; ++i) {
1862 if (x.contains((UChar32)i)) {
1863 result |= (1<<i);
1864 }
1865 }
1866 return result;
1867}
1868
1869/**
1870 * Return the representation of an inversion list based UnicodeSet
1871 * as a pairs list. Ranges are listed in ascending Unicode order.
1872 * For example, the set [a-zA-M3] is represented as "33AMaz".
1873 */
1874UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1875 UnicodeString pairs;
1876 for (int32_t i=0; i<set.getRangeCount(); ++i) {
1877 UChar32 start = set.getRangeStart(i);
1878 UChar32 end = set.getRangeEnd(i);
1879 if (end > 0xFFFF) {
1880 end = 0xFFFF;
1881 i = set.getRangeCount(); // Should be unnecessary
1882 }
1883 pairs.append((UChar)start).append((UChar)end);
1884 }
1885 return pairs;
1886}
1887
1888/**
1889 * Basic consistency check for a few items.
1890 * That the iterator works, and that we can create a pattern and
1891 * get the same thing back
1892 */
1893void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1894 UErrorCode ec = U_ZERO_ERROR;
1895
1896 UnicodeSet t(s);
1897 checkEqual(s, t, "copy ct");
1898
1899 t = s;
1900 checkEqual(s, t, "operator=");
1901
1902 copyWithIterator(t, s, FALSE);
1903 checkEqual(s, t, "iterator roundtrip");
1904
1905 copyWithIterator(t, s, TRUE); // try range
1906 checkEqual(s, t, "iterator roundtrip");
1907
1908 UnicodeString pat; s.toPattern(pat, FALSE);
1909 t.applyPattern(pat, ec);
1910 if (U_FAILURE(ec)) {
1911 errln("FAIL: applyPattern");
1912 return;
1913 } else {
1914 checkEqual(s, t, "toPattern(false)");
1915 }
1916
1917 s.toPattern(pat, TRUE);
1918 t.applyPattern(pat, ec);
1919 if (U_FAILURE(ec)) {
1920 errln("FAIL: applyPattern");
1921 return;
1922 } else {
1923 checkEqual(s, t, "toPattern(true)");
1924 }
1925}
1926
1927void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
1928 t.clear();
1929 UnicodeSetIterator it(s);
1930 if (withRange) {
1931 while (it.nextRange()) {
1932 if (it.isString()) {
1933 t.add(it.getString());
1934 } else {
1935 t.add(it.getCodepoint(), it.getCodepointEnd());
1936 }
1937 }
1938 } else {
1939 while (it.next()) {
1940 if (it.isString()) {
1941 t.add(it.getString());
1942 } else {
1943 t.add(it.getCodepoint());
1944 }
1945 }
1946 }
1947}
1948
1949UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
1950 UnicodeString source; s.toPattern(source, TRUE);
1951 UnicodeString result; t.toPattern(result, TRUE);
1952 if (s != t) {
1953 errln((UnicodeString)"FAIL: " + message
1954 + "; source = " + source
1955 + "; result = " + result
1956 );
1957 return FALSE;
1958 } else {
1959 logln((UnicodeString)"Ok: " + message
1960 + "; source = " + source
1961 + "; result = " + result
1962 );
1963 }
1964 return TRUE;
1965}
1966
1967void
1968UnicodeSetTest::expectContainment(const UnicodeString& pat,
1969 const UnicodeString& charsIn,
1970 const UnicodeString& charsOut) {
1971 UErrorCode ec = U_ZERO_ERROR;
1972 UnicodeSet set(pat, ec);
1973 if (U_FAILURE(ec)) {
729e4ab9 1974 dataerrln((UnicodeString)"FAIL: pattern \"" +
b75a7d8f
A
1975 pat + "\" => " + u_errorName(ec));
1976 return;
1977 }
1978 expectContainment(set, pat, charsIn, charsOut);
1979}
1980
1981void
1982UnicodeSetTest::expectContainment(const UnicodeSet& set,
1983 const UnicodeString& charsIn,
1984 const UnicodeString& charsOut) {
1985 UnicodeString pat;
1986 set.toPattern(pat);
1987 expectContainment(set, pat, charsIn, charsOut);
1988}
1989
1990void
1991UnicodeSetTest::expectContainment(const UnicodeSet& set,
1992 const UnicodeString& setName,
1993 const UnicodeString& charsIn,
1994 const UnicodeString& charsOut) {
1995 UnicodeString bad;
1996 UChar32 c;
1997 int32_t i;
1998
1999 for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
2000 c = charsIn.char32At(i);
2001 if (!set.contains(c)) {
2002 bad.append(c);
2003 }
2004 }
2005 if (bad.length() > 0) {
2006 errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
2007 ", expected containment of " + prettify(charsIn));
2008 } else {
2009 logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
2010 }
2011
2012 bad.truncate(0);
2013 for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
2014 c = charsOut.char32At(i);
2015 if (set.contains(c)) {
2016 bad.append(c);
2017 }
2018 }
2019 if (bad.length() > 0) {
2020 errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
2021 ", expected non-containment of " + prettify(charsOut));
2022 } else {
2023 logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
2024 }
2025}
2026
2027void
2028UnicodeSetTest::expectPattern(UnicodeSet& set,
2029 const UnicodeString& pattern,
2030 const UnicodeString& expectedPairs){
2031 UErrorCode status = U_ZERO_ERROR;
2032 set.applyPattern(pattern, status);
2033 if (U_FAILURE(status)) {
2034 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2035 "\") failed");
2036 return;
2037 } else {
2038 if (getPairs(set) != expectedPairs ) {
2039 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2040 "\") => pairs \"" +
2041 escape(getPairs(set)) + "\", expected \"" +
2042 escape(expectedPairs) + "\"");
2043 } else {
2044 logln(UnicodeString("Ok: applyPattern(\"") + pattern +
2045 "\") => pairs \"" +
2046 escape(getPairs(set)) + "\"");
2047 }
2048 }
2049 // the result of calling set.toPattern(), which is the string representation of
2050 // this set(set), is passed to a UnicodeSet constructor, and tested that it
2051 // will produce another set that is equal to this one.
2052 UnicodeString temppattern;
2053 set.toPattern(temppattern);
2054 UnicodeSet *tempset=new UnicodeSet(temppattern, status);
2055 if (U_FAILURE(status)) {
2056 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
2057 return;
2058 }
2059 if(*tempset != set || getPairs(*tempset) != getPairs(set)){
2060 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
2061 escape(getPairs(set)) + "\""));
2062 } else{
2063 logln(UnicodeString("Ok: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2064 }
2065
2066 delete tempset;
2067
2068}
2069
2070void
2071UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2072 if (getPairs(set) != expectedPairs) {
2073 errln(UnicodeString("FAIL: Expected pair list \"") +
2074 escape(expectedPairs) + "\", got \"" +
2075 escape(getPairs(set)) + "\"");
2076 }
2077}
2078
2079void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2080 const UnicodeString& expPat,
2081 const char** expStrings) {
2082 UnicodeString pat;
2083 set.toPattern(pat, TRUE);
2084 if (pat == expPat) {
2085 logln((UnicodeString)"Ok: toPattern() => \"" + pat + "\"");
2086 } else {
2087 errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2088 return;
2089 }
374ca955
A
2090 if (expStrings == NULL) {
2091 return;
2092 }
b75a7d8f
A
2093 UBool in = TRUE;
2094 for (int32_t i=0; expStrings[i] != NULL; ++i) {
2095 if (expStrings[i] == NOT) { // sic; pointer comparison
2096 in = FALSE;
2097 continue;
2098 }
2099 UnicodeString s = CharsToUnicodeString(expStrings[i]);
2100 UBool contained = set.contains(s);
2101 if (contained == in) {
2102 logln((UnicodeString)"Ok: " + expPat +
2103 (contained ? " contains {" : " does not contain {") +
2104 escape(expStrings[i]) + "}");
2105 } else {
2106 errln((UnicodeString)"FAIL: " + expPat +
2107 (contained ? " contains {" : " does not contain {") +
2108 escape(expStrings[i]) + "}");
2109 }
2110 }
2111}
2112
2113static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
2114
2115void
2116UnicodeSetTest::doAssert(UBool condition, const char *message)
2117{
2118 if (!condition) {
2119 errln(UnicodeString("ERROR : ") + message);
2120 }
2121}
2122
2123UnicodeString
2124UnicodeSetTest::escape(const UnicodeString& s) {
2125 UnicodeString buf;
2126 for (int32_t i=0; i<s.length(); )
2127 {
2128 UChar32 c = s.char32At(i);
2129 if (0x0020 <= c && c <= 0x007F) {
2130 buf += c;
2131 } else {
2132 if (c <= 0xFFFF) {
2133 buf += (UChar)0x5c; buf += (UChar)0x75;
2134 } else {
2135 buf += (UChar)0x5c; buf += (UChar)0x55;
2136 buf += toHexString((c & 0xF0000000) >> 28);
2137 buf += toHexString((c & 0x0F000000) >> 24);
2138 buf += toHexString((c & 0x00F00000) >> 20);
2139 buf += toHexString((c & 0x000F0000) >> 16);
2140 }
2141 buf += toHexString((c & 0xF000) >> 12);
2142 buf += toHexString((c & 0x0F00) >> 8);
2143 buf += toHexString((c & 0x00F0) >> 4);
2144 buf += toHexString(c & 0x000F);
2145 }
2146 i += U16_LENGTH(c);
2147 }
2148 return buf;
2149}
46f4442e
A
2150
2151void UnicodeSetTest::TestFreezable() {
2152 UErrorCode errorCode=U_ZERO_ERROR;
2153 UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
2154 UnicodeSet idSet(idPattern, errorCode);
2155 if(U_FAILURE(errorCode)) {
729e4ab9 2156 dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
46f4442e
A
2157 return;
2158 }
2159
2160 UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
2161 UnicodeSet wsSet(wsPattern, errorCode);
2162 if(U_FAILURE(errorCode)) {
729e4ab9 2163 dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
46f4442e
A
2164 return;
2165 }
2166
2167 idSet.add(idPattern);
2168 UnicodeSet frozen(idSet);
2169 frozen.freeze();
2170
2171 if(idSet.isFrozen() || !frozen.isFrozen()) {
2172 errln("FAIL: isFrozen() is wrong");
2173 }
2174 if(frozen!=idSet || !(frozen==idSet)) {
2175 errln("FAIL: a copy-constructed frozen set differs from its original");
2176 }
2177
2178 frozen=wsSet;
2179 if(frozen!=idSet || !(frozen==idSet)) {
2180 errln("FAIL: a frozen set was modified by operator=");
2181 }
2182
2183 UnicodeSet frozen2(frozen);
2184 if(frozen2!=frozen || frozen2!=idSet) {
2185 errln("FAIL: a copied frozen set differs from its frozen original");
2186 }
2187 if(!frozen2.isFrozen()) {
2188 errln("FAIL: copy-constructing a frozen set results in a thawed one");
2189 }
2190 UnicodeSet frozen3(5, 55); // Set to some values to really test assignment below, not copy construction.
2191 if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
2192 errln("FAIL: UnicodeSet(5, 55) failed");
2193 }
2194 frozen3=frozen;
2195 if(!frozen3.isFrozen()) {
2196 errln("FAIL: copying a frozen set results in a thawed one");
2197 }
2198
2199 UnicodeSet *cloned=(UnicodeSet *)frozen.clone();
2200 if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
2201 errln("FAIL: clone() failed");
2202 }
2203 cloned->add(0xd802, 0xd805);
2204 if(cloned->containsSome(0xd802, 0xd805)) {
2205 errln("FAIL: unable to modify clone");
2206 }
2207 delete cloned;
2208
2209 UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed();
2210 if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
2211 errln("FAIL: cloneAsThawed() failed");
2212 }
2213 thawed->add(0xd802, 0xd805);
2214 if(!thawed->contains(0xd802, 0xd805)) {
2215 errln("FAIL: unable to modify thawed clone");
2216 }
2217 delete thawed;
2218
2219 frozen.set(5, 55);
2220 if(frozen!=idSet || !(frozen==idSet)) {
2221 errln("FAIL: UnicodeSet::set() modified a frozen set");
2222 }
2223
2224 frozen.clear();
2225 if(frozen!=idSet || !(frozen==idSet)) {
2226 errln("FAIL: UnicodeSet::clear() modified a frozen set");
2227 }
2228
2229 frozen.closeOver(USET_CASE_INSENSITIVE);
2230 if(frozen!=idSet || !(frozen==idSet)) {
2231 errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
2232 }
2233
2234 frozen.compact();
2235 if(frozen!=idSet || !(frozen==idSet)) {
2236 errln("FAIL: UnicodeSet::compact() modified a frozen set");
2237 }
2238
2239 ParsePosition pos;
2240 frozen.
2241 applyPattern(wsPattern, errorCode).
2242 applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
2243 applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
2244 applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
2245 applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
2246 if(frozen!=idSet || !(frozen==idSet)) {
2247 errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
2248 }
2249
2250 frozen.
2251 add(0xd800).
2252 add(0xd802, 0xd805).
2253 add(wsPattern).
2254 addAll(idPattern).
2255 addAll(wsSet);
2256 if(frozen!=idSet || !(frozen==idSet)) {
2257 errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
2258 }
2259
2260 frozen.
2261 retain(0x62).
2262 retain(0x64, 0x69).
2263 retainAll(wsPattern).
2264 retainAll(wsSet);
2265 if(frozen!=idSet || !(frozen==idSet)) {
2266 errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
2267 }
2268
2269 frozen.
2270 remove(0x62).
2271 remove(0x64, 0x69).
2272 remove(idPattern).
2273 removeAll(idPattern).
2274 removeAll(idSet);
2275 if(frozen!=idSet || !(frozen==idSet)) {
2276 errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
2277 }
2278
2279 frozen.
2280 complement().
2281 complement(0x62).
2282 complement(0x64, 0x69).
2283 complement(idPattern).
2284 complementAll(idPattern).
2285 complementAll(idSet);
2286 if(frozen!=idSet || !(frozen==idSet)) {
2287 errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
2288 }
2289}
2290
2291// Test span() etc. -------------------------------------------------------- ***
2292
2293// Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2294static int32_t
2295appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
2296 UErrorCode errorCode=U_ZERO_ERROR;
2297 int32_t length8=0;
2298 u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2299 if(U_SUCCESS(errorCode)) {
2300 return length8;
2301 } else {
2302 // The string contains an unpaired surrogate.
2303 // Ignore this string.
2304 return 0;
2305 }
2306}
2307
2308class UnicodeSetWithStringsIterator;
2309
2310// Make the strings in a UnicodeSet easily accessible.
2311class UnicodeSetWithStrings {
2312public:
2313 UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2314 set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
2315 int32_t size=set.size();
2316 if(size>0 && set.charAt(size-1)<0) {
2317 // If a set's last element is not a code point, then it must contain strings.
2318 // Iterate over the set, skip all code point ranges, and cache the strings.
2319 // Convert them to UTF-8 for spanUTF8().
2320 UnicodeSetIterator iter(set);
2321 const UnicodeString *s;
2322 char *s8=utf8;
2323 int32_t length8, utf8Count=0;
2324 while(iter.nextRange() && stringsLength<LENGTHOF(strings)) {
2325 if(iter.isString()) {
2326 // Store the pointer to the set's string element
2327 // which we happen to know is a stable pointer.
2328 strings[stringsLength]=s=&iter.getString();
2329 utf8Count+=
2330 utf8Lengths[stringsLength]=length8=
2331 appendUTF8(s->getBuffer(), s->length(),
2332 s8, (int32_t)(sizeof(utf8)-utf8Count));
2333 if(length8==0) {
2334 hasSurrogates=TRUE; // Contains unpaired surrogates.
2335 }
2336 s8+=length8;
2337 ++stringsLength;
2338 }
2339 }
2340 }
2341 }
2342
2343 const UnicodeSet &getSet() const {
2344 return set;
2345 }
2346
2347 UBool hasStrings() const {
2348 return (UBool)(stringsLength>0);
2349 }
2350
2351 UBool hasStringsWithSurrogates() const {
2352 return hasSurrogates;
2353 }
2354
2355private:
2356 friend class UnicodeSetWithStringsIterator;
2357
2358 const UnicodeSet &set;
2359
2360 const UnicodeString *strings[20];
2361 int32_t stringsLength;
2362 UBool hasSurrogates;
2363
2364 char utf8[1024];
2365 int32_t utf8Lengths[20];
2366
2367 int32_t nextStringIndex;
2368 int32_t nextUTF8Start;
2369};
2370
2371class UnicodeSetWithStringsIterator {
2372public:
2373 UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2374 fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2375 }
2376
2377 void reset() {
2378 nextStringIndex=nextUTF8Start=0;
2379 }
2380
2381 const UnicodeString *nextString() {
2382 if(nextStringIndex<fSet.stringsLength) {
2383 return fSet.strings[nextStringIndex++];
2384 } else {
2385 return NULL;
2386 }
2387 }
2388
2389 // Do not mix with calls to nextString().
2390 const char *nextUTF8(int32_t &length) {
2391 if(nextStringIndex<fSet.stringsLength) {
2392 const char *s8=fSet.utf8+nextUTF8Start;
2393 nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2394 return s8;
2395 } else {
2396 length=0;
2397 return NULL;
2398 }
2399 }
2400
2401private:
2402 const UnicodeSetWithStrings &fSet;
2403 int32_t nextStringIndex;
2404 int32_t nextUTF8Start;
2405};
2406
2407// Compare 16-bit Unicode strings (which may be malformed UTF-16)
2408// at code point boundaries.
2409// That is, each edge of a match must not be in the middle of a surrogate pair.
2410static inline UBool
2411matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
2412 s+=start;
2413 limit-=start;
2414 int32_t length=t.length();
2415 return 0==t.compare(s, length) &&
2416 !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2417 !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2418}
2419
// Implement span() with contains() for comparison.
//
// Reference implementation used to cross-check UnicodeSet::span():
// returns the index in s[0..length) at which the initial span ends, computed
// only with contains() on code points and explicit matching of the set's
// cached strings.
//   set            the set plus its cached strings
//   s, length      UTF-16 text (may be malformed); length must be >=0 here
//   spanCondition  USET_SPAN_NOT_CONTAINED, USET_SPAN_CONTAINED or USET_SPAN_SIMPLE
static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                 USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        // No strings: only single code points matter, and CONTAINED and
        // SIMPLE behave the same.
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        // prev trails start by one code point so that the code point which
        // ends the span is not included in the result.
        while((prev=start)<length) {
            U16_NEXT(s, start, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        // Span while neither the current code point is in the set
        // nor any set string matches at the current position.
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // maxSpanLimit: the farthest span end found via recursive calls.
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+str->length();
                    if(matchLimit==length) {
                        // A match reaching the end of the text cannot be beaten.
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
                                                                 USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // The result is the larger of the iterated span end and the best
        // span end found by recursion.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}
2518
// Backward counterpart of containsSpanUTF16(): reference implementation for
// cross-checking UnicodeSet::spanBack(). Works from the end of s[0..length)
// toward the start and returns the index at which the trailing span begins,
// using only contains() on code points and explicit matching of the set's
// cached strings. Returns 0 for an empty text.
static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                     USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        // No strings: only single code points matter, and CONTAINED and
        // SIMPLE behave the same.
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t prev=length;
        // length is moved backward by U16_PREV; prev keeps the position
        // before the step so that the code point ending the span is excluded.
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        // Span backward while neither the preceding code point is in the set
        // nor any set string ends at the current position.
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // length0: the original text length, needed by matches16CPB() for its
        // right-edge check because length itself is modified below.
        int32_t prev=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // minSpanStart: the earliest span start found via recursive calls.
        int32_t prev=length, minSpanStart=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-str->length();
                    if(matchStart==0) {
                        // A match reaching the start of the text cannot be beaten.
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
                                                                    USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        // The result is the smaller of the iterated span start and the best
        // span start found by recursion.
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}
2617
// UTF-8 counterpart of containsSpanUTF16(): reference implementation for
// cross-checking UnicodeSet::spanUTF8(). Returns the byte index in
// s[0..length) at which the initial span ends. Code points are decoded with
// U8_NEXT_OR_FFFD; set strings are matched by comparing their cached UTF-8
// bytes with memcmp(). Cached strings with a UTF-8 length of 0 are skipped.
static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        // No strings: only single code points matter, and CONTAINED and
        // SIMPLE behave the same.
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        // prev trails start by one code point so that the code point which
        // ends the span is not included in the result.
        while((prev=start)<length) {
            U8_NEXT_OR_FFFD(s, start, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        // Span while neither the current code point is in the set
        // nor any set string matches at the current position.
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U8_NEXT_OR_FFFD(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // maxSpanLimit: the farthest span end found via recursive calls.
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U8_NEXT_OR_FFFD(s, next, length, c);
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+length8;
                    if(matchLimit==length) {
                        // A match reaching the end of the text cannot be beaten.
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
                                                                USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // The result is the larger of the iterated span end and the best
        // span end found by recursion.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}
2717
// UTF-8 counterpart of containsSpanBackUTF16(): reference implementation for
// cross-checking UnicodeSet::spanBackUTF8(). Works from the end of
// s[0..length) toward the start and returns the byte index at which the
// trailing span begins. Code points are decoded with U8_PREV_OR_FFFD; set
// strings are matched by comparing their cached UTF-8 bytes with memcmp().
// Cached strings with a UTF-8 length of 0 are skipped.
// Returns 0 for an empty text.
static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                    USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        // No strings: only single code points matter, and CONTAINED and
        // SIMPLE behave the same.
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t prev=length;
        // length is moved backward by U8_PREV_OR_FFFD; prev keeps the position
        // before the step so that the code point ending the span is excluded.
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        // Span backward while neither the preceding code point is in the set
        // nor any set string ends at the current position.
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // minSpanStart: the earliest span start found via recursive calls.
        int32_t prev=length, minSpanStart=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-length8;
                    if(matchStart==0) {
                        // A match reaching the start of the text cannot be beaten.
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
                                                                   USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        // The result is the smaller of the iterated span start and the best
        // span start found by recursion.
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}
2818
// spans to be performed and compared
//
// Bit flags combined into a "whichSpans" mask. Each group has two single-bit
// options followed by a mask that is the bitwise OR of the two.
enum {
    // Text encoding.
    SPAN_UTF16          =1,
    SPAN_UTF8           =2,
    SPAN_UTFS           =3,

    // Use the original set and/or its complement.
    SPAN_SET            =4,
    SPAN_COMPLEMENT     =8,
    SPAN_POLARITY       =0xc,

    // Direction: forward and/or backward spans.
    SPAN_FWD            =0x10,
    SPAN_BACK           =0x20,
    SPAN_DIRS           =0x30,

    // Which "contained" span condition to use.
    SPAN_CONTAINED      =0x100,
    SPAN_SIMPLE         =0x200,
    SPAN_CONDITION      =0x300,

    // All of the options above.
    SPAN_ALL            =0x33f
};
2839
2840static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
2841 return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
2842}
2843
2844static inline int32_t slen(const void *s, UBool isUTF16) {
2845 return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
2846}
2847
2848/*
2849 * Count spans on a string with the method according to type and set the span limits.
2850 * The set may be the complement of the original.
2851 * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
2852 * according to the expected number of spans.
2853 * Sets typeName to an empty string if there is no such type.
2854 * Returns -1 if the span option is filtered out.
2855 */
2856static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
2857 const void *s, int32_t length, UBool isUTF16,
2858 uint32_t whichSpans,
2859 int type, const char *&typeName,
2860 int32_t limits[], int32_t limitsCapacity,
2861 int32_t expectCount) {
2862 const UnicodeSet &realSet(set.getSet());
2863 int32_t start, count;
2864 USetSpanCondition spanCondition, firstSpanCondition, contained;
2865 UBool isForward;
2866
2867 if(type<0 || 7<type) {
2868 typeName="";
2869 return 0;
2870 }
2871
2872 static const char *const typeNames16[]={
2873 "contains", "contains(LM)",
2874 "span", "span(LM)",
2875 "containsBack", "containsBack(LM)",
2876 "spanBack", "spanBack(LM)"
2877 };
2878
2879 static const char *const typeNames8[]={
2880 "containsUTF8", "containsUTF8(LM)",
2881 "spanUTF8", "spanUTF8(LM)",
2882 "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
2883 "spanBackUTF8", "spanBackUTF8(LM)"
2884 };
2885
2886 typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
2887
2888 // filter span options
2889 if(type<=3) {
2890 // span forward
2891 if((whichSpans&SPAN_FWD)==0) {
2892 return -1;
2893 }
2894 isForward=TRUE;
2895 } else {
2896 // span backward
2897 if((whichSpans&SPAN_BACK)==0) {
2898 return -1;
2899 }
2900 isForward=FALSE;
2901 }
2902 if((type&1)==0) {
2903 // use USET_SPAN_CONTAINED
2904 if((whichSpans&SPAN_CONTAINED)==0) {
2905 return -1;
2906 }
2907 contained=USET_SPAN_CONTAINED;
2908 } else {
2909 // use USET_SPAN_SIMPLE
2910 if((whichSpans&SPAN_SIMPLE)==0) {
2911 return -1;
2912 }
2913 contained=USET_SPAN_SIMPLE;
2914 }
2915
2916 // Default first span condition for going forward with an uncomplemented set.
2917 spanCondition=USET_SPAN_NOT_CONTAINED;
2918 if(isComplement) {
2919 spanCondition=invertSpanCondition(spanCondition, contained);
2920 }
2921
2922 // First span condition for span(), used to terminate the spanBack() iteration.
2923 firstSpanCondition=spanCondition;
2924
2925 // spanBack(): Its initial span condition is span()'s last span condition,
2926 // which is the opposite of span()'s first span condition
2927 // if we expect an even number of spans.
2928 // (The loop inverts spanCondition (expectCount-1) times
2929 // before the expectCount'th span() call.)
2930 // If we do not compare forward and backward directions, then we do not have an
2931 // expectCount and just start with firstSpanCondition.
2932 if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
2933 spanCondition=invertSpanCondition(spanCondition, contained);
2934 }
2935
2936 count=0;
2937 switch(type) {
2938 case 0:
2939 case 1:
2940 start=0;
2941 if(length<0) {
2942 length=slen(s, isUTF16);
2943 }
2944 for(;;) {
2945 start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
2946 containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
2947 if(count<limitsCapacity) {
2948 limits[count]=start;
2949 }
2950 ++count;
2951 if(start>=length) {
2952 break;
2953 }
2954 spanCondition=invertSpanCondition(spanCondition, contained);
2955 }
2956 break;
2957 case 2:
2958 case 3:
2959 start=0;
2960 for(;;) {
2961 start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
2962 realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
2963 if(count<limitsCapacity) {
2964 limits[count]=start;
2965 }
2966 ++count;
2967 if(length>=0 ? start>=length :
2968 isUTF16 ? ((const UChar *)s)[start]==0 :
2969 ((const char *)s)[start]==0
2970 ) {
2971 break;
2972 }
2973 spanCondition=invertSpanCondition(spanCondition, contained);
2974 }
2975 break;
2976 case 4:
2977 case 5:
2978 if(length<0) {
2979 length=slen(s, isUTF16);
2980 }
2981 for(;;) {
2982 ++count;
2983 if(count<=limitsCapacity) {
2984 limits[limitsCapacity-count]=length;
2985 }
2986 length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
2987 containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
2988 if(length==0 && spanCondition==firstSpanCondition) {
2989 break;
2990 }
2991 spanCondition=invertSpanCondition(spanCondition, contained);
2992 }
2993 if(count<limitsCapacity) {
2994 memmove(limits, limits+(limitsCapacity-count), count*4);
2995 }
2996 break;
2997 case 6:
2998 case 7:
2999 for(;;) {
3000 ++count;
3001 if(count<=limitsCapacity) {
3002 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
3003 }
3004 // Note: Length<0 is tested only for the first spanBack().
3005 // If we wanted to keep length<0 for all spanBack()s, we would have to
3006 // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
3007 length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
3008 realSet.spanBackUTF8((const char *)s, length, spanCondition);
3009 if(length==0 && spanCondition==firstSpanCondition) {
3010 break;
3011 }
3012 spanCondition=invertSpanCondition(spanCondition, contained);
3013 }
3014 if(count<limitsCapacity) {
3015 memmove(limits, limits+(limitsCapacity-count), count*4);
3016 }
3017 break;
3018 default:
3019 typeName="";
3020 return -1;
3021 }
3022
3023 return count;
3024}
3025
// sets to be tested; odd index=isComplement
//
// Indexes into the array of sets handed to testSpan(); the *_NOT entries are
// the complemented sets. SET_COUNT is the number of sets.
enum {
    SLOW,
    SLOW_NOT,
    FAST,
    FAST_NOT,
    SET_COUNT
};
3034
// Display names for the sets, indexed by SLOW..FAST_NOT; used in failure messages.
static const char *const setNames[SET_COUNT]={
    "slow",
    "slow.not",
    "fast",
    "fast.not"
};
3041
3042/*
3043 * Verify that we get the same results whether we look at text with contains(),
3044 * span() or spanBack(), using unfrozen or frozen versions of the set,
3045 * and using the set or its complement (switching the spanConditions accordingly).
3046 * The latter verifies that
3047 * set.span(spanCondition) == set.complement().span(!spanCondition).
3048 *
3049 * The expectLimits[] are either provided by the caller (with expectCount>=0)
3050 * or returned to the caller (with an input expectCount<0).
3051 */
3052void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3053 const void *s, int32_t length, UBool isUTF16,
3054 uint32_t whichSpans,
3055 int32_t expectLimits[], int32_t &expectCount,
3056 const char *testName, int32_t index) {
3057 int32_t limits[500];
3058 int32_t limitsCount;
3059 int i, j;
3060
3061 const char *typeName;
3062 int type;
3063
3064 for(i=0; i<SET_COUNT; ++i) {
3065 if((i&1)==0) {
3066 // Even-numbered sets are original, uncomplemented sets.
3067 if((whichSpans&SPAN_SET)==0) {
3068 continue;
3069 }
3070 } else {
3071 // Odd-numbered sets are complemented.
3072 if((whichSpans&SPAN_COMPLEMENT)==0) {
3073 continue;
3074 }
3075 }
3076 for(type=0;; ++type) {
3077 limitsCount=getSpans(*sets[i], (UBool)(i&1),
3078 s, length, isUTF16,
3079 whichSpans,
3080 type, typeName,
3081 limits, LENGTHOF(limits), expectCount);
3082 if(typeName[0]==0) {
3083 break; // All types tried.
3084 }
3085 if(limitsCount<0) {
3086 continue; // Span option filtered out.
3087 }
3088 if(expectCount<0) {
3089 expectCount=limitsCount;
3090 if(limitsCount>LENGTHOF(limits)) {
3091 errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3092 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)LENGTHOF(limits));
3093 return;
3094 }
3095 memcpy(expectLimits, limits, limitsCount*4);
3096 } else if(limitsCount!=expectCount) {
3097 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3098 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
3099 } else {
3100 for(j=0; j<limitsCount; ++j) {
3101 if(limits[j]!=expectLimits[j]) {
3102 errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3103 testName, (long)index, setNames[i], typeName, (long)limitsCount,
3104 j, (long)limits[j], (long)expectLimits[j]);
3105 break;
3106 }
3107 }
3108 }
3109 }
3110 }
3111
3112 // Compare span() with containsAll()/containsNone(),
3113 // but only if we have expectLimits[] from the uncomplemented set.
3114 if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
3115 const UChar *s16=(const UChar *)s;
3116 UnicodeString string;
3117 int32_t prev=0, limit, length;
3118 for(i=0; i<expectCount; ++i) {
3119 limit=expectLimits[i];
3120 length=limit-prev;
3121 if(length>0) {
3122 string.setTo(FALSE, s16+prev, length); // read-only alias
3123 if(i&1) {
3124 if(!sets[SLOW]->getSet().containsAll(string)) {
3125 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3126 testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3127 return;
3128 }
3129 if(!sets[FAST]->getSet().containsAll(string)) {
3130 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3131 testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3132 return;
3133 }
3134 } else {
3135 if(!sets[SLOW]->getSet().containsNone(string)) {
3136 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3137 testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3138 return;
3139 }
3140 if(!sets[FAST]->getSet().containsNone(string)) {
3141 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3142 testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3143 return;
3144 }
3145 }
3146 }
3147 prev=limit;
3148 }
3149 }
3150}
3151
3152// Specifically test either UTF-16 or UTF-8.
3153void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3154 const void *s, int32_t length, UBool isUTF16,
3155 uint32_t whichSpans,
3156 const char *testName, int32_t index) {
3157 int32_t expectLimits[500];
3158 int32_t expectCount=-1;
3159 testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3160}
3161
3162UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
3163 UChar c, c2;
3164
3165 if(length>=0) {
3166 while(length>0) {
3167 c=*s++;
3168 --length;
3169 if(0xd800<=c && c<0xe000) {
3170 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3171 return TRUE;
3172 }
3173 --length;
3174 }
3175 }
3176 } else {
3177 while((c=*s++)!=0) {
3178 if(0xd800<=c && c<0xe000) {
3179 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3180 return TRUE;
3181 }
3182 }
3183 }
3184 }
3185 return FALSE;
3186}
3187
3188// Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3189// unless either UTF is turned off in whichSpans.
3190// Testing UTF-16 and UTF-8 together requires that surrogate code points
3191// have the same contains(c) value as U+FFFD.
3192void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3193 const UChar *s16, int32_t length16,
3194 uint32_t whichSpans,
3195 const char *testName, int32_t index) {
3196 int32_t expectLimits[500];
3197 int32_t expectCount;
3198
3199 expectCount=-1; // Get expectLimits[] from testSpan().
3200
3201 if((whichSpans&SPAN_UTF16)!=0) {
3202 testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
3203 }
3204 if((whichSpans&SPAN_UTF8)==0) {
3205 return;
3206 }
3207
3208 // Convert s16[] and expectLimits[] to UTF-8.
3209 uint8_t s8[3000];
3210 int32_t offsets[3000];
3211
3212 const UChar *s16Limit=s16+length16;
3213 char *t=(char *)s8;
3214 char *tLimit=t+sizeof(s8);
3215 int32_t *o=offsets;
3216 UErrorCode errorCode=U_ZERO_ERROR;
3217
3218 // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3219 ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
3220 if(U_FAILURE(errorCode)) {
3221 errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3222 testName, (long)index, u_errorName(errorCode));
3223 ucnv_resetFromUnicode(utf8Cnv);
3224 return;
3225 }
3226 int32_t length8=(int32_t)(t-(char *)s8);
3227
3228 // Convert expectLimits[].
3229 int32_t i, j, expect;
3230 for(i=j=0; i<expectCount; ++i) {
3231 expect=expectLimits[i];
3232 if(expect==length16) {
3233 expectLimits[i]=length8;
3234 } else {
3235 while(offsets[j]<expect) {
3236 ++j;
3237 }
3238 expectLimits[i]=j;
3239 }
3240 }
3241
3242 testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
3243}
3244
3245static UChar32 nextCodePoint(UChar32 c) {
3246 // Skip some large and boring ranges.
3247 switch(c) {
3248 case 0x3441:
3249 return 0x4d7f;
3250 case 0x5100:
3251 return 0x9f00;
3252 case 0xb040:
3253 return 0xd780;
3254 case 0xe041:
3255 return 0xf8fe;
3256 case 0x10100:
3257 return 0x20000;
3258 case 0x20041:
3259 return 0xe0000;
3260 case 0xe0101:
3261 return 0x10fffd;
3262 default:
3263 return c+1;
3264 }
3265}
3266
3267// Verify that all implementations represent the same set.
3268void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3269 // contains(U+FFFD) is inconsistent with contains(some surrogates),
3270 // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3271 // Skip the UTF-8 part of the test - if the string contains surrogates -
3272 // because it is likely to produce a different result.
3273 UBool inconsistentSurrogates=
3274 (!(sets[0]->getSet().contains(0xfffd) ?
3275 sets[0]->getSet().contains(0xd800, 0xdfff) :
3276 sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3277 sets[0]->hasStringsWithSurrogates());
3278
3279 UChar s[1000];
3280 int32_t length=0;
3281 uint32_t localWhichSpans;
3282
3283 UChar32 c, first;
3284 for(first=c=0;; c=nextCodePoint(c)) {
3285 if(c>0x10ffff || length>(LENGTHOF(s)-U16_MAX_LENGTH)) {
3286 localWhichSpans=whichSpans;
3287 if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3288 localWhichSpans&=~SPAN_UTF8;
3289 }
3290 testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3291 if(c>0x10ffff) {
3292 break;
3293 }
3294 length=0;
3295 first=c;
3296 }
3297 U16_APPEND_UNSAFE(s, length, c);
3298 }
3299}
3300
3301// Test with a particular, interesting string.
3302// Specify length and try NUL-termination.
3303void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3304 static const UChar s[]={
3305 0x61, 0x62, 0x20, // Latin, space
3306 0x3b1, 0x3b2, 0x3b3, // Greek
3307 0xd900, // lead surrogate
3308 0x3000, 0x30ab, 0x30ad, // wide space, Katakana
3309 0xdc05, // trail surrogate
3310 0xa0, 0xac00, 0xd7a3, // nbsp, Hangul
3311 0xd900, 0xdc05, // unassigned supplementary
3312 0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary
3313 0xd7a4, 0xdc05, 0xd900, 0x2028, // unassigned, surrogates in wrong order, LS
3314 0 // NUL
3315 };
3316
3317 if((whichSpans&SPAN_UTF16)==0) {
3318 return;
3319 }
3320 testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
3321 testSpan(sets, s, LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
3322}
3323
3324void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3325 static const char s[]={
3326 "abc" // Latin
3327
3328 /* trail byte in lead position */
3329 "\x80"
3330
3331 " " // space
3332
3333 /* truncated multi-byte sequences */
3334 "\xd0"
3335 "\xe0"
3336 "\xe1"
3337 "\xed"
3338 "\xee"
3339 "\xf0"
3340 "\xf1"
3341 "\xf4"
3342 "\xf8"
3343 "\xfc"
3344
3345 "\xCE\xB1\xCE\xB2\xCE\xB3" // Greek
3346
3347 /* trail byte in lead position */
3348 "\x80"
3349
3350 "\xe0\x80"
3351 "\xe0\xa0"
3352 "\xe1\x80"
3353 "\xed\x80"
3354 "\xed\xa0"
3355 "\xee\x80"
3356 "\xf0\x80"
3357 "\xf0\x90"
3358 "\xf1\x80"
3359 "\xf4\x80"
3360 "\xf4\x90"
3361 "\xf8\x80"
3362 "\xfc\x80"
3363
3364 "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD" // wide space, Katakana
3365
3366 /* trail byte in lead position */
3367 "\x80"
3368
3369 "\xf0\x80\x80"
3370 "\xf0\x90\x80"
3371 "\xf1\x80\x80"
3372 "\xf4\x80\x80"
3373 "\xf4\x90\x80"
3374 "\xf8\x80\x80"
3375 "\xfc\x80\x80"
3376
3377 "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3" // nbsp, Hangul
3378
3379 /* trail byte in lead position */
3380 "\x80"
3381
3382 "\xf8\x80\x80\x80"
3383 "\xfc\x80\x80\x80"
3384
3385 "\xF1\x90\x80\x85" // unassigned supplementary
3386
3387 /* trail byte in lead position */
3388 "\x80"
3389
3390 "\xfc\x80\x80\x80\x80"
3391
3392 "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE" // Han supplementary
3393
3394 /* trail byte in lead position */
3395 "\x80"
3396
3397 /* complete sequences but non-shortest forms or out of range etc. */
3398 "\xc0\x80"
3399 "\xe0\x80\x80"
3400 "\xed\xa0\x80"
3401 "\xf0\x80\x80\x80"
3402 "\xf4\x90\x80\x80"
3403 "\xf8\x80\x80\x80\x80"
3404 "\xfc\x80\x80\x80\x80\x80"
3405 "\xfe"
3406 "\xff"
3407
3408 /* trail byte in lead position */
3409 "\x80"
3410
3411 "\xED\x9E\xA4\xE2\x80\xA8" // unassigned, LS, NUL-terminated
3412 };
3413
3414 if((whichSpans&SPAN_UTF8)==0) {
3415 return;
3416 }
3417 testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
3418 testSpan(sets, s, LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
3419}
3420
// Take a set of span options and multiply them so that
// each portion only has one of the options a, b and c.
// If b==0, then the set of options is just modified with mask and a.
// If b!=0 and c==0, then the set of options is just modified with mask, a and b.
//
// Each of the first whichSpansCount entries is reduced with mask;
// the a-variant replaces the entry in place, and the b- and c-variants are written
// at the same position in the second and third portion of the array.
// The caller must provide enough capacity for the resulting number of entries.
// Returns the new number of entries.
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    // c is only considered when b is also set.
    const int32_t portions= (b==0) ? 1 : ((c==0) ? 2 : 3);

    for(int32_t k=0; k<whichSpansCount; ++k) {
        const uint32_t kept=whichSpans[k]&mask;
        whichSpans[k]=kept|a;
        if(portions>=2) {
            whichSpans[whichSpansCount+k]=kept|b;
        }
        if(portions==3) {
            whichSpans[2*whichSpansCount+k]=kept|c;
        }
    }
    return portions*whichSpansCount;
}
3443
// String literals with 63 or 64 repetitions of 'a' or 'b'.
// Each 63-literal is written as seven 8-character pieces plus one 7-character piece;
// each 64-literal appends one more character to the 63-literal.
#define _63_a "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaa"
#define _64_a _63_a "a"
#define _63_b "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbb"
#define _64_b _63_b "b"
3448
3449void UnicodeSetTest::TestSpan() {
3450 // "[...]" is a UnicodeSet pattern.
3451 // "*" performs tests on all Unicode code points and on a selection of
3452 // malformed UTF-8/16 strings.
3453 // "-options" limits the scope of testing for the current set.
3454 // By default, the test verifies that equivalent boundaries are found
3455 // for UTF-16 and UTF-8, going forward and backward,
3456 // alternating USET_SPAN_NOT_CONTAINED with
3457 // either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3458 // Single-character options:
3459 // 8 -- UTF-16 and UTF-8 boundaries may differ.
3460 // Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3461 // or the set contains strings with unpaired surrogates
3462 // which do not translate to valid UTF-8.
3463 // c -- set.span() and set.complement().span() boundaries may differ.
3464 // Cause: Set strings are not complemented.
3465 // b -- span() and spanBack() boundaries may differ.
3466 // Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3467 // and spanBack(USET_SPAN_SIMPLE) are defined to
3468 // match with non-overlapping substrings.
3469 // For example, with a set containing "ab" and "ba",
3470 // span() of "aba" yields boundaries { 0, 2, 3 }
3471 // because the initial "ab" matches from 0 to 2,
3472 // while spanBack() yields boundaries { 0, 1, 3 }
3473 // because the final "ba" matches from 1 to 3.
3474 // l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3475 // Cause: Strings in the set overlap, and a longer match may
3476 // require a sequence including non-longest substrings.
3477 // For example, with a set containing "ab", "abc" and "cd",
3478 // span(contained) of "abcd" spans the entire string
3479 // but span(longest match) only spans the first 3 characters.
3480 // Each "-options" first resets all options and then applies the specified options.
3481 // A "-" without options resets the options.
3482 // The options are also reset for each new set.
3483 // Other strings will be spanned.
3484 static const char *const testdata[]={
3485 "[:ID_Continue:]",
3486 "*",
3487 "[:White_Space:]",
3488 "*",
3489 "[]",
3490 "*",
3491 "[\\u0000-\\U0010FFFF]",
3492 "*",
3493 "[\\u0000\\u0080\\u0800\\U00010000]",
3494 "*",
3495 "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3496 "*",
3497 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3498 "-c",
3499 "*",
3500 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3501 "-c",
3502 "*",
3503
3504 // Overlapping strings cause overlapping attempts to match.
3505 "[x{xy}{xya}{axy}{ax}]",
3506 "-cl",
3507
3508 // More repetitions of "xya" would take too long with the recursive
3509 // reference implementation.
3510 // containsAll()=FALSE
3511 // test_string 0x14
3512 "xx"
3513 "xyaxyaxyaxya" // set.complement().span(longest match) will stop here.
3514 "xx" // set.complement().span(contained) will stop between the two 'x'es.
3515 "xyaxyaxyaxya"
3516 "xx"
3517 "xyaxyaxyaxya" // span() ends here.
3518 "aaa",
3519
3520 // containsAll()=TRUE
3521 // test_string 0x15
3522 "xx"
3523 "xyaxyaxyaxya"
3524 "xx"
3525 "xyaxyaxyaxya"
3526 "xx"
3527 "xyaxyaxyaxy",
3528
3529 "-bc",
3530 // test_string 0x17
3531 "byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 }
3532 "-c",
3533 "byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 }
3534 "byayax", // span() -> { 4, 6 } complement.span() -> { 6 }
3535 "-",
3536 "byaya", // span() -> { 5 }
3537 "byay", // span() -> { 4 }
3538 "bya", // span() -> { 3 }
3539
3540 // span(longest match) will not span the whole string.
3541 "[a{ab}{bc}]",
3542 "-cl",
3543 // test_string 0x21
3544 "abc",
3545
3546 "[a{ab}{abc}{cd}]",
3547 "-cl",
3548 "acdabcdabccd",
3549
3550 // spanBack(longest match) will not span the whole string.
3551 "[c{ab}{bc}]",
3552 "-cl",
3553 "abc",
3554
3555 "[d{cd}{bcd}{ab}]",
3556 "-cl",
3557 "abbcdabcdabd",
3558
3559 // Test with non-ASCII set strings - test proper handling of surrogate pairs
3560 // and UTF-8 trail bytes.
3561 // Copies of above test sets and strings, but transliterated to have
3562 // different code points with similar trail units.
3563 // Previous: a b c d
3564 // Unicode: 042B 30AB 200AB 204AB
3565 // UTF-16: 042B 30AB D840 DCAB D841 DCAB
3566 // UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB
3567 "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3568 "-cl",
3569 "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3570
3571 "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3572 "-cl",
3573 "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3574
3575 // Stress bookkeeping and recursion.
3576 // The following strings are barely doable with the recursive
3577 // reference implementation.
3578 // The not-contained character at the end prevents an early exit from the span().
3579 "[b{bb}]",
3580 "-c",
3581 // test_string 0x33
3582 "bbbbbbbbbbbbbbbbbbbbbbbb-",
3583 // On complement sets, span() and spanBack() get different results
3584 // because b is not in the complement set and there is an odd number of b's
3585 // in the test string.
3586 "-bc",
3587 "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3588
3589 // Test with set strings with an initial or final code point span
3590 // longer than 254.
3591 "[a{" _64_a _64_a _64_a _64_a "b}"
3592 "{a" _64_b _64_b _64_b _64_b "}]",
3593 "-c",
3594 _64_a _64_a _64_a _63_a "b",
3595 _64_a _64_a _64_a _64_a "b",
3596 _64_a _64_a _64_a _64_a "aaaabbbb",
3597 "a" _64_b _64_b _64_b _63_b,
3598 "a" _64_b _64_b _64_b _64_b,
3599 "aaaabbbb" _64_b _64_b _64_b _64_b,
3600
3601 // Test with strings containing unpaired surrogates.
3602 // They are not representable in UTF-8, and a leading trail surrogate
3603 // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3604 // U+20001 == \\uD840\\uDC01
3605 // U+20400 == \\uD841\\uDC00
3606 "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3607 "-8cl",
3608 "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3609 };
3610 uint32_t whichSpans[96]={ SPAN_ALL };
3611 int32_t whichSpansCount=1;
3612
3613 UnicodeSet *sets[SET_COUNT]={ NULL };
3614 const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
3615
3616 char testName[1024];
3617 char *testNameLimit=testName;
3618
3619 int32_t i, j;
3620 for(i=0; i<LENGTHOF(testdata); ++i) {
3621 const char *s=testdata[i];
3622 if(s[0]=='[') {
3623 // Create new test sets from this pattern.
3624 for(j=0; j<SET_COUNT; ++j) {
3625 delete sets_with_str[j];
3626 delete sets[j];
3627 }
3628 UErrorCode errorCode=U_ZERO_ERROR;
3629 sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3630 if(U_FAILURE(errorCode)) {
729e4ab9 3631 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
46f4442e
A
3632 break;
3633 }
3634 sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3635 sets[SLOW_NOT]->complement();
3636 // Intermediate set: Test cloning of a frozen set.
3637 UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3638 fast->freeze();
3639 sets[FAST]=(UnicodeSet *)fast->clone();
3640 delete fast;
3641 UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3642 fastNot->freeze();
3643 sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
3644 delete fastNot;
3645
3646 for(j=0; j<SET_COUNT; ++j) {
3647 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3648 }
3649
3650 strcpy(testName, s);
3651 testNameLimit=strchr(testName, 0);
3652 *testNameLimit++=':';
3653 *testNameLimit=0;
3654
3655 whichSpans[0]=SPAN_ALL;
3656 whichSpansCount=1;
3657 } else if(s[0]=='-') {
3658 whichSpans[0]=SPAN_ALL;
3659 whichSpansCount=1;
3660
3661 while(*++s!=0) {
3662 switch(*s) {
3663 case 'c':
3664 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3665 ~SPAN_POLARITY,
3666 SPAN_SET,
3667 SPAN_COMPLEMENT,
3668 0);
3669 break;
3670 case 'b':
3671 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3672 ~SPAN_DIRS,
3673 SPAN_FWD,
3674 SPAN_BACK,
3675 0);
3676 break;
3677 case 'l':
3678 // test USET_SPAN_CONTAINED FWD & BACK, and separately
3679 // USET_SPAN_SIMPLE only FWD, and separately
3680 // USET_SPAN_SIMPLE only BACK
3681 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3682 ~(SPAN_DIRS|SPAN_CONDITION),
3683 SPAN_DIRS|SPAN_CONTAINED,
3684 SPAN_FWD|SPAN_SIMPLE,
3685 SPAN_BACK|SPAN_SIMPLE);
3686 break;
3687 case '8':
3688 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3689 ~SPAN_UTFS,
3690 SPAN_UTF16,
3691 SPAN_UTF8,
3692 0);
3693 break;
3694 default:
3695 errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3696 break;
3697 }
3698 }
3699 } else if(0==strcmp(s, "*")) {
3700 strcpy(testNameLimit, "bad_string");
3701 for(j=0; j<whichSpansCount; ++j) {
3702 if(whichSpansCount>1) {
3703 sprintf(testNameLimit+10 /* strlen("bad_string") */,
3704 "%%0x%3x",
3705 whichSpans[j]);
3706 }
3707 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3708 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3709 }
3710
3711 strcpy(testNameLimit, "contents");
3712 for(j=0; j<whichSpansCount; ++j) {
3713 if(whichSpansCount>1) {
3714 sprintf(testNameLimit+8 /* strlen("contents") */,
3715 "%%0x%3x",
3716 whichSpans[j]);
3717 }
3718 testSpanContents(sets_with_str, whichSpans[j], testName);
3719 }
3720 } else {
3721 UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3722 strcpy(testNameLimit, "test_string");
3723 for(j=0; j<whichSpansCount; ++j) {
3724 if(whichSpansCount>1) {
3725 sprintf(testNameLimit+11 /* strlen("test_string") */,
3726 "%%0x%3x",
3727 whichSpans[j]);
3728 }
3729 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3730 }
3731 }
3732 }
3733 for(j=0; j<SET_COUNT; ++j) {
3734 delete sets_with_str[j];
3735 delete sets[j];
3736 }
3737}
3738
3739// Test select patterns and strings, and test USET_SPAN_SIMPLE.
3740void UnicodeSetTest::TestStringSpan() {
3741 static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3742 static const char *const string=
3743 "xx"
3744 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3745 "xx"
3746 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3747 "xx"
3748 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3749 "aaaa";
3750
3751 UErrorCode errorCode=U_ZERO_ERROR;
3752 UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3753 UnicodeSet set(pattern16, errorCode);
3754 if(U_FAILURE(errorCode)) {
3755 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3756 return;
3757 }
3758
3759 UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3760
3761 if(set.containsAll(string16)) {
3762 errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
3763 }
3764
3765 // Remove trailing "aaaa".
3766 string16.truncate(string16.length()-4);
3767 if(!set.containsAll(string16)) {
3768 errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
3769 }
3770
3771 string16=UNICODE_STRING_SIMPLE("byayaxya");
3772 const UChar *s16=string16.getBuffer();
3773 int32_t length16=string16.length();
3774 if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3775 set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3776 set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3777 set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3778 set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3779 set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3780 ) {
3781 errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3782 }
3783
3784 pattern="[a{ab}{abc}{cd}]";
3785 pattern16=UnicodeString(pattern, -1, US_INV);
3786 set.applyPattern(pattern16, errorCode);
3787 if(U_FAILURE(errorCode)) {
3788 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3789 return;
3790 }
3791 string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
3792 s16=string16.getBuffer();
3793 length16=string16.length();
3794 if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3795 set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3796 set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3797 ) {
3798 errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3799 }
3800
3801 pattern="[d{cd}{bcd}{ab}]";
3802 pattern16=UnicodeString(pattern, -1, US_INV);
3803 set.applyPattern(pattern16, errorCode).freeze();
3804 if(U_FAILURE(errorCode)) {
3805 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3806 return;
3807 }
3808 string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
3809 s16=string16.getBuffer();
3810 length16=string16.length();
3811 if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3812 set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3813 set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3814 ) {
3815 errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3816 }
3817}