]> git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/test/intltest/usettest.cpp
ICU-400.38.tar.gz
[apple/icu.git] / icuSources / test / intltest / usettest.cpp
... / ...
CommitLineData
1/*
2********************************************************************************
3* Copyright (C) 1999-2008 International Business Machines Corporation and
4* others. All Rights Reserved.
5********************************************************************************
6* Date Name Description
7* 10/20/99 alan Creation.
8* 03/22/2000 Madhu Added additional tests
9********************************************************************************
10*/
11
12#include <stdio.h>
13
14#include <string.h>
15#include "unicode/utypes.h"
16#include "usettest.h"
17#include "unicode/ucnv.h"
18#include "unicode/uniset.h"
19#include "unicode/uchar.h"
20#include "unicode/usetiter.h"
21#include "unicode/ustring.h"
22#include "unicode/parsepos.h"
23#include "unicode/symtable.h"
24#include "unicode/uversion.h"
25#include "hash.h"
26
// Number of elements in a true array (not a pointer), as an int32_t.
// The whole expansion is parenthesized so the macro behaves as a single
// primary expression wherever it is used.
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
28
// Reports a test failure (source file, line and ICU error name) through
// errln() when `status` holds a failure code. Execution continues afterwards.
// Note that `status` is evaluated twice when it indicates a failure.
#define TEST_ASSERT_SUCCESS(status) {                                   \
    if (U_FAILURE(status)) {                                            \
        errln("fail in file \"%s\", line %d: \"%s\"",                   \
              __FILE__, __LINE__, u_errorName(status));                 \
    }                                                                   \
}
32
// Reports a test failure (source file and line) through errln() when `expr`
// evaluates to false. Execution continues afterwards.
#define TEST_ASSERT(expr) {                                             \
    if (!(expr)) {                                                      \
        errln("fail in file \"%s\", line %d", __FILE__, __LINE__);      \
    }                                                                   \
}
35
36UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
37 UnicodeString pat;
38 set.toPattern(pat);
39 return left + UnicodeSetTest::escape(pat);
40}
41
// Expands to one `case` label for the test dispatcher: it always stores the
// test's name in `name`, and only when `exec` is set does it log a banner and
// invoke the test. `name` and `exec` must be in scope at the expansion site;
// the caller supplies the semicolon that terminates the trailing `break`.
#define CASE(id,test)               \
    case id:                        \
        name = #test;               \
        if (exec) {                 \
            logln(#test "---");     \
            logln();                \
            test();                 \
        }                           \
        break
50
51UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
52}
53
54UConverter *UnicodeSetTest::openUTF8Converter() {
55 if(utf8Cnv==NULL) {
56 UErrorCode errorCode=U_ZERO_ERROR;
57 utf8Cnv=ucnv_open("UTF-8", &errorCode);
58 }
59 return utf8Cnv;
60}
61
62UnicodeSetTest::~UnicodeSetTest() {
63 ucnv_close(utf8Cnv);
64}
65
66void
67UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
68 const char* &name, char* /*par*/) {
69 // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
70 switch (index) {
71 CASE(0,TestPatterns);
72 CASE(1,TestAddRemove);
73 CASE(2,TestCategories);
74 CASE(3,TestCloneEqualHash);
75 CASE(4,TestMinimalRep);
76 CASE(5,TestAPI);
77 CASE(6,TestScriptSet);
78 CASE(7,TestPropertySet);
79 CASE(8,TestClone);
80 CASE(9,TestExhaustive);
81 CASE(10,TestToPattern);
82 CASE(11,TestIndexOf);
83 CASE(12,TestStrings);
84 CASE(13,Testj2268);
85 CASE(14,TestCloseOver);
86 CASE(15,TestEscapePattern);
87 CASE(16,TestInvalidCodePoint);
88 CASE(17,TestSymbolTable);
89 CASE(18,TestSurrogate);
90 CASE(19,TestPosixClasses);
91 CASE(20,TestIteration);
92 CASE(21,TestFreezable);
93 CASE(22,TestSpan);
94 CASE(23,TestStringSpan);
95 default: name = ""; break;
96 }
97}
98
99static const char NOT[] = "%%%%";
100
101/**
102 * UVector was improperly copying contents
103 * This code will crash this is still true
104 */
105void UnicodeSetTest::Testj2268() {
106 UnicodeSet t;
107 t.add(UnicodeString("abc"));
108 UnicodeSet test(t);
109 UnicodeString ustrPat;
110 test.toPattern(ustrPat, TRUE);
111}
112
113/**
114 * Test toPattern().
115 */
116void UnicodeSetTest::TestToPattern() {
117 UErrorCode ec = U_ZERO_ERROR;
118
119 // Test that toPattern() round trips with syntax characters and
120 // whitespace.
121 {
122 static const char* OTHER_TOPATTERN_TESTS[] = {
123 "[[:latin:]&[:greek:]]",
124 "[[:latin:]-[:greek:]]",
125 "[:nonspacing mark:]",
126 NULL
127 };
128
129 for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
130 ec = U_ZERO_ERROR;
131 UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
132 if (U_FAILURE(ec)) {
133 errln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j]);
134 continue;
135 }
136 checkPat(OTHER_TOPATTERN_TESTS[j], s);
137 }
138
139 for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
140 if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
141
142 // check various combinations to make sure they all work.
143 if (i != 0 && !toPatternAux(i, i)){
144 continue;
145 }
146 if (!toPatternAux(0, i)){
147 continue;
148 }
149 if (!toPatternAux(i, 0xFFFF)){
150 continue;
151 }
152 }
153 }
154 }
155
156 // Test pattern behavior of multicharacter strings.
157 {
158 ec = U_ZERO_ERROR;
159 UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
160
161 // This loop isn't a loop. It's here to make the compiler happy.
162 // If you're curious, try removing it and changing the 'break'
163 // statements (except for the last) to goto's.
164 for (;;) {
165 if (U_FAILURE(ec)) break;
166 const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
167 expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
168
169 s->add("ac");
170 const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
171 expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
172
173 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
174 if (U_FAILURE(ec)) break;
175 const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
176 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
177
178 s->add("[]");
179 const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
180 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
181
182 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
183 if (U_FAILURE(ec)) break;
184 const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
185 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
186
187 // j2189
188 s->clear();
189 s->add(UnicodeString("abc", ""));
190 s->add(UnicodeString("abc", ""));
191 const char* exp6[] = {"abc", NOT, "ab", NULL};
192 expectToPattern(*s, "[{abc}]", exp6);
193
194 break;
195 }
196
197 if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
198 delete s;
199 }
200
201 // JB#3400: For 2 character ranges prefer [ab] to [a-b]
202 UnicodeSet s;
203 s.add((UChar)97, (UChar)98); // 'a', 'b'
204 expectToPattern(s, "[ab]", NULL);
205}
206
207UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
208
209 // use Integer.toString because Utility.hex doesn't handle ints
210 UnicodeString pat = "";
211 // TODO do these in hex
212 //String source = "0x" + Integer.toString(start,16).toUpperCase();
213 //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
214 UnicodeString source;
215 source = source + (uint32_t)start;
216 if (start != end)
217 source = source + ".." + (uint32_t)end;
218 UnicodeSet testSet;
219 testSet.add(start, end);
220 return checkPat(source, testSet);
221}
222
223UBool UnicodeSetTest::checkPat(const UnicodeString& source,
224 const UnicodeSet& testSet) {
225 // What we want to make sure of is that a pattern generated
226 // by toPattern(), with or without escaped unprintables, can
227 // be passed back into the UnicodeSet constructor.
228 UnicodeString pat0;
229
230 testSet.toPattern(pat0, TRUE);
231
232 if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
233
234 //String pat1 = unescapeLeniently(pat0);
235 //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
236
237 UnicodeString pat2;
238 testSet.toPattern(pat2, FALSE);
239 if (!checkPat(source, testSet, pat2)) return FALSE;
240
241 //String pat3 = unescapeLeniently(pat2);
242 // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
243
244 //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
245 logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
246 return TRUE;
247}
248
249UBool UnicodeSetTest::checkPat(const UnicodeString& source,
250 const UnicodeSet& testSet,
251 const UnicodeString& pat) {
252 UErrorCode ec = U_ZERO_ERROR;
253 UnicodeSet testSet2(pat, ec);
254 if (testSet2 != testSet) {
255 errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
256 return FALSE;
257 }
258 return TRUE;
259}
260
261void
262UnicodeSetTest::TestPatterns(void) {
263 UnicodeSet set;
264 expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km");
265 expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz");
266 expectPattern(set, UnicodeString("[a\\-z]", ""), "--aazz");
267 expectPattern(set, UnicodeString("[-az]", ""), "--aazz");
268 expectPattern(set, UnicodeString("[az-]", ""), "--aazz");
269 expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
270
271 // Throw in a test of complement
272 set.complement();
273 UnicodeString exp;
274 exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
275 expectPairs(set, exp);
276}
277
278void
279UnicodeSetTest::TestCategories(void) {
280 UErrorCode status = U_ZERO_ERROR;
281 const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
282 UnicodeSet set(pat, status);
283 if (U_FAILURE(status)) {
284 errln((UnicodeString)"Fail: Can't construct set with " + pat);
285 } else {
286 expectContainment(set, pat, "ABC", "abc");
287 }
288
289 UChar32 i;
290 int32_t failures = 0;
291 // Make sure generation of L doesn't pollute cached Lu set
292 // First generate L, then Lu
293 set.applyPattern("[:L:]", status);
294 if (U_FAILURE(status)) { errln("FAIL"); return; }
295 for (i=0; i<0x200; ++i) {
296 UBool l = u_isalpha((UChar)i);
297 if (l != set.contains(i)) {
298 errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
299 set.contains(i));
300 if (++failures == 10) break;
301 }
302 }
303
304 set.applyPattern("[:Lu:]", status);
305 if (U_FAILURE(status)) { errln("FAIL"); return; }
306 for (i=0; i<0x200; ++i) {
307 UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
308 if (lu != set.contains(i)) {
309 errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
310 set.contains(i));
311 if (++failures == 20) break;
312 }
313 }
314}
315void
316UnicodeSetTest::TestCloneEqualHash(void) {
317 UErrorCode status = U_ZERO_ERROR;
318 // set1 and set2 used to be built with the obsolete constructor taking
319 // UCharCategory values; replaced with pattern constructors
320 // markus 20030502
321 UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); // :Ll: Letter, lowercase
322 UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); // Letter, lowercase
323 if (U_FAILURE(status)){
324 errln((UnicodeString)"FAIL: Can't construst set with category->Ll");
325 return;
326 }
327 UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status); //Number, Decimal digit
328 UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status); //Number, Decimal digit
329 if (U_FAILURE(status)){
330 errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
331 return;
332 }
333
334 if (*set1 != *set1a) {
335 errln("FAIL: category constructor for Ll broken");
336 }
337 if (*set2 != *set2a) {
338 errln("FAIL: category constructor for Nd broken");
339 }
340 delete set1a;
341 delete set2a;
342
343 logln("Testing copy construction");
344 UnicodeSet *set1copy=new UnicodeSet(*set1);
345 if(*set1 != *set1copy || *set1 == *set2 ||
346 getPairs(*set1) != getPairs(*set1copy) ||
347 set1->hashCode() != set1copy->hashCode()){
348 errln("FAIL : Error in copy construction");
349 return;
350 }
351
352 logln("Testing =operator");
353 UnicodeSet set1equal=*set1;
354 UnicodeSet set2equal=*set2;
355 if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
356 set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
357 errln("FAIL: Error in =operator");
358 }
359
360 logln("Testing clone()");
361 UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
362 UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
363 if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
364 *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
365 *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
366 errln("FAIL: Error in clone");
367 }
368
369 logln("Testing hashcode");
370 if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
371 set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
372 set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
373 set1->hashCode() == set2->hashCode() || set1copy->hashCode() == set2->hashCode() ||
374 set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
375 errln("FAIL: Error in hashCode()");
376 }
377
378 delete set1;
379 delete set1copy;
380 delete set2;
381 delete set1clone;
382 delete set2clone;
383
384
385}
386void
387UnicodeSetTest::TestAddRemove(void) {
388 UnicodeSet set; // Construct empty set
389 doAssert(set.isEmpty() == TRUE, "set should be empty");
390 doAssert(set.size() == 0, "size should be 0");
391 set.complement();
392 doAssert(set.size() == 0x110000, "size should be 0x110000");
393 set.clear();
394 set.add(0x0061, 0x007a);
395 expectPairs(set, "az");
396 doAssert(set.isEmpty() == FALSE, "set should not be empty");
397 doAssert(set.size() != 0, "size should not be equal to 0");
398 doAssert(set.size() == 26, "size should be equal to 26");
399 set.remove(0x006d, 0x0070);
400 expectPairs(set, "alqz");
401 doAssert(set.size() == 22, "size should be equal to 22");
402 set.remove(0x0065, 0x0067);
403 expectPairs(set, "adhlqz");
404 doAssert(set.size() == 19, "size should be equal to 19");
405 set.remove(0x0064, 0x0069);
406 expectPairs(set, "acjlqz");
407 doAssert(set.size() == 16, "size should be equal to 16");
408 set.remove(0x0063, 0x0072);
409 expectPairs(set, "absz");
410 doAssert(set.size() == 10, "size should be equal to 10");
411 set.add(0x0066, 0x0071);
412 expectPairs(set, "abfqsz");
413 doAssert(set.size() == 22, "size should be equal to 22");
414 set.remove(0x0061, 0x0067);
415 expectPairs(set, "hqsz");
416 set.remove(0x0061, 0x007a);
417 expectPairs(set, "");
418 doAssert(set.isEmpty() == TRUE, "set should be empty");
419 doAssert(set.size() == 0, "size should be 0");
420 set.add(0x0061);
421 doAssert(set.isEmpty() == FALSE, "set should not be empty");
422 doAssert(set.size() == 1, "size should not be equal to 1");
423 set.add(0x0062);
424 set.add(0x0063);
425 expectPairs(set, "ac");
426 doAssert(set.size() == 3, "size should not be equal to 3");
427 set.add(0x0070);
428 set.add(0x0071);
429 expectPairs(set, "acpq");
430 doAssert(set.size() == 5, "size should not be equal to 5");
431 set.clear();
432 expectPairs(set, "");
433 doAssert(set.isEmpty() == TRUE, "set should be empty");
434 doAssert(set.size() == 0, "size should be 0");
435
436 // Try removing an entire set from another set
437 expectPattern(set, "[c-x]", "cx");
438 UnicodeSet set2;
439 expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
440 set.removeAll(set2);
441 expectPairs(set, "deluxx");
442
443 // Try adding an entire set to another set
444 expectPattern(set, "[jackiemclean]", "aacceein");
445 expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
446 set.addAll(set2);
447 expectPairs(set, "aacehort");
448 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
449
450 // Try retaining an set of elements contained in another set (intersection)
451 UnicodeSet set3;
452 expectPattern(set3, "[a-c]", "ac");
453 doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
454 set3.remove(0x0062);
455 expectPairs(set3, "aacc");
456 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
457 set.retainAll(set3);
458 expectPairs(set, "aacc");
459 doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
460 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
461 set.clear();
462 doAssert(set.size() != set3.size(), "set.size() != set3.size()");
463
464 // Test commutativity
465 expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
466 expectPattern(set2, "[jackiemclean]", "aacceein");
467 set.addAll(set2);
468 expectPairs(set, "aacehort");
469 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
470
471
472
473
474}
475
476/**
477 * Make sure minimal representation is maintained.
478 */
479void UnicodeSetTest::TestMinimalRep() {
480 UErrorCode status = U_ZERO_ERROR;
481 // This is pretty thoroughly tested by checkCanonicalRep()
482 // run against the exhaustive operation results. Use the code
483 // here for debugging specific spot problems.
484
485 // 1 overlap against 2
486 UnicodeSet set("[h-km-q]", status);
487 if (U_FAILURE(status)) { errln("FAIL"); return; }
488 UnicodeSet set2("[i-o]", status);
489 if (U_FAILURE(status)) { errln("FAIL"); return; }
490 set.addAll(set2);
491 expectPairs(set, "hq");
492 // right
493 set.applyPattern("[a-m]", status);
494 if (U_FAILURE(status)) { errln("FAIL"); return; }
495 set2.applyPattern("[e-o]", status);
496 if (U_FAILURE(status)) { errln("FAIL"); return; }
497 set.addAll(set2);
498 expectPairs(set, "ao");
499 // left
500 set.applyPattern("[e-o]", status);
501 if (U_FAILURE(status)) { errln("FAIL"); return; }
502 set2.applyPattern("[a-m]", status);
503 if (U_FAILURE(status)) { errln("FAIL"); return; }
504 set.addAll(set2);
505 expectPairs(set, "ao");
506 // 1 overlap against 3
507 set.applyPattern("[a-eg-mo-w]", status);
508 if (U_FAILURE(status)) { errln("FAIL"); return; }
509 set2.applyPattern("[d-q]", status);
510 if (U_FAILURE(status)) { errln("FAIL"); return; }
511 set.addAll(set2);
512 expectPairs(set, "aw");
513}
514
515void UnicodeSetTest::TestAPI() {
516 UErrorCode status = U_ZERO_ERROR;
517 // default ct
518 UnicodeSet set;
519 if (!set.isEmpty() || set.getRangeCount() != 0) {
520 errln((UnicodeString)"FAIL, set should be empty but isn't: " +
521 set);
522 }
523
524 // clear(), isEmpty()
525 set.add(0x0061);
526 if (set.isEmpty()) {
527 errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
528 set);
529 }
530 set.clear();
531 if (!set.isEmpty()) {
532 errln((UnicodeString)"FAIL, set should be empty but isn't: " +
533 set);
534 }
535
536 // size()
537 set.clear();
538 if (set.size() != 0) {
539 errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
540 ": " + set);
541 }
542 set.add(0x0061);
543 if (set.size() != 1) {
544 errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
545 ": " + set);
546 }
547 set.add(0x0031, 0x0039);
548 if (set.size() != 10) {
549 errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
550 ": " + set);
551 }
552
553 // contains(first, last)
554 set.clear();
555 set.applyPattern("[A-Y 1-8 b-d l-y]", status);
556 if (U_FAILURE(status)) { errln("FAIL"); return; }
557 for (int32_t i = 0; i<set.getRangeCount(); ++i) {
558 UChar32 a = set.getRangeStart(i);
559 UChar32 b = set.getRangeEnd(i);
560 if (!set.contains(a, b)) {
561 errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
562 " but doesn't: " + set);
563 }
564 if (set.contains((UChar32)(a-1), b)) {
565 errln((UnicodeString)"FAIL, shouldn't contain " +
566 (unsigned short)(a-1) + '-' + (unsigned short)b +
567 " but does: " + set);
568 }
569 if (set.contains(a, (UChar32)(b+1))) {
570 errln((UnicodeString)"FAIL, shouldn't contain " +
571 (unsigned short)a + '-' + (unsigned short)(b+1) +
572 " but does: " + set);
573 }
574 }
575
576 // Ported InversionList test.
577 UnicodeSet a((UChar32)3,(UChar32)10);
578 UnicodeSet b((UChar32)7,(UChar32)15);
579 UnicodeSet c;
580
581 logln((UnicodeString)"a [3-10]: " + a);
582 logln((UnicodeString)"b [7-15]: " + b);
583 c = a;
584 c.addAll(b);
585 UnicodeSet exp((UChar32)3,(UChar32)15);
586 if (c == exp) {
587 logln((UnicodeString)"c.set(a).add(b): " + c);
588 } else {
589 errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
590 }
591 c.complement();
592 exp.set((UChar32)0, (UChar32)2);
593 exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
594 if (c == exp) {
595 logln((UnicodeString)"c.complement(): " + c);
596 } else {
597 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
598 }
599 c.complement();
600 exp.set((UChar32)3, (UChar32)15);
601 if (c == exp) {
602 logln((UnicodeString)"c.complement(): " + c);
603 } else {
604 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
605 }
606 c = a;
607 c.complementAll(b);
608 exp.set((UChar32)3,(UChar32)6);
609 exp.add((UChar32)11,(UChar32) 15);
610 if (c == exp) {
611 logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
612 } else {
613 errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
614 }
615
616 exp = c;
617 bitsToSet(setToBits(c), c);
618 if (c == exp) {
619 logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
620 } else {
621 errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
622 }
623
624 // Additional tests for coverage JB#2118
625 //UnicodeSet::complement(class UnicodeString const &)
626 //UnicodeSet::complementAll(class UnicodeString const &)
627 //UnicodeSet::containsNone(class UnicodeSet const &)
628 //UnicodeSet::containsNone(long,long)
629 //UnicodeSet::containsSome(class UnicodeSet const &)
630 //UnicodeSet::containsSome(long,long)
631 //UnicodeSet::removeAll(class UnicodeString const &)
632 //UnicodeSet::retain(long)
633 //UnicodeSet::retainAll(class UnicodeString const &)
634 //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
635 //UnicodeSetIterator::getString(void)
636 set.clear();
637 set.complement("ab");
638 exp.applyPattern("[{ab}]", status);
639 if (U_FAILURE(status)) { errln("FAIL"); return; }
640 if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
641
642 UnicodeSetIterator iset(set);
643 if (!iset.next() || !iset.isString()) {
644 errln("FAIL: UnicodeSetIterator::next/isString");
645 } else if (iset.getString() != "ab") {
646 errln("FAIL: UnicodeSetIterator::getString");
647 }
648
649 set.add((UChar32)0x61, (UChar32)0x7A);
650 set.complementAll("alan");
651 exp.applyPattern("[{ab}b-kmo-z]", status);
652 if (U_FAILURE(status)) { errln("FAIL"); return; }
653 if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
654
655 exp.applyPattern("[a-z]", status);
656 if (U_FAILURE(status)) { errln("FAIL"); return; }
657 if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
658 if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
659 exp.applyPattern("[aln]", status);
660 if (U_FAILURE(status)) { errln("FAIL"); return; }
661 if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
662 if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
663
664 if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
665 errln("FAIL: containsNone(UChar32, UChar32)");
666 }
667 if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
668 errln("FAIL: containsSome(UChar32, UChar32)");
669 }
670 if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
671 errln("FAIL: containsNone(UChar32, UChar32)");
672 }
673 if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
674 errln("FAIL: containsSome(UChar32, UChar32)");
675 }
676
677 set.removeAll("liu");
678 exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
679 if (U_FAILURE(status)) { errln("FAIL"); return; }
680 if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
681
682 set.retainAll("star");
683 exp.applyPattern("[rst]", status);
684 if (U_FAILURE(status)) { errln("FAIL"); return; }
685 if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
686
687 set.retain((UChar32)0x73);
688 exp.applyPattern("[s]", status);
689 if (U_FAILURE(status)) { errln("FAIL"); return; }
690 if (set != exp) { errln("FAIL: retain('s')"); return; }
691
692 uint16_t buf[32];
693 int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status);
694 if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
695 if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
696 errln("FAIL: serialize");
697 return;
698 }
699}
700
701void UnicodeSetTest::TestIteration() {
702 UErrorCode ec = U_ZERO_ERROR;
703 int i = 0;
704 int outerLoop;
705
706 // 6 code points, 3 ranges, 2 strings, 8 total elements
707 // Iteration will access them in sorted order - a, b, c, y, z, U0001abcd, "str1", "str2"
708 UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
709 TEST_ASSERT_SUCCESS(ec);
710 UnicodeSetIterator it(set);
711
712 for (outerLoop=0; outerLoop<3; outerLoop++) {
713 // Run the test multiple times, to check that iterator.reset() is working.
714 for (i=0; i<10; i++) {
715 UBool nextv = it.next();
716 UBool isString = it.isString();
717 int32_t codePoint = it.getCodepoint();
718 //int32_t codePointEnd = it.getCodepointEnd();
719 UnicodeString s = it.getString();
720 switch (i) {
721 case 0:
722 TEST_ASSERT(nextv == TRUE);
723 TEST_ASSERT(isString == FALSE);
724 TEST_ASSERT(codePoint==0x61);
725 TEST_ASSERT(s == "a");
726 break;
727 case 1:
728 TEST_ASSERT(nextv == TRUE);
729 TEST_ASSERT(isString == FALSE);
730 TEST_ASSERT(codePoint==0x62);
731 TEST_ASSERT(s == "b");
732 break;
733 case 2:
734 TEST_ASSERT(nextv == TRUE);
735 TEST_ASSERT(isString == FALSE);
736 TEST_ASSERT(codePoint==0x63);
737 TEST_ASSERT(s == "c");
738 break;
739 case 3:
740 TEST_ASSERT(nextv == TRUE);
741 TEST_ASSERT(isString == FALSE);
742 TEST_ASSERT(codePoint==0x79);
743 TEST_ASSERT(s == "y");
744 break;
745 case 4:
746 TEST_ASSERT(nextv == TRUE);
747 TEST_ASSERT(isString == FALSE);
748 TEST_ASSERT(codePoint==0x7a);
749 TEST_ASSERT(s == "z");
750 break;
751 case 5:
752 TEST_ASSERT(nextv == TRUE);
753 TEST_ASSERT(isString == FALSE);
754 TEST_ASSERT(codePoint==0x1abcd);
755 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
756 break;
757 case 6:
758 TEST_ASSERT(nextv == TRUE);
759 TEST_ASSERT(isString == TRUE);
760 TEST_ASSERT(s == "str1");
761 break;
762 case 7:
763 TEST_ASSERT(nextv == TRUE);
764 TEST_ASSERT(isString == TRUE);
765 TEST_ASSERT(s == "str2");
766 break;
767 case 8:
768 TEST_ASSERT(nextv == FALSE);
769 break;
770 case 9:
771 TEST_ASSERT(nextv == FALSE);
772 break;
773 }
774 }
775 it.reset(); // prepare to run the iteration again.
776 }
777}
778
779
780
781
782void UnicodeSetTest::TestStrings() {
783 UErrorCode ec = U_ZERO_ERROR;
784
785 UnicodeSet* testList[] = {
786 UnicodeSet::createFromAll("abc"),
787 new UnicodeSet("[a-c]", ec),
788
789 &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
790 new UnicodeSet("[{ll}{ch}a-z]", ec),
791
792 UnicodeSet::createFrom("ab}c"),
793 new UnicodeSet("[{ab\\}c}]", ec),
794
795 &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
796 new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
797
798 NULL
799 };
800
801 if (U_FAILURE(ec)) {
802 errln("FAIL: couldn't construct test sets");
803 }
804
805 for (int32_t i = 0; testList[i] != NULL; i+=2) {
806 if (U_SUCCESS(ec)) {
807 UnicodeString pat0, pat1;
808 testList[i]->toPattern(pat0, TRUE);
809 testList[i+1]->toPattern(pat1, TRUE);
810 if (*testList[i] == *testList[i+1]) {
811 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
812 } else {
813 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
814 }
815 }
816 delete testList[i];
817 delete testList[i+1];
818 }
819}
820
821/**
822 * Test the [:Latin:] syntax.
823 */
824void UnicodeSetTest::TestScriptSet() {
825 expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
826
827 expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
828
829 /* Jitterbug 1423 */
830 expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
831
832}
833
834/**
835 * Test the [:Latin:] syntax.
836 */
837void UnicodeSetTest::TestPropertySet() {
838 static const char* const DATA[] = {
839 // Pattern, Chars IN, Chars NOT in
840
841 "[:Latin:]",
842 "aA",
843 "\\u0391\\u03B1",
844
845 "[\\p{Greek}]",
846 "\\u0391\\u03B1",
847 "aA",
848
849 "\\P{ GENERAL Category = upper case letter }",
850 "abc",
851 "ABC",
852
853 // Combining class: @since ICU 2.2
854 // Check both symbolic and numeric
855 "\\p{ccc=Nukta}",
856 "\\u0ABC",
857 "abc",
858
859 "\\p{Canonical Combining Class = 11}",
860 "\\u05B1",
861 "\\u05B2",
862
863 "[:c c c = iota subscript :]",
864 "\\u0345",
865 "xyz",
866
867 // Bidi class: @since ICU 2.2
868 "\\p{bidiclass=lefttoright}",
869 "abc",
870 "\\u0671\\u0672",
871
872 // Binary properties: @since ICU 2.2
873 "\\p{ideographic}",
874 "\\u4E0A",
875 "x",
876
877 "[:math=false:]",
878 "q)*(",
879 // weiv: )(and * were removed from math in Unicode 4.0.1
880 //"(*+)",
881 "+<>^",
882
883 // JB#1767 \N{}, \p{ASCII}
884 "[:Ascii:]",
885 "abc\\u0000\\u007F",
886 "\\u0080\\u4E00",
887
888 "[\\N{ latin small letter a }[:name= latin small letter z:]]",
889 "az",
890 "qrs",
891
892 // JB#2015
893 "[:any:]",
894 "a\\U0010FFFF",
895 "",
896
897 "[:nv=0.5:]",
898 "\\u00BD\\u0F2A",
899 "\\u00BC",
900
901 // JB#2653: Age
902 "[:Age=1.1:]",
903 "\\u03D6", // 1.1
904 "\\u03D8\\u03D9", // 3.2
905
906 "[:Age=3.1:]",
907 "\\u1800\\u3400\\U0002f800",
908 "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
909
910 // JB#2350: Case_Sensitive
911 "[:Case Sensitive:]",
912 "A\\u1FFC\\U00010410",
913 ";\\u00B4\\U00010500",
914
915 // JB#2832: C99-compatibility props
916 "[:blank:]",
917 " \\u0009",
918 "1-9A-Z",
919
920 "[:graph:]",
921 "19AZ",
922 " \\u0003\\u0007\\u0009\\u000A\\u000D",
923
924 "[:punct:]",
925 "!@#%&*()[]{}-_\\/;:,.?'\"",
926 "09azAZ",
927
928 "[:xdigit:]",
929 "09afAF",
930 "gG!",
931
932 // Regex compatibility test
933 "[-b]", // leading '-' is literal
934 "-b",
935 "ac",
936
937 "[^-b]", // leading '-' is literal
938 "ac",
939 "-b",
940
941 "[b-]", // trailing '-' is literal
942 "-b",
943 "ac",
944
945 "[^b-]", // trailing '-' is literal
946 "ac",
947 "-b",
948
949 "[a-b-]", // trailing '-' is literal
950 "ab-",
951 "c=",
952
953 "[[a-q]&[p-z]-]", // trailing '-' is literal
954 "pq-",
955 "or=",
956
957 "[\\s|\\)|:|$|\\>]", // from regex tests
958 "s|):$>",
959 "abc",
960
961 "[\\uDC00cd]", // JB#2906: isolated trail at start
962 "cd\\uDC00",
963 "ab\\uD800\\U00010000",
964
965 "[ab\\uD800]", // JB#2906: isolated trail at start
966 "ab\\uD800",
967 "cd\\uDC00\\U00010000",
968
969 "[ab\\uD800cd]", // JB#2906: isolated lead in middle
970 "abcd\\uD800",
971 "ef\\uDC00\\U00010000",
972
973 "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
974 "abcd\\uDC00",
975 "ef\\uD800\\U00010000",
976
977 "[:^lccc=0:]", // Lead canonical class
978 "\\u0300\\u0301",
979 "abcd\\u00c0\\u00c5",
980
981 "[:^tccc=0:]", // Trail canonical class
982 "\\u0300\\u0301\\u00c0\\u00c5",
983 "abcd",
984
985 "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
986 "\\u0300\\u0301\\u00c0\\u00c5",
987 "abcd",
988
989 "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
990 "",
991 "abcd\\u0300\\u0301\\u00c0\\u00c5",
992
993 "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
994 "\\u0F73\\u0F75\\u0F81",
995 "abcd\\u0300\\u0301\\u00c0\\u00c5",
996
997 "[:Assigned:]",
998 "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
999 "\\u0888\\uFDD3\\uFFFE\\U00050005"
1000 };
1001
1002 static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);
1003
1004 for (int32_t i=0; i<DATA_LEN; i+=3) {
1005 expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
1006 CharsToUnicodeString(DATA[i+2]));
1007 }
1008}
1009
1010/**
1011 * Test that Posix style character classes [:digit:], etc.
1012 * have the Unicode definitions from TR 18.
1013 */
1014void UnicodeSetTest::TestPosixClasses() {
1015 {
1016 UErrorCode status = U_ZERO_ERROR;
1017 UnicodeSet s1("[:alpha:]", status);
1018 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
1019 TEST_ASSERT_SUCCESS(status);
1020 TEST_ASSERT(s1==s2);
1021 }
1022 {
1023 UErrorCode status = U_ZERO_ERROR;
1024 UnicodeSet s1("[:lower:]", status);
1025 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
1026 TEST_ASSERT_SUCCESS(status);
1027 TEST_ASSERT(s1==s2);
1028 }
1029 {
1030 UErrorCode status = U_ZERO_ERROR;
1031 UnicodeSet s1("[:upper:]", status);
1032 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
1033 TEST_ASSERT_SUCCESS(status);
1034 TEST_ASSERT(s1==s2);
1035 }
1036 {
1037 UErrorCode status = U_ZERO_ERROR;
1038 UnicodeSet s1("[:punct:]", status);
1039 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
1040 TEST_ASSERT_SUCCESS(status);
1041 TEST_ASSERT(s1==s2);
1042 }
1043 {
1044 UErrorCode status = U_ZERO_ERROR;
1045 UnicodeSet s1("[:digit:]", status);
1046 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
1047 TEST_ASSERT_SUCCESS(status);
1048 TEST_ASSERT(s1==s2);
1049 }
1050 {
1051 UErrorCode status = U_ZERO_ERROR;
1052 UnicodeSet s1("[:xdigit:]", status);
1053 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
1054 TEST_ASSERT_SUCCESS(status);
1055 TEST_ASSERT(s1==s2);
1056 }
1057 {
1058 UErrorCode status = U_ZERO_ERROR;
1059 UnicodeSet s1("[:alnum:]", status);
1060 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
1061 TEST_ASSERT_SUCCESS(status);
1062 TEST_ASSERT(s1==s2);
1063 }
1064 {
1065 UErrorCode status = U_ZERO_ERROR;
1066 UnicodeSet s1("[:space:]", status);
1067 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
1068 TEST_ASSERT_SUCCESS(status);
1069 TEST_ASSERT(s1==s2);
1070 }
1071 {
1072 UErrorCode status = U_ZERO_ERROR;
1073 UnicodeSet s1("[:blank:]", status);
1074 TEST_ASSERT_SUCCESS(status);
1075 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
1076 status);
1077 TEST_ASSERT_SUCCESS(status);
1078 TEST_ASSERT(s1==s2);
1079 }
1080 {
1081 UErrorCode status = U_ZERO_ERROR;
1082 UnicodeSet s1("[:cntrl:]", status);
1083 TEST_ASSERT_SUCCESS(status);
1084 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
1085 TEST_ASSERT_SUCCESS(status);
1086 TEST_ASSERT(s1==s2);
1087 }
1088 {
1089 UErrorCode status = U_ZERO_ERROR;
1090 UnicodeSet s1("[:graph:]", status);
1091 TEST_ASSERT_SUCCESS(status);
1092 UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
1093 TEST_ASSERT_SUCCESS(status);
1094 TEST_ASSERT(s1==s2);
1095 }
1096 {
1097 UErrorCode status = U_ZERO_ERROR;
1098 UnicodeSet s1("[:print:]", status);
1099 TEST_ASSERT_SUCCESS(status);
1100 UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
1101 TEST_ASSERT_SUCCESS(status);
1102 TEST_ASSERT(s1==s2);
1103 }
1104}
1105/**
1106 * Test cloning of UnicodeSet. For C++, we test the copy constructor.
1107 */
1108void UnicodeSetTest::TestClone() {
1109 UErrorCode ec = U_ZERO_ERROR;
1110 UnicodeSet s("[abcxyz]", ec);
1111 UnicodeSet t(s);
1112 expectContainment(t, "abc", "def");
1113}
1114
1115/**
1116 * Test the indexOf() and charAt() methods.
1117 */
1118void UnicodeSetTest::TestIndexOf() {
1119 UErrorCode ec = U_ZERO_ERROR;
1120 UnicodeSet set("[a-cx-y3578]", ec);
1121 if (U_FAILURE(ec)) {
1122 errln("FAIL: UnicodeSet constructor");
1123 return;
1124 }
1125 for (int32_t i=0; i<set.size(); ++i) {
1126 UChar32 c = set.charAt(i);
1127 if (set.indexOf(c) != i) {
1128 errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1129 i, c, set.indexOf(c));
1130 }
1131 }
1132 UChar32 c = set.charAt(set.size());
1133 if (c != -1) {
1134 errln("FAIL: charAt(<out of range>) = %X", c);
1135 }
1136 int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
1137 if (j != -1) {
1138 errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1139 }
1140}
1141
1142/**
1143 * Test closure API.
1144 */
1145void UnicodeSetTest::TestCloseOver() {
1146 UErrorCode ec = U_ZERO_ERROR;
1147
1148 char CASE[] = {(char)USET_CASE_INSENSITIVE};
1149 char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1150 const char* DATA[] = {
1151 // selector, input, output
1152 CASE,
1153 "[aq\\u00DF{Bc}{bC}{Fi}]",
1154 "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
1155
1156 CASE,
1157 "[\\u01F1]", // 'DZ'
1158 "[\\u01F1\\u01F2\\u01F3]",
1159
1160 CASE,
1161 "[\\u1FB4]",
1162 "[\\u1FB4{\\u03AC\\u03B9}]",
1163
1164 CASE,
1165 "[{F\\uFB01}]",
1166 "[\\uFB03{ffi}]",
1167
1168 CASE, // make sure binary search finds limits
1169 "[a\\uFF3A]",
1170 "[aA\\uFF3A\\uFF5A]",
1171
1172 CASE,
1173 "[a-z]","[A-Za-z\\u017F\\u212A]",
1174 CASE,
1175 "[abc]","[A-Ca-c]",
1176 CASE,
1177 "[ABC]","[A-Ca-c]",
1178
1179 CASE, "[i]", "[iI]",
1180
1181 CASE, "[\\u0130]", "[\\u0130{i\\u0307}]", // dotted I
1182 CASE, "[{i\\u0307}]", "[\\u0130{i\\u0307}]", // i with dot
1183
1184 CASE, "[\\u0131]", "[\\u0131]", // dotless i
1185
1186 CASE, "[\\u0390]", "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1187
1188 CASE, "[\\u03c2]", "[\\u03a3\\u03c2\\u03c3]", // sigmas
1189
1190 CASE, "[\\u03f2]", "[\\u03f2\\u03f9]", // lunate sigmas
1191
1192 CASE, "[\\u03f7]", "[\\u03f7\\u03f8]",
1193
1194 CASE, "[\\u1fe3]", "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1195
1196 CASE, "[\\ufb05]", "[\\ufb05\\ufb06{st}]",
1197 CASE, "[{st}]", "[\\ufb05\\ufb06{st}]",
1198
1199 CASE, "[\\U0001044F]", "[\\U00010427\\U0001044F]",
1200
1201 CASE, "[{a\\u02BE}]", "[\\u1E9A{a\\u02BE}]", // first in sorted table
1202
1203 CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1204
1205 CASE_MAPPINGS,
1206 "[aq\\u00DF{Bc}{bC}{Fi}]",
1207 "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1208
1209 CASE_MAPPINGS,
1210 "[\\u01F1]", // 'DZ'
1211 "[\\u01F1\\u01F2\\u01F3]",
1212
1213 CASE_MAPPINGS,
1214 "[a-z]",
1215 "[A-Za-z]",
1216
1217 NULL
1218 };
1219
1220 UnicodeSet s;
1221 UnicodeSet t;
1222 UnicodeString buf;
1223 for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1224 int32_t selector = DATA[i][0];
1225 UnicodeString pat(DATA[i+1], -1, US_INV);
1226 UnicodeString exp(DATA[i+2], -1, US_INV);
1227 s.applyPattern(pat, ec);
1228 s.closeOver(selector);
1229 t.applyPattern(exp, ec);
1230 if (U_FAILURE(ec)) {
1231 errln("FAIL: applyPattern failed");
1232 continue;
1233 }
1234 if (s == t) {
1235 logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1236 } else {
1237 errln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1238 s.toPattern(buf, TRUE) + ", expected " + exp);
1239 }
1240 }
1241
1242#if 0
1243 /*
1244 * Unused test code.
1245 * This was used to compare the old implementation (using USET_CASE)
1246 * with the new one (using 0x100 temporarily)
1247 * while transitioning from hardcoded case closure tables in uniset.cpp
1248 * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1249 * and using ucase.c functions for closure.
1250 * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1251 *
1252 * Note: The old and new implementation never fully matched because
1253 * the old implementation turned out to not map U+0130 and U+0131 correctly
1254 * (dotted I and dotless i) and because the old implementation's data tables
1255 * were outdated compared to Unicode 4.0.1 at the time of the change to the
1256 * new implementation. (So sigmas and some other characters were not handled
1257 * according to the newer Unicode version.)
1258 */
1259 UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1260 UnicodeSetIterator si(sens);
1261 UnicodeString str, buf2;
1262 const UnicodeString *pStr;
1263 UChar32 c;
1264 while(si.next()) {
1265 if(!si.isString()) {
1266 c=si.getCodepoint();
1267 s.clear();
1268 s.add(c);
1269
1270 str.setTo(c);
1271 str.foldCase();
1272 sens2.add(str);
1273
1274 t=s;
1275 s.closeOver(USET_CASE);
1276 t.closeOver(0x100);
1277 if(s!=t) {
1278 errln("FAIL: closeOver(U+%04x) differs: ", c);
1279 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1280 }
1281 }
1282 }
1283 // remove all code points
1284 // should contain all full case folding mapping strings
1285 sens2.remove(0, 0x10ffff);
1286 si.reset(sens2);
1287 while(si.next()) {
1288 if(si.isString()) {
1289 pStr=&si.getString();
1290 s.clear();
1291 s.add(*pStr);
1292 t=s2=s;
1293 s.closeOver(USET_CASE);
1294 t.closeOver(0x100);
1295 if(s!=t) {
1296 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1297 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1298 }
1299 }
1300 }
1301#endif
1302
1303 // Test the pattern API
1304 s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1305 if (U_FAILURE(ec)) {
1306 errln("FAIL: applyPattern failed");
1307 } else {
1308 expectContainment(s, "abcABC", "defDEF");
1309 }
1310 UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1311 if (U_FAILURE(ec)) {
1312 errln("FAIL: constructor failed");
1313 } else {
1314 expectContainment(v, "defDEF", "abcABC");
1315 }
1316 UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1317 if (U_FAILURE(ec)) {
1318 errln("FAIL: construct w/case mappings failed");
1319 } else {
1320 expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1321 }
1322}
1323
1324void UnicodeSetTest::TestEscapePattern() {
1325 const char pattern[] =
1326 "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1327 const char exp[] =
1328 "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1329 // We test this with two passes; in the second pass we
1330 // pre-unescape the pattern. Since U+200E is rule whitespace,
1331 // this fails -- which is what we expect.
1332 for (int32_t pass=1; pass<=2; ++pass) {
1333 UErrorCode ec = U_ZERO_ERROR;
1334 UnicodeString pat(pattern, -1, US_INV);
1335 if (pass==2) {
1336 pat = pat.unescape();
1337 }
1338 // Pattern is only good for pass 1
1339 UBool isPatternValid = (pass==1);
1340
1341 UnicodeSet set(pat, ec);
1342 if (U_SUCCESS(ec) != isPatternValid){
1343 errln((UnicodeString)"FAIL: applyPattern(" +
1344 escape(pat) + ") => " +
1345 u_errorName(ec));
1346 continue;
1347 }
1348 if (U_FAILURE(ec)) {
1349 continue;
1350 }
1351 if (set.contains((UChar)0x0644)){
1352 errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1353 }
1354
1355 UnicodeString newpat;
1356 set.toPattern(newpat, TRUE);
1357 if (newpat == UnicodeString(exp, -1, US_INV)) {
1358 logln(escape(pat) + " => " + newpat);
1359 } else {
1360 errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1361 }
1362
1363 for (int32_t i=0; i<set.getRangeCount(); ++i) {
1364 UnicodeString str("Range ");
1365 str.append((UChar)(0x30 + i))
1366 .append(": ")
1367 .append((UChar32)set.getRangeStart(i))
1368 .append(" - ")
1369 .append((UChar32)set.getRangeEnd(i));
1370 str = str + " (" + set.getRangeStart(i) + " - " +
1371 set.getRangeEnd(i) + ")";
1372 if (set.getRangeStart(i) < 0) {
1373 errln((UnicodeString)"FAIL: " + escape(str));
1374 } else {
1375 logln(escape(str));
1376 }
1377 }
1378 }
1379}
1380
1381void UnicodeSetTest::expectRange(const UnicodeString& label,
1382 const UnicodeSet& set,
1383 UChar32 start, UChar32 end) {
1384 UnicodeSet exp(start, end);
1385 UnicodeString pat;
1386 if (set == exp) {
1387 logln(label + " => " + set.toPattern(pat, TRUE));
1388 } else {
1389 UnicodeString xpat;
1390 errln((UnicodeString)"FAIL: " + label + " => " +
1391 set.toPattern(pat, TRUE) +
1392 ", expected " + exp.toPattern(xpat, TRUE));
1393 }
1394}
1395
1396void UnicodeSetTest::TestInvalidCodePoint() {
1397
1398 const UChar32 DATA[] = {
1399 // Test range Expected range
1400 0, 0x10FFFF, 0, 0x10FFFF,
1401 (UChar32)-1, 8, 0, 8,
1402 8, 0x110000, 8, 0x10FFFF
1403 };
1404 const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]);
1405
1406 UnicodeString pat;
1407 int32_t i;
1408
1409 for (i=0; i<DATA_LENGTH; i+=4) {
1410 UChar32 start = DATA[i];
1411 UChar32 end = DATA[i+1];
1412 UChar32 xstart = DATA[i+2];
1413 UChar32 xend = DATA[i+3];
1414
1415 // Try various API using the test code points
1416
1417 UnicodeSet set(start, end);
1418 expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1419 set, xstart, xend);
1420
1421 set.clear();
1422 set.set(start, end);
1423 expectRange((UnicodeString)"set(" + start + "," + end + ")",
1424 set, xstart, xend);
1425
1426 UBool b = set.contains(start);
1427 b = set.contains(start, end);
1428 b = set.containsNone(start, end);
1429 b = set.containsSome(start, end);
1430
1431 /*int32_t index = set.indexOf(start);*/
1432
1433 set.clear();
1434 set.add(start);
1435 set.add(start, end);
1436 expectRange((UnicodeString)"add(" + start + "," + end + ")",
1437 set, xstart, xend);
1438
1439 set.set(0, 0x10FFFF);
1440 set.retain(start, end);
1441 expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1442 set, xstart, xend);
1443 set.retain(start);
1444
1445 set.set(0, 0x10FFFF);
1446 set.remove(start);
1447 set.remove(start, end);
1448 set.complement();
1449 expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1450 set, xstart, xend);
1451
1452 set.set(0, 0x10FFFF);
1453 set.complement(start, end);
1454 set.complement();
1455 expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1456 set, xstart, xend);
1457 set.complement(start);
1458 }
1459
1460 const UChar32 DATA2[] = {
1461 0,
1462 0x10FFFF,
1463 (UChar32)-1,
1464 0x110000
1465 };
1466 const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]);
1467
1468 for (i=0; i<DATA2_LENGTH; ++i) {
1469 UChar32 c = DATA2[i], end = 0x10FFFF;
1470 UBool valid = (c >= 0 && c <= 0x10FFFF);
1471
1472 UnicodeSet set(0, 0x10FFFF);
1473
1474 // For single-codepoint contains, invalid codepoints are NOT contained
1475 UBool b = set.contains(c);
1476 if (b == valid) {
1477 logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1478 ") = " + b);
1479 } else {
1480 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1481 ") = " + b);
1482 }
1483
1484 // For codepoint range contains, containsNone, and containsSome,
1485 // invalid or empty (start > end) ranges have UNDEFINED behavior.
1486 b = set.contains(c, end);
1487 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1488 "," + end + ") = " + b);
1489
1490 b = set.containsNone(c, end);
1491 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1492 "," + end + ") = " + b);
1493
1494 b = set.containsSome(c, end);
1495 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1496 "," + end + ") = " + b);
1497
1498 int32_t index = set.indexOf(c);
1499 if ((index >= 0) == valid) {
1500 logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1501 ") = " + index);
1502 } else {
1503 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1504 ") = " + index);
1505 }
1506 }
1507}
1508
1509// Used by TestSymbolTable
1510class TokenSymbolTable : public SymbolTable {
1511public:
1512 Hashtable contents;
1513
1514 TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1515 contents.setValueDeleter(uhash_deleteUnicodeString);
1516 }
1517
1518 ~TokenSymbolTable() {}
1519
1520 /**
1521 * (Non-SymbolTable API) Add the given variable and value to
1522 * the table. Variable should NOT contain leading '$'.
1523 */
1524 void add(const UnicodeString& var, const UnicodeString& value,
1525 UErrorCode& ec) {
1526 if (U_SUCCESS(ec)) {
1527 contents.put(var, new UnicodeString(value), ec);
1528 }
1529 }
1530
1531 /**
1532 * SymbolTable API
1533 */
1534 virtual const UnicodeString* lookup(const UnicodeString& s) const {
1535 return (const UnicodeString*) contents.get(s);
1536 }
1537
1538 /**
1539 * SymbolTable API
1540 */
1541 virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1542 return NULL;
1543 }
1544
1545 /**
1546 * SymbolTable API
1547 */
1548 virtual UnicodeString parseReference(const UnicodeString& text,
1549 ParsePosition& pos, int32_t limit) const {
1550 int32_t start = pos.getIndex();
1551 int32_t i = start;
1552 UnicodeString result;
1553 while (i < limit) {
1554 UChar c = text.charAt(i);
1555 if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1556 break;
1557 }
1558 ++i;
1559 }
1560 if (i == start) { // No valid name chars
1561 return result; // Indicate failure with empty string
1562 }
1563 pos.setIndex(i);
1564 text.extractBetween(start, i, result);
1565 return result;
1566 }
1567};
1568
1569void UnicodeSetTest::TestSymbolTable() {
1570 // Multiple test cases can be set up here. Each test case
1571 // is terminated by null:
1572 // var, value, var, value,..., input pat., exp. output pat., null
1573 const char* DATA[] = {
1574 "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1575 "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1576 "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1577 NULL
1578 };
1579
1580 for (int32_t i=0; DATA[i]!=NULL; ++i) {
1581 UErrorCode ec = U_ZERO_ERROR;
1582 TokenSymbolTable sym(ec);
1583 if (U_FAILURE(ec)) {
1584 errln("FAIL: couldn't construct TokenSymbolTable");
1585 continue;
1586 }
1587
1588 // Set up variables
1589 while (DATA[i+2] != NULL) {
1590 sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
1591 if (U_FAILURE(ec)) {
1592 errln("FAIL: couldn't add to TokenSymbolTable");
1593 continue;
1594 }
1595 i += 2;
1596 }
1597
1598 // Input pattern and expected output pattern
1599 UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
1600 i += 2;
1601
1602 ParsePosition pos(0);
1603 UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1604 if (U_FAILURE(ec)) {
1605 errln("FAIL: couldn't construct UnicodeSet");
1606 continue;
1607 }
1608
1609 // results
1610 if (pos.getIndex() != inpat.length()) {
1611 errln((UnicodeString)"Failed to read to end of string \""
1612 + inpat + "\": read to "
1613 + pos.getIndex() + ", length is "
1614 + inpat.length());
1615 }
1616
1617 UnicodeSet us2(exppat, ec);
1618 if (U_FAILURE(ec)) {
1619 errln("FAIL: couldn't construct expected UnicodeSet");
1620 continue;
1621 }
1622
1623 UnicodeString a, b;
1624 if (us != us2) {
1625 errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1626 ", expected " + us2.toPattern(b, TRUE));
1627 } else {
1628 logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1629 }
1630 }
1631}
1632
1633void UnicodeSetTest::TestSurrogate() {
1634 const char* DATA[] = {
1635 // These should all behave identically
1636 "[abc\\uD800\\uDC00]",
1637 // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1638 "[abc\\U00010000]",
1639 0
1640 };
1641 for (int i=0; DATA[i] != 0; ++i) {
1642 UErrorCode ec = U_ZERO_ERROR;
1643 logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
1644 UnicodeSet set(UnicodeString(DATA[i], -1, US_INV), ec);
1645 if (U_FAILURE(ec)) {
1646 errln("FAIL: UnicodeSet constructor");
1647 continue;
1648 }
1649 expectContainment(set,
1650 CharsToUnicodeString("abc\\U00010000"),
1651 CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1652 if (set.size() != 4) {
1653 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
1654 set.size() + ", expected 4");
1655 }
1656 }
1657}
1658
1659void UnicodeSetTest::TestExhaustive() {
1660 // exhaustive tests. Simulate UnicodeSets with integers.
1661 // That gives us very solid tests (except for large memory tests).
1662
1663 int32_t limit = 128;
1664
1665 UnicodeSet x, y, z, aa;
1666
1667 for (int32_t i = 0; i < limit; ++i) {
1668 bitsToSet(i, x);
1669 logln((UnicodeString)"Testing " + i + ", " + x);
1670 _testComplement(i, x, y);
1671
1672 // AS LONG AS WE ARE HERE, check roundtrip
1673 checkRoundTrip(bitsToSet(i, aa));
1674
1675 for (int32_t j = 0; j < limit; ++j) {
1676 _testAdd(i,j, x,y,z);
1677 _testXor(i,j, x,y,z);
1678 _testRetain(i,j, x,y,z);
1679 _testRemove(i,j, x,y,z);
1680 }
1681 }
1682}
1683
1684void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1685 bitsToSet(a, x);
1686 z = x;
1687 z.complement();
1688 int32_t c = setToBits(z);
1689 if (c != (~a)) {
1690 errln((UnicodeString)"FAILED: add: ~" + x + " != " + z);
1691 errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1692 }
1693 checkCanonicalRep(z, (UnicodeString)"complement " + a);
1694}
1695
1696void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1697 bitsToSet(a, x);
1698 bitsToSet(b, y);
1699 z = x;
1700 z.addAll(y);
1701 int32_t c = setToBits(z);
1702 if (c != (a | b)) {
1703 errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1704 errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1705 }
1706 checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1707}
1708
1709void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1710 bitsToSet(a, x);
1711 bitsToSet(b, y);
1712 z = x;
1713 z.retainAll(y);
1714 int32_t c = setToBits(z);
1715 if (c != (a & b)) {
1716 errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1717 errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1718 }
1719 checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1720}
1721
1722void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1723 bitsToSet(a, x);
1724 bitsToSet(b, y);
1725 z = x;
1726 z.removeAll(y);
1727 int32_t c = setToBits(z);
1728 if (c != (a &~ b)) {
1729 errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1730 errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1731 }
1732 checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1733}
1734
1735void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1736 bitsToSet(a, x);
1737 bitsToSet(b, y);
1738 z = x;
1739 z.complementAll(y);
1740 int32_t c = setToBits(z);
1741 if (c != (a ^ b)) {
1742 errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1743 errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1744 }
1745 checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1746}
1747
1748/**
1749 * Check that ranges are monotonically increasing and non-
1750 * overlapping.
1751 */
1752void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1753 int32_t n = set.getRangeCount();
1754 if (n < 0) {
1755 errln((UnicodeString)"FAIL result of " + msg +
1756 ": range count should be >= 0 but is " +
1757 n /*+ " for " + set.toPattern())*/);
1758 return;
1759 }
1760 UChar32 last = 0;
1761 for (int32_t i=0; i<n; ++i) {
1762 UChar32 start = set.getRangeStart(i);
1763 UChar32 end = set.getRangeEnd(i);
1764 if (start > end) {
1765 errln((UnicodeString)"FAIL result of " + msg +
1766 ": range " + (i+1) +
1767 " start > end: " + (int)start + ", " + (int)end +
1768 " for " + set);
1769 }
1770 if (i > 0 && start <= last) {
1771 errln((UnicodeString)"FAIL result of " + msg +
1772 ": range " + (i+1) +
1773 " overlaps previous range: " + (int)start + ", " + (int)end +
1774 " for " + set);
1775 }
1776 last = end;
1777 }
1778}
1779
1780/**
1781 * Convert a bitmask to a UnicodeSet.
1782 */
1783UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1784 result.clear();
1785 for (UChar32 i = 0; i < 32; ++i) {
1786 if ((a & (1<<i)) != 0) {
1787 result.add(i);
1788 }
1789 }
1790 return result;
1791}
1792
1793/**
1794 * Convert a UnicodeSet to a bitmask. Only the characters
1795 * U+0000 to U+0020 are represented in the bitmask.
1796 */
1797int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1798 int32_t result = 0;
1799 for (int32_t i = 0; i < 32; ++i) {
1800 if (x.contains((UChar32)i)) {
1801 result |= (1<<i);
1802 }
1803 }
1804 return result;
1805}
1806
1807/**
1808 * Return the representation of an inversion list based UnicodeSet
1809 * as a pairs list. Ranges are listed in ascending Unicode order.
1810 * For example, the set [a-zA-M3] is represented as "33AMaz".
1811 */
1812UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1813 UnicodeString pairs;
1814 for (int32_t i=0; i<set.getRangeCount(); ++i) {
1815 UChar32 start = set.getRangeStart(i);
1816 UChar32 end = set.getRangeEnd(i);
1817 if (end > 0xFFFF) {
1818 end = 0xFFFF;
1819 i = set.getRangeCount(); // Should be unnecessary
1820 }
1821 pairs.append((UChar)start).append((UChar)end);
1822 }
1823 return pairs;
1824}
1825
1826/**
1827 * Basic consistency check for a few items.
1828 * That the iterator works, and that we can create a pattern and
1829 * get the same thing back
1830 */
1831void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1832 UErrorCode ec = U_ZERO_ERROR;
1833
1834 UnicodeSet t(s);
1835 checkEqual(s, t, "copy ct");
1836
1837 t = s;
1838 checkEqual(s, t, "operator=");
1839
1840 copyWithIterator(t, s, FALSE);
1841 checkEqual(s, t, "iterator roundtrip");
1842
1843 copyWithIterator(t, s, TRUE); // try range
1844 checkEqual(s, t, "iterator roundtrip");
1845
1846 UnicodeString pat; s.toPattern(pat, FALSE);
1847 t.applyPattern(pat, ec);
1848 if (U_FAILURE(ec)) {
1849 errln("FAIL: applyPattern");
1850 return;
1851 } else {
1852 checkEqual(s, t, "toPattern(false)");
1853 }
1854
1855 s.toPattern(pat, TRUE);
1856 t.applyPattern(pat, ec);
1857 if (U_FAILURE(ec)) {
1858 errln("FAIL: applyPattern");
1859 return;
1860 } else {
1861 checkEqual(s, t, "toPattern(true)");
1862 }
1863}
1864
1865void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
1866 t.clear();
1867 UnicodeSetIterator it(s);
1868 if (withRange) {
1869 while (it.nextRange()) {
1870 if (it.isString()) {
1871 t.add(it.getString());
1872 } else {
1873 t.add(it.getCodepoint(), it.getCodepointEnd());
1874 }
1875 }
1876 } else {
1877 while (it.next()) {
1878 if (it.isString()) {
1879 t.add(it.getString());
1880 } else {
1881 t.add(it.getCodepoint());
1882 }
1883 }
1884 }
1885}
1886
1887UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
1888 UnicodeString source; s.toPattern(source, TRUE);
1889 UnicodeString result; t.toPattern(result, TRUE);
1890 if (s != t) {
1891 errln((UnicodeString)"FAIL: " + message
1892 + "; source = " + source
1893 + "; result = " + result
1894 );
1895 return FALSE;
1896 } else {
1897 logln((UnicodeString)"Ok: " + message
1898 + "; source = " + source
1899 + "; result = " + result
1900 );
1901 }
1902 return TRUE;
1903}
1904
1905void
1906UnicodeSetTest::expectContainment(const UnicodeString& pat,
1907 const UnicodeString& charsIn,
1908 const UnicodeString& charsOut) {
1909 UErrorCode ec = U_ZERO_ERROR;
1910 UnicodeSet set(pat, ec);
1911 if (U_FAILURE(ec)) {
1912 errln((UnicodeString)"FAIL: pattern \"" +
1913 pat + "\" => " + u_errorName(ec));
1914 return;
1915 }
1916 expectContainment(set, pat, charsIn, charsOut);
1917}
1918
1919void
1920UnicodeSetTest::expectContainment(const UnicodeSet& set,
1921 const UnicodeString& charsIn,
1922 const UnicodeString& charsOut) {
1923 UnicodeString pat;
1924 set.toPattern(pat);
1925 expectContainment(set, pat, charsIn, charsOut);
1926}
1927
1928void
1929UnicodeSetTest::expectContainment(const UnicodeSet& set,
1930 const UnicodeString& setName,
1931 const UnicodeString& charsIn,
1932 const UnicodeString& charsOut) {
1933 UnicodeString bad;
1934 UChar32 c;
1935 int32_t i;
1936
1937 for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
1938 c = charsIn.char32At(i);
1939 if (!set.contains(c)) {
1940 bad.append(c);
1941 }
1942 }
1943 if (bad.length() > 0) {
1944 errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
1945 ", expected containment of " + prettify(charsIn));
1946 } else {
1947 logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
1948 }
1949
1950 bad.truncate(0);
1951 for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
1952 c = charsOut.char32At(i);
1953 if (set.contains(c)) {
1954 bad.append(c);
1955 }
1956 }
1957 if (bad.length() > 0) {
1958 errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
1959 ", expected non-containment of " + prettify(charsOut));
1960 } else {
1961 logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
1962 }
1963}
1964
1965void
1966UnicodeSetTest::expectPattern(UnicodeSet& set,
1967 const UnicodeString& pattern,
1968 const UnicodeString& expectedPairs){
1969 UErrorCode status = U_ZERO_ERROR;
1970 set.applyPattern(pattern, status);
1971 if (U_FAILURE(status)) {
1972 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
1973 "\") failed");
1974 return;
1975 } else {
1976 if (getPairs(set) != expectedPairs ) {
1977 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
1978 "\") => pairs \"" +
1979 escape(getPairs(set)) + "\", expected \"" +
1980 escape(expectedPairs) + "\"");
1981 } else {
1982 logln(UnicodeString("Ok: applyPattern(\"") + pattern +
1983 "\") => pairs \"" +
1984 escape(getPairs(set)) + "\"");
1985 }
1986 }
1987 // the result of calling set.toPattern(), which is the string representation of
1988 // this set(set), is passed to a UnicodeSet constructor, and tested that it
1989 // will produce another set that is equal to this one.
1990 UnicodeString temppattern;
1991 set.toPattern(temppattern);
1992 UnicodeSet *tempset=new UnicodeSet(temppattern, status);
1993 if (U_FAILURE(status)) {
1994 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
1995 return;
1996 }
1997 if(*tempset != set || getPairs(*tempset) != getPairs(set)){
1998 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
1999 escape(getPairs(set)) + "\""));
2000 } else{
2001 logln(UnicodeString("Ok: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2002 }
2003
2004 delete tempset;
2005
2006}
2007
2008void
2009UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2010 if (getPairs(set) != expectedPairs) {
2011 errln(UnicodeString("FAIL: Expected pair list \"") +
2012 escape(expectedPairs) + "\", got \"" +
2013 escape(getPairs(set)) + "\"");
2014 }
2015}
2016
2017void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2018 const UnicodeString& expPat,
2019 const char** expStrings) {
2020 UnicodeString pat;
2021 set.toPattern(pat, TRUE);
2022 if (pat == expPat) {
2023 logln((UnicodeString)"Ok: toPattern() => \"" + pat + "\"");
2024 } else {
2025 errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2026 return;
2027 }
2028 if (expStrings == NULL) {
2029 return;
2030 }
2031 UBool in = TRUE;
2032 for (int32_t i=0; expStrings[i] != NULL; ++i) {
2033 if (expStrings[i] == NOT) { // sic; pointer comparison
2034 in = FALSE;
2035 continue;
2036 }
2037 UnicodeString s = CharsToUnicodeString(expStrings[i]);
2038 UBool contained = set.contains(s);
2039 if (contained == in) {
2040 logln((UnicodeString)"Ok: " + expPat +
2041 (contained ? " contains {" : " does not contain {") +
2042 escape(expStrings[i]) + "}");
2043 } else {
2044 errln((UnicodeString)"FAIL: " + expPat +
2045 (contained ? " contains {" : " does not contain {") +
2046 escape(expStrings[i]) + "}");
2047 }
2048 }
2049}
2050
2051static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
2052
2053void
2054UnicodeSetTest::doAssert(UBool condition, const char *message)
2055{
2056 if (!condition) {
2057 errln(UnicodeString("ERROR : ") + message);
2058 }
2059}
2060
2061UnicodeString
2062UnicodeSetTest::escape(const UnicodeString& s) {
2063 UnicodeString buf;
2064 for (int32_t i=0; i<s.length(); )
2065 {
2066 UChar32 c = s.char32At(i);
2067 if (0x0020 <= c && c <= 0x007F) {
2068 buf += c;
2069 } else {
2070 if (c <= 0xFFFF) {
2071 buf += (UChar)0x5c; buf += (UChar)0x75;
2072 } else {
2073 buf += (UChar)0x5c; buf += (UChar)0x55;
2074 buf += toHexString((c & 0xF0000000) >> 28);
2075 buf += toHexString((c & 0x0F000000) >> 24);
2076 buf += toHexString((c & 0x00F00000) >> 20);
2077 buf += toHexString((c & 0x000F0000) >> 16);
2078 }
2079 buf += toHexString((c & 0xF000) >> 12);
2080 buf += toHexString((c & 0x0F00) >> 8);
2081 buf += toHexString((c & 0x00F0) >> 4);
2082 buf += toHexString(c & 0x000F);
2083 }
2084 i += U16_LENGTH(c);
2085 }
2086 return buf;
2087}
2088
2089void UnicodeSetTest::TestFreezable() {
2090 UErrorCode errorCode=U_ZERO_ERROR;
2091 UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
2092 UnicodeSet idSet(idPattern, errorCode);
2093 if(U_FAILURE(errorCode)) {
2094 errln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
2095 return;
2096 }
2097
2098 UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
2099 UnicodeSet wsSet(wsPattern, errorCode);
2100 if(U_FAILURE(errorCode)) {
2101 errln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
2102 return;
2103 }
2104
2105 idSet.add(idPattern);
2106 UnicodeSet frozen(idSet);
2107 frozen.freeze();
2108
2109 if(idSet.isFrozen() || !frozen.isFrozen()) {
2110 errln("FAIL: isFrozen() is wrong");
2111 }
2112 if(frozen!=idSet || !(frozen==idSet)) {
2113 errln("FAIL: a copy-constructed frozen set differs from its original");
2114 }
2115
2116 frozen=wsSet;
2117 if(frozen!=idSet || !(frozen==idSet)) {
2118 errln("FAIL: a frozen set was modified by operator=");
2119 }
2120
2121 UnicodeSet frozen2(frozen);
2122 if(frozen2!=frozen || frozen2!=idSet) {
2123 errln("FAIL: a copied frozen set differs from its frozen original");
2124 }
2125 if(!frozen2.isFrozen()) {
2126 errln("FAIL: copy-constructing a frozen set results in a thawed one");
2127 }
2128 UnicodeSet frozen3(5, 55); // Set to some values to really test assignment below, not copy construction.
2129 if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
2130 errln("FAIL: UnicodeSet(5, 55) failed");
2131 }
2132 frozen3=frozen;
2133 if(!frozen3.isFrozen()) {
2134 errln("FAIL: copying a frozen set results in a thawed one");
2135 }
2136
2137 UnicodeSet *cloned=(UnicodeSet *)frozen.clone();
2138 if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
2139 errln("FAIL: clone() failed");
2140 }
2141 cloned->add(0xd802, 0xd805);
2142 if(cloned->containsSome(0xd802, 0xd805)) {
2143 errln("FAIL: unable to modify clone");
2144 }
2145 delete cloned;
2146
2147 UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed();
2148 if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
2149 errln("FAIL: cloneAsThawed() failed");
2150 }
2151 thawed->add(0xd802, 0xd805);
2152 if(!thawed->contains(0xd802, 0xd805)) {
2153 errln("FAIL: unable to modify thawed clone");
2154 }
2155 delete thawed;
2156
2157 frozen.set(5, 55);
2158 if(frozen!=idSet || !(frozen==idSet)) {
2159 errln("FAIL: UnicodeSet::set() modified a frozen set");
2160 }
2161
2162 frozen.clear();
2163 if(frozen!=idSet || !(frozen==idSet)) {
2164 errln("FAIL: UnicodeSet::clear() modified a frozen set");
2165 }
2166
2167 frozen.closeOver(USET_CASE_INSENSITIVE);
2168 if(frozen!=idSet || !(frozen==idSet)) {
2169 errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
2170 }
2171
2172 frozen.compact();
2173 if(frozen!=idSet || !(frozen==idSet)) {
2174 errln("FAIL: UnicodeSet::compact() modified a frozen set");
2175 }
2176
2177 ParsePosition pos;
2178 frozen.
2179 applyPattern(wsPattern, errorCode).
2180 applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
2181 applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
2182 applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
2183 applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
2184 if(frozen!=idSet || !(frozen==idSet)) {
2185 errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
2186 }
2187
2188 frozen.
2189 add(0xd800).
2190 add(0xd802, 0xd805).
2191 add(wsPattern).
2192 addAll(idPattern).
2193 addAll(wsSet);
2194 if(frozen!=idSet || !(frozen==idSet)) {
2195 errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
2196 }
2197
2198 frozen.
2199 retain(0x62).
2200 retain(0x64, 0x69).
2201 retainAll(wsPattern).
2202 retainAll(wsSet);
2203 if(frozen!=idSet || !(frozen==idSet)) {
2204 errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
2205 }
2206
2207 frozen.
2208 remove(0x62).
2209 remove(0x64, 0x69).
2210 remove(idPattern).
2211 removeAll(idPattern).
2212 removeAll(idSet);
2213 if(frozen!=idSet || !(frozen==idSet)) {
2214 errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
2215 }
2216
2217 frozen.
2218 complement().
2219 complement(0x62).
2220 complement(0x64, 0x69).
2221 complement(idPattern).
2222 complementAll(idPattern).
2223 complementAll(idSet);
2224 if(frozen!=idSet || !(frozen==idSet)) {
2225 errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
2226 }
2227}
2228
2229// Test span() etc. -------------------------------------------------------- ***
2230
2231// Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2232static int32_t
2233appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
2234 UErrorCode errorCode=U_ZERO_ERROR;
2235 int32_t length8=0;
2236 u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2237 if(U_SUCCESS(errorCode)) {
2238 return length8;
2239 } else {
2240 // The string contains an unpaired surrogate.
2241 // Ignore this string.
2242 return 0;
2243 }
2244}
2245
2246class UnicodeSetWithStringsIterator;
2247
2248// Make the strings in a UnicodeSet easily accessible.
2249class UnicodeSetWithStrings {
2250public:
2251 UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2252 set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
2253 int32_t size=set.size();
2254 if(size>0 && set.charAt(size-1)<0) {
2255 // If a set's last element is not a code point, then it must contain strings.
2256 // Iterate over the set, skip all code point ranges, and cache the strings.
2257 // Convert them to UTF-8 for spanUTF8().
2258 UnicodeSetIterator iter(set);
2259 const UnicodeString *s;
2260 char *s8=utf8;
2261 int32_t length8, utf8Count=0;
2262 while(iter.nextRange() && stringsLength<LENGTHOF(strings)) {
2263 if(iter.isString()) {
2264 // Store the pointer to the set's string element
2265 // which we happen to know is a stable pointer.
2266 strings[stringsLength]=s=&iter.getString();
2267 utf8Count+=
2268 utf8Lengths[stringsLength]=length8=
2269 appendUTF8(s->getBuffer(), s->length(),
2270 s8, (int32_t)(sizeof(utf8)-utf8Count));
2271 if(length8==0) {
2272 hasSurrogates=TRUE; // Contains unpaired surrogates.
2273 }
2274 s8+=length8;
2275 ++stringsLength;
2276 }
2277 }
2278 }
2279 }
2280
2281 const UnicodeSet &getSet() const {
2282 return set;
2283 }
2284
2285 UBool hasStrings() const {
2286 return (UBool)(stringsLength>0);
2287 }
2288
2289 UBool hasStringsWithSurrogates() const {
2290 return hasSurrogates;
2291 }
2292
2293private:
2294 friend class UnicodeSetWithStringsIterator;
2295
2296 const UnicodeSet &set;
2297
2298 const UnicodeString *strings[20];
2299 int32_t stringsLength;
2300 UBool hasSurrogates;
2301
2302 char utf8[1024];
2303 int32_t utf8Lengths[20];
2304
2305 int32_t nextStringIndex;
2306 int32_t nextUTF8Start;
2307};
2308
2309class UnicodeSetWithStringsIterator {
2310public:
2311 UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2312 fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2313 }
2314
2315 void reset() {
2316 nextStringIndex=nextUTF8Start=0;
2317 }
2318
2319 const UnicodeString *nextString() {
2320 if(nextStringIndex<fSet.stringsLength) {
2321 return fSet.strings[nextStringIndex++];
2322 } else {
2323 return NULL;
2324 }
2325 }
2326
2327 // Do not mix with calls to nextString().
2328 const char *nextUTF8(int32_t &length) {
2329 if(nextStringIndex<fSet.stringsLength) {
2330 const char *s8=fSet.utf8+nextUTF8Start;
2331 nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2332 return s8;
2333 } else {
2334 length=0;
2335 return NULL;
2336 }
2337 }
2338
2339private:
2340 const UnicodeSetWithStrings &fSet;
2341 int32_t nextStringIndex;
2342 int32_t nextUTF8Start;
2343};
2344
2345// Compare 16-bit Unicode strings (which may be malformed UTF-16)
2346// at code point boundaries.
2347// That is, each edge of a match must not be in the middle of a surrogate pair.
2348static inline UBool
2349matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
2350 s+=start;
2351 limit-=start;
2352 int32_t length=t.length();
2353 return 0==t.compare(s, length) &&
2354 !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2355 !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2356}
2357
// Implement span() with contains() for comparison.
//
// Reference implementation for forward spans over UTF-16 text s[0..length[,
// built only from UnicodeSet::contains(code point) and direct comparison with
// the set's cached strings. Returns the limit (exclusive end index) of the span
// that starts at index 0.
//
// Three cases:
// - The set has no strings: step code point by code point while contains()
//   agrees with the span condition.
// - USET_SPAN_NOT_CONTAINED: stop at the first contained code point or at the
//   first position where one of the set's strings matches.
// - USET_SPAN_CONTAINED / USET_SPAN_SIMPLE: follow contained code points and
//   string matches; for CONTAINED, alternatives are explored recursively and
//   the furthest limit reached is returned, for SIMPLE the longest match at
//   each position is taken.
static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                 USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        // prev trails start by one code point, so that on a mismatch
        // the index before the mismatching code point is returned.
        while((prev=start)<length) {
            U16_NEXT(s, start, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
            // The code point is not in the set; also stop if any set string starts here.
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // maxSpanLimit: furthest limit found through recursion (CONTAINED only).
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            // From here on, next==start means "nothing matched at start yet".
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+str->length();
                    if(matchLimit==length) {
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
                                                                 USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // Return the further of the iterated limit and the best recursive limit.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}
2456
// Reference implementation for backward spans over UTF-16 text s[0..length[,
// built only from UnicodeSet::contains(code point) and direct comparison with
// the set's cached strings. Returns the start index of the span that ends at
// index length; an empty text yields 0.
//
// Three cases:
// - The set has no strings: step backward code point by code point while
//   contains() agrees with the span condition.
// - USET_SPAN_NOT_CONTAINED: stop at the first contained code point or at the
//   first position where one of the set's strings ends.
// - USET_SPAN_CONTAINED / USET_SPAN_SIMPLE: follow contained code points and
//   string matches; for CONTAINED, alternatives are explored recursively and
//   the smallest start reached is returned, for SIMPLE the longest match at
//   each position is taken.
//
// Note that the parameter "length" is reused as the moving read position.
static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                     USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t prev=length;
        // prev is the position before the current backward step, so that on a
        // mismatch the index after the mismatching code point is returned.
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // length0 keeps the original text length for the boundary check in matches16CPB().
        int32_t prev=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
            // The code point is not in the set; also stop if any set string ends at prev.
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // minSpanStart: smallest start found through recursion (CONTAINED only).
        int32_t prev=length, minSpanStart=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            // From here on, length==prev means "nothing matched at prev yet".
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-str->length();
                    if(matchStart==0) {
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
                                                                    USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        // Return the smaller of the iterated start and the best recursive start.
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}
2555
// Reference implementation for forward spans over UTF-8 text s[0..length[,
// built only from UnicodeSet::contains(code point) and byte-wise comparison
// with the UTF-8 forms of the set's cached strings. Returns the limit
// (exclusive end byte index) of the span that starts at index 0.
//
// Byte sequences for which U8_NEXT() yields a negative value are treated as
// U+FFFD. Cached strings whose UTF-8 length is 0 are skipped.
//
// The three cases mirror containsSpanUTF16():
// no strings / USET_SPAN_NOT_CONTAINED / USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        // prev trails start by one code point, so that on a mismatch
        // the index before the mismatching code point is returned.
        while((prev=start)<length) {
            U8_NEXT(s, start, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U8_NEXT(s, next, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(realSet.contains(c)) {
                break;
            }
            // The code point is not in the set; also stop if any set string starts here.
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // maxSpanLimit: furthest limit found through recursion (CONTAINED only).
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U8_NEXT(s, next, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            // From here on, next==start means "nothing matched at start yet".
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+length8;
                    if(matchLimit==length) {
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
                                                                USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // Return the further of the iterated limit and the best recursive limit.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}
2664
// Reference implementation for backward spans over UTF-8 text s[0..length[,
// built only from UnicodeSet::contains(code point) and byte-wise comparison
// with the UTF-8 forms of the set's cached strings. Returns the start byte
// index of the span that ends at index length; an empty text yields 0.
//
// Byte sequences for which U8_PREV() yields a negative value are treated as
// U+FFFD. Cached strings whose UTF-8 length is 0 are skipped.
//
// The three cases mirror containsSpanBackUTF16():
// no strings / USET_SPAN_NOT_CONTAINED / USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
// Note that the parameter "length" is reused as the moving read position.
static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                    USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t prev=length;
        // prev is the position before the current backward step, so that on a
        // mismatch the index after the mismatching code point is returned.
        do {
            U8_PREV(s, 0, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length;
        do {
            U8_PREV(s, 0, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(realSet.contains(c)) {
                break;
            }
            // The code point is not in the set; also stop if any set string ends at prev.
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // minSpanStart: smallest start found through recursion (CONTAINED only).
        int32_t prev=length, minSpanStart=length;
        do {
            U8_PREV(s, 0, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            // From here on, length==prev means "nothing matched at prev yet".
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-length8;
                    if(matchStart==0) {
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
                                                                   USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        // Return the smaller of the iterated start and the best recursive start.
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}
2774
// spans to be performed and compared
//
// Bit flags that are OR-ed together into a "whichSpans" mask. Each group has
// two single-bit options followed by the mask that covers both of them.
enum {
    // Which text encodings to test.
    SPAN_UTF16          =1,
    SPAN_UTF8           =2,
    SPAN_UTFS           =3,     // SPAN_UTF16|SPAN_UTF8

    // Whether to test the original set and/or its complement.
    SPAN_SET            =4,
    SPAN_COMPLEMENT     =8,
    SPAN_POLARITY       =0xc,   // SPAN_SET|SPAN_COMPLEMENT

    // Which span directions to test.
    SPAN_FWD            =0x10,
    SPAN_BACK           =0x20,
    SPAN_DIRS           =0x30,  // SPAN_FWD|SPAN_BACK

    // Which "contained" span condition to use:
    // USET_SPAN_CONTAINED and/or USET_SPAN_SIMPLE (see getSpans()).
    SPAN_CONTAINED      =0x100,
    SPAN_SIMPLE         =0x200,
    SPAN_CONDITION      =0x300, // SPAN_CONTAINED|SPAN_SIMPLE

    // All of the options above.
    SPAN_ALL            =0x33f
};
2795
2796static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
2797 return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
2798}
2799
2800static inline int32_t slen(const void *s, UBool isUTF16) {
2801 return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
2802}
2803
/*
 * Count spans on a string with the method according to type and set the span limits.
 * The set may be the complement of the original.
 * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
 * according to the expected number of spans.
 * Sets typeName to an empty string if there is no such type.
 * Returns -1 if the span option is filtered out.
 *
 * The type selects the method:
 *   0, 1  containsSpanUTF16()/containsSpanUTF8()          (forward, reference implementation)
 *   2, 3  UnicodeSet::span()/spanUTF8()                   (forward)
 *   4, 5  containsSpanBackUTF16()/containsSpanBackUTF8()  (backward, reference implementation)
 *   6, 7  UnicodeSet::spanBack()/spanBackUTF8()           (backward)
 * Even types use USET_SPAN_CONTAINED, odd types use USET_SPAN_SIMPLE ("LM" in the type names).
 *
 * Spans alternate between "not contained" and "contained", starting with
 * "not contained" for an uncomplemented set going forward.
 *
 * @param set            the set (with cached strings) to span with
 * @param isComplement   TRUE if set is the complement of the original set
 * @param s              the text, UChar[] if isUTF16, else char[]
 * @param length         the text length, or negative for NUL-terminated text
 * @param isUTF16        selects the UTF-16 or the UTF-8 functions
 * @param whichSpans     SPAN_xyz flags; a type whose direction or condition flag is not set
 *                       is filtered out
 * @param type           0..7, see above
 * @param typeName       output: a name for the type, for error messages
 * @param limits         output: the span limits, in ascending order
 * @param limitsCapacity capacity of limits[]; limits beyond it are counted but not stored
 * @param expectCount    the number of spans expected; only its parity is used, and only for
 *                       backward types when forward spans are tested as well
 * @return the number of spans; 0 (with an empty typeName) for an invalid type;
 *         -1 if the type is filtered out
 */
static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
                        const void *s, int32_t length, UBool isUTF16,
                        uint32_t whichSpans,
                        int type, const char *&typeName,
                        int32_t limits[], int32_t limitsCapacity,
                        int32_t expectCount) {
    const UnicodeSet &realSet(set.getSet());
    int32_t start, count;
    USetSpanCondition spanCondition, firstSpanCondition, contained;
    UBool isForward;

    if(type<0 || 7<type) {
        typeName="";
        return 0;
    }

    static const char *const typeNames16[]={
        "contains", "contains(LM)",
        "span", "span(LM)",
        "containsBack", "containsBack(LM)",
        "spanBack", "spanBack(LM)"
    };

    static const char *const typeNames8[]={
        "containsUTF8", "containsUTF8(LM)",
        "spanUTF8", "spanUTF8(LM)",
        "containsBackUTF8", "containsBackUTF8(LM)",  // performed by containsSpanBackUTF8(), see cases 4 and 5 below
        "spanBackUTF8", "spanBackUTF8(LM)"
    };

    typeName= isUTF16 ? typeNames16[type] : typeNames8[type];

    // filter span options
    if(type<=3) {
        // span forward
        if((whichSpans&SPAN_FWD)==0) {
            return -1;
        }
        isForward=TRUE;
    } else {
        // span backward
        if((whichSpans&SPAN_BACK)==0) {
            return -1;
        }
        isForward=FALSE;
    }
    if((type&1)==0) {
        // use USET_SPAN_CONTAINED
        if((whichSpans&SPAN_CONTAINED)==0) {
            return -1;
        }
        contained=USET_SPAN_CONTAINED;
    } else {
        // use USET_SPAN_SIMPLE
        if((whichSpans&SPAN_SIMPLE)==0) {
            return -1;
        }
        contained=USET_SPAN_SIMPLE;
    }

    // Default first span condition for going forward with an uncomplemented set.
    spanCondition=USET_SPAN_NOT_CONTAINED;
    if(isComplement) {
        spanCondition=invertSpanCondition(spanCondition, contained);
    }

    // First span condition for span(), used to terminate the spanBack() iteration.
    firstSpanCondition=spanCondition;

    // spanBack(): Its initial span condition is span()'s last span condition,
    // which is the opposite of span()'s first span condition
    // if we expect an even number of spans.
    // (The loop inverts spanCondition (expectCount-1) times
    // before the expectCount'th span() call.)
    // If we do not compare forward and backward directions, then we do not have an
    // expectCount and just start with firstSpanCondition.
    if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
        spanCondition=invertSpanCondition(spanCondition, contained);
    }

    count=0;
    switch(type) {
    case 0:
    case 1:
        // Forward, with the contains()-based reference implementation.
        start=0;
        if(length<0) {
            length=slen(s, isUTF16);
        }
        for(;;) {
            start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
                              containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
            if(count<limitsCapacity) {
                limits[count]=start;
            }
            ++count;
            if(start>=length) {
                break;
            }
            spanCondition=invertSpanCondition(spanCondition, contained);
        }
        break;
    case 2:
    case 3:
        // Forward, with UnicodeSet::span()/spanUTF8().
        // A negative length (NUL-terminated text) is passed through unchanged.
        start=0;
        for(;;) {
            start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
                              realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
            if(count<limitsCapacity) {
                limits[count]=start;
            }
            ++count;
            // Stop at the end of the text: at the length, or at the terminating NUL.
            if(length>=0 ? start>=length :
                           isUTF16 ? ((const UChar *)s)[start]==0 :
                                     ((const char *)s)[start]==0
            ) {
                break;
            }
            spanCondition=invertSpanCondition(spanCondition, contained);
        }
        break;
    case 4:
    case 5:
        // Backward, with the contains()-based reference implementation.
        // The limits are stored from the end of limits[] toward its start
        // and moved to the front afterwards.
        if(length<0) {
            length=slen(s, isUTF16);
        }
        for(;;) {
            ++count;
            if(count<=limitsCapacity) {
                limits[limitsCapacity-count]=length;
            }
            length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
                              containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
            // Done when the start of the text is reached with the condition
            // that the forward iteration would have started with.
            if(length==0 && spanCondition==firstSpanCondition) {
                break;
            }
            spanCondition=invertSpanCondition(spanCondition, contained);
        }
        if(count<limitsCapacity) {
            // count*4: byte count; assumes sizeof(int32_t)==4.
            memmove(limits, limits+(limitsCapacity-count), count*4);
        }
        break;
    case 6:
    case 7:
        // Backward, with UnicodeSet::spanBack()/spanBackUTF8().
        // The limits are stored from the end of limits[] toward its start
        // and moved to the front afterwards.
        for(;;) {
            ++count;
            if(count<=limitsCapacity) {
                limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
            }
            // Note: Length<0 is tested only for the first spanBack().
            // If we wanted to keep length<0 for all spanBack()s, we would have to
            // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
            length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
                              realSet.spanBackUTF8((const char *)s, length, spanCondition);
            if(length==0 && spanCondition==firstSpanCondition) {
                break;
            }
            spanCondition=invertSpanCondition(spanCondition, contained);
        }
        if(count<limitsCapacity) {
            // count*4: byte count; assumes sizeof(int32_t)==4.
            memmove(limits, limits+(limitsCapacity-count), count*4);
        }
        break;
    default:
        // Not reachable: types outside 0..7 have already returned above.
        typeName="";
        return -1;
    }

    return count;
}
2981
// sets to be tested; odd index=isComplement
//
// Indexes into the sets[4] arrays that are passed to testSpan() etc.
enum {
    SLOW,
    SLOW_NOT,
    FAST,
    FAST_NOT,
    SET_COUNT   // number of sets, not a set index
};

// Names for the sets, indexed by the enum constants above; used in error messages.
static const char *const setNames[SET_COUNT]={
    "slow",
    "slow.not",
    "fast",
    "fast.not"
};
2997
/*
 * Verify that we get the same results whether we look at text with contains(),
 * span() or spanBack(), using unfrozen or frozen versions of the set,
 * and using the set or its complement (switching the spanConditions accordingly).
 * The latter verifies that
 *   set.span(spanCondition) == set.complement().span(!spanCondition).
 *
 * The expectLimits[] are either provided by the caller (with expectCount>=0)
 * or returned to the caller (with an input expectCount<0).
 *
 * sets[] is indexed by SLOW, SLOW_NOT, FAST, FAST_NOT.
 * s/length/isUTF16 describe the text as for getSpans().
 * whichSpans is a combination of SPAN_xyz flags that selects what is tested.
 * testName and index are only used in error messages.
 */
void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
                              const void *s, int32_t length, UBool isUTF16,
                              uint32_t whichSpans,
                              int32_t expectLimits[], int32_t &expectCount,
                              const char *testName, int32_t index) {
    int32_t limits[500];
    int32_t limitsCount;
    int i, j;

    const char *typeName;
    int type;

    for(i=0; i<SET_COUNT; ++i) {
        if((i&1)==0) {
            // Even-numbered sets are original, uncomplemented sets.
            if((whichSpans&SPAN_SET)==0) {
                continue;
            }
        } else {
            // Odd-numbered sets are complemented.
            if((whichSpans&SPAN_COMPLEMENT)==0) {
                continue;
            }
        }
        // Try every span type; getSpans() sets an empty typeName once type is out of range.
        for(type=0;; ++type) {
            limitsCount=getSpans(*sets[i], (UBool)(i&1),
                                 s, length, isUTF16,
                                 whichSpans,
                                 type, typeName,
                                 limits, LENGTHOF(limits), expectCount);
            if(typeName[0]==0) {
                break;  // All types tried.
            }
            if(limitsCount<0) {
                continue;  // Span option filtered out.
            }
            if(expectCount<0) {
                // First result: it becomes the expectation for all further set/type combinations.
                expectCount=limitsCount;
                if(limitsCount>LENGTHOF(limits)) {
                    errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
                          testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)LENGTHOF(limits));
                    return;
                }
                // limitsCount*4: byte count; assumes sizeof(int32_t)==4.
                memcpy(expectLimits, limits, limitsCount*4);
            } else if(limitsCount!=expectCount) {
                errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
                      testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
            } else {
                // Same count: report the first differing limit, if any.
                for(j=0; j<limitsCount; ++j) {
                    if(limits[j]!=expectLimits[j]) {
                        errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
                              testName, (long)index, setNames[i], typeName, (long)limitsCount,
                              j, (long)limits[j], (long)expectLimits[j]);
                        break;
                    }
                }
            }
        }
    }

    // Compare span() with containsAll()/containsNone(),
    // but only if we have expectLimits[] from the uncomplemented set.
    if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
        const UChar *s16=(const UChar *)s;
        UnicodeString string;
        // Note: this local "length" hides the function parameter of the same name.
        int32_t prev=0, limit, length;
        for(i=0; i<expectCount; ++i) {
            limit=expectLimits[i];
            length=limit-prev;
            if(length>0) {
                string.setTo(FALSE, s16+prev, length);  // read-only alias
                // Spans alternate, starting with "not contained" at i==0:
                // odd spans must be all contained, even spans must have nothing contained.
                if(i&1) {
                    if(!sets[SLOW]->getSet().containsAll(string)) {
                        errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
                              testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
                        return;
                    }
                    if(!sets[FAST]->getSet().containsAll(string)) {
                        errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
                              testName, (long)index, setNames[FAST], (long)prev, (long)limit);
                        return;
                    }
                } else {
                    if(!sets[SLOW]->getSet().containsNone(string)) {
                        errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
                              testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
                        return;
                    }
                    if(!sets[FAST]->getSet().containsNone(string)) {
                        errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
                              testName, (long)index, setNames[FAST], (long)prev, (long)limit);
                        return;
                    }
                }
            }
            prev=limit;
        }
    }
}
3107
3108// Specifically test either UTF-16 or UTF-8.
3109void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3110 const void *s, int32_t length, UBool isUTF16,
3111 uint32_t whichSpans,
3112 const char *testName, int32_t index) {
3113 int32_t expectLimits[500];
3114 int32_t expectCount=-1;
3115 testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3116}
3117
3118UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
3119 UChar c, c2;
3120
3121 if(length>=0) {
3122 while(length>0) {
3123 c=*s++;
3124 --length;
3125 if(0xd800<=c && c<0xe000) {
3126 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3127 return TRUE;
3128 }
3129 --length;
3130 }
3131 }
3132 } else {
3133 while((c=*s++)!=0) {
3134 if(0xd800<=c && c<0xe000) {
3135 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3136 return TRUE;
3137 }
3138 }
3139 }
3140 }
3141 return FALSE;
3142}
3143
3144// Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3145// unless either UTF is turned off in whichSpans.
3146// Testing UTF-16 and UTF-8 together requires that surrogate code points
3147// have the same contains(c) value as U+FFFD.
3148void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3149 const UChar *s16, int32_t length16,
3150 uint32_t whichSpans,
3151 const char *testName, int32_t index) {
3152 int32_t expectLimits[500];
3153 int32_t expectCount;
3154
3155 expectCount=-1; // Get expectLimits[] from testSpan().
3156
3157 if((whichSpans&SPAN_UTF16)!=0) {
3158 testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
3159 }
3160 if((whichSpans&SPAN_UTF8)==0) {
3161 return;
3162 }
3163
3164 // Convert s16[] and expectLimits[] to UTF-8.
3165 uint8_t s8[3000];
3166 int32_t offsets[3000];
3167
3168 const UChar *s16Limit=s16+length16;
3169 char *t=(char *)s8;
3170 char *tLimit=t+sizeof(s8);
3171 int32_t *o=offsets;
3172 UErrorCode errorCode=U_ZERO_ERROR;
3173
3174 // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3175 ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
3176 if(U_FAILURE(errorCode)) {
3177 errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3178 testName, (long)index, u_errorName(errorCode));
3179 ucnv_resetFromUnicode(utf8Cnv);
3180 return;
3181 }
3182 int32_t length8=(int32_t)(t-(char *)s8);
3183
3184 // Convert expectLimits[].
3185 int32_t i, j, expect;
3186 for(i=j=0; i<expectCount; ++i) {
3187 expect=expectLimits[i];
3188 if(expect==length16) {
3189 expectLimits[i]=length8;
3190 } else {
3191 while(offsets[j]<expect) {
3192 ++j;
3193 }
3194 expectLimits[i]=j;
3195 }
3196 }
3197
3198 testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
3199}
3200
3201static UChar32 nextCodePoint(UChar32 c) {
3202 // Skip some large and boring ranges.
3203 switch(c) {
3204 case 0x3441:
3205 return 0x4d7f;
3206 case 0x5100:
3207 return 0x9f00;
3208 case 0xb040:
3209 return 0xd780;
3210 case 0xe041:
3211 return 0xf8fe;
3212 case 0x10100:
3213 return 0x20000;
3214 case 0x20041:
3215 return 0xe0000;
3216 case 0xe0101:
3217 return 0x10fffd;
3218 default:
3219 return c+1;
3220 }
3221}
3222
3223// Verify that all implementations represent the same set.
3224void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3225 // contains(U+FFFD) is inconsistent with contains(some surrogates),
3226 // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3227 // Skip the UTF-8 part of the test - if the string contains surrogates -
3228 // because it is likely to produce a different result.
3229 UBool inconsistentSurrogates=
3230 (!(sets[0]->getSet().contains(0xfffd) ?
3231 sets[0]->getSet().contains(0xd800, 0xdfff) :
3232 sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3233 sets[0]->hasStringsWithSurrogates());
3234
3235 UChar s[1000];
3236 int32_t length=0;
3237 uint32_t localWhichSpans;
3238
3239 UChar32 c, first;
3240 for(first=c=0;; c=nextCodePoint(c)) {
3241 if(c>0x10ffff || length>(LENGTHOF(s)-U16_MAX_LENGTH)) {
3242 localWhichSpans=whichSpans;
3243 if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3244 localWhichSpans&=~SPAN_UTF8;
3245 }
3246 testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3247 if(c>0x10ffff) {
3248 break;
3249 }
3250 length=0;
3251 first=c;
3252 }
3253 U16_APPEND_UNSAFE(s, length, c);
3254 }
3255}
3256
3257// Test with a particular, interesting string.
3258// Specify length and try NUL-termination.
3259void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3260 static const UChar s[]={
3261 0x61, 0x62, 0x20, // Latin, space
3262 0x3b1, 0x3b2, 0x3b3, // Greek
3263 0xd900, // lead surrogate
3264 0x3000, 0x30ab, 0x30ad, // wide space, Katakana
3265 0xdc05, // trail surrogate
3266 0xa0, 0xac00, 0xd7a3, // nbsp, Hangul
3267 0xd900, 0xdc05, // unassigned supplementary
3268 0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary
3269 0xd7a4, 0xdc05, 0xd900, 0x2028, // unassigned, surrogates in wrong order, LS
3270 0 // NUL
3271 };
3272
3273 if((whichSpans&SPAN_UTF16)==0) {
3274 return;
3275 }
3276 testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
3277 testSpan(sets, s, LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
3278}
3279
3280void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3281 static const char s[]={
3282 "abc" // Latin
3283
3284 /* trail byte in lead position */
3285 "\x80"
3286
3287 " " // space
3288
3289 /* truncated multi-byte sequences */
3290 "\xd0"
3291 "\xe0"
3292 "\xe1"
3293 "\xed"
3294 "\xee"
3295 "\xf0"
3296 "\xf1"
3297 "\xf4"
3298 "\xf8"
3299 "\xfc"
3300
3301 "\xCE\xB1\xCE\xB2\xCE\xB3" // Greek
3302
3303 /* trail byte in lead position */
3304 "\x80"
3305
3306 "\xe0\x80"
3307 "\xe0\xa0"
3308 "\xe1\x80"
3309 "\xed\x80"
3310 "\xed\xa0"
3311 "\xee\x80"
3312 "\xf0\x80"
3313 "\xf0\x90"
3314 "\xf1\x80"
3315 "\xf4\x80"
3316 "\xf4\x90"
3317 "\xf8\x80"
3318 "\xfc\x80"
3319
3320 "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD" // wide space, Katakana
3321
3322 /* trail byte in lead position */
3323 "\x80"
3324
3325 "\xf0\x80\x80"
3326 "\xf0\x90\x80"
3327 "\xf1\x80\x80"
3328 "\xf4\x80\x80"
3329 "\xf4\x90\x80"
3330 "\xf8\x80\x80"
3331 "\xfc\x80\x80"
3332
3333 "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3" // nbsp, Hangul
3334
3335 /* trail byte in lead position */
3336 "\x80"
3337
3338 "\xf8\x80\x80\x80"
3339 "\xfc\x80\x80\x80"
3340
3341 "\xF1\x90\x80\x85" // unassigned supplementary
3342
3343 /* trail byte in lead position */
3344 "\x80"
3345
3346 "\xfc\x80\x80\x80\x80"
3347
3348 "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE" // Han supplementary
3349
3350 /* trail byte in lead position */
3351 "\x80"
3352
3353 /* complete sequences but non-shortest forms or out of range etc. */
3354 "\xc0\x80"
3355 "\xe0\x80\x80"
3356 "\xed\xa0\x80"
3357 "\xf0\x80\x80\x80"
3358 "\xf4\x90\x80\x80"
3359 "\xf8\x80\x80\x80\x80"
3360 "\xfc\x80\x80\x80\x80\x80"
3361 "\xfe"
3362 "\xff"
3363
3364 /* trail byte in lead position */
3365 "\x80"
3366
3367 "\xED\x9E\xA4\xE2\x80\xA8" // unassigned, LS, NUL-terminated
3368 };
3369
3370 if((whichSpans&SPAN_UTF8)==0) {
3371 return;
3372 }
3373 testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
3374 testSpan(sets, s, LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
3375}
3376
// Take a set of span options and multiply them so that
// each portion only has one of the options a, b and c.
// Every existing entry is first reduced with mask; then
//   entries [0, count)         get option a,
//   entries [count, 2*count)   get option b (only if b!=0),
//   entries [2*count, 3*count) get option c (only if b!=0 and c!=0).
// If b==0, then the set of options is just modified with mask and a.
// If b!=0 and c==0, then the set of options is just modified with mask, a and b.
// Returns the new number of entries in whichSpans[]; the caller must provide
// an array large enough for it.
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    // How many copies of the option list result: 1, 2 or 3.
    int32_t portions;
    if(b==0) {
        portions=1;
    } else if(c==0) {
        portions=2;
    } else {
        portions=3;
    }

    for(int32_t k=0; k<whichSpansCount; ++k) {
        const uint32_t kept=whichSpans[k]&mask;
        whichSpans[k]=kept|a;
        if(portions>=2) {
            whichSpans[whichSpansCount+k]=kept|b;
        }
        if(portions==3) {
            whichSpans[2*whichSpansCount+k]=kept|c;
        }
    }
    return portions*whichSpansCount;
}
3399
// String-literal building blocks: runs of exactly 63 or 64 copies of 'a' or 'b'.
// They are meant to be placed next to other string literals so that the
// compiler concatenates them into long test inputs.
// Written as groups of 8 so that the lengths are easy to verify by eye.
#define _63_a "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaa"
#define _64_a _63_a "a"
#define _63_b "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbb"
#define _64_b _63_b "b"
3404
3405void UnicodeSetTest::TestSpan() {
3406 // "[...]" is a UnicodeSet pattern.
3407 // "*" performs tests on all Unicode code points and on a selection of
3408 // malformed UTF-8/16 strings.
3409 // "-options" limits the scope of testing for the current set.
3410 // By default, the test verifies that equivalent boundaries are found
3411 // for UTF-16 and UTF-8, going forward and backward,
3412 // alternating USET_SPAN_NOT_CONTAINED with
3413 // either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3414 // Single-character options:
3415 // 8 -- UTF-16 and UTF-8 boundaries may differ.
3416 // Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3417 // or the set contains strings with unpaired surrogates
3418 // which do not translate to valid UTF-8.
3419 // c -- set.span() and set.complement().span() boundaries may differ.
3420 // Cause: Set strings are not complemented.
3421 // b -- span() and spanBack() boundaries may differ.
3422 // Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3423 // and spanBack(USET_SPAN_SIMPLE) are defined to
3424 // match with non-overlapping substrings.
3425 // For example, with a set containing "ab" and "ba",
3426 // span() of "aba" yields boundaries { 0, 2, 3 }
3427 // because the initial "ab" matches from 0 to 2,
3428 // while spanBack() yields boundaries { 0, 1, 3 }
3429 // because the final "ba" matches from 1 to 3.
3430 // l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3431 // Cause: Strings in the set overlap, and a longer match may
3432 // require a sequence including non-longest substrings.
3433 // For example, with a set containing "ab", "abc" and "cd",
3434 // span(contained) of "abcd" spans the entire string
3435 // but span(longest match) only spans the first 3 characters.
3436 // Each "-options" first resets all options and then applies the specified options.
3437 // A "-" without options resets the options.
3438 // The options are also reset for each new set.
3439 // Other strings will be spanned.
3440 static const char *const testdata[]={
3441 "[:ID_Continue:]",
3442 "*",
3443 "[:White_Space:]",
3444 "*",
3445 "[]",
3446 "*",
3447 "[\\u0000-\\U0010FFFF]",
3448 "*",
3449 "[\\u0000\\u0080\\u0800\\U00010000]",
3450 "*",
3451 "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3452 "*",
3453 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3454 "-c",
3455 "*",
3456 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3457 "-c",
3458 "*",
3459
3460 // Overlapping strings cause overlapping attempts to match.
3461 "[x{xy}{xya}{axy}{ax}]",
3462 "-cl",
3463
3464 // More repetitions of "xya" would take too long with the recursive
3465 // reference implementation.
3466 // containsAll()=FALSE
3467 // test_string 0x14
3468 "xx"
3469 "xyaxyaxyaxya" // set.complement().span(longest match) will stop here.
3470 "xx" // set.complement().span(contained) will stop between the two 'x'es.
3471 "xyaxyaxyaxya"
3472 "xx"
3473 "xyaxyaxyaxya" // span() ends here.
3474 "aaa",
3475
3476 // containsAll()=TRUE
3477 // test_string 0x15
3478 "xx"
3479 "xyaxyaxyaxya"
3480 "xx"
3481 "xyaxyaxyaxya"
3482 "xx"
3483 "xyaxyaxyaxy",
3484
3485 "-bc",
3486 // test_string 0x17
3487 "byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 }
3488 "-c",
3489 "byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 }
3490 "byayax", // span() -> { 4, 6 } complement.span() -> { 6 }
3491 "-",
3492 "byaya", // span() -> { 5 }
3493 "byay", // span() -> { 4 }
3494 "bya", // span() -> { 3 }
3495
3496 // span(longest match) will not span the whole string.
3497 "[a{ab}{bc}]",
3498 "-cl",
3499 // test_string 0x21
3500 "abc",
3501
3502 "[a{ab}{abc}{cd}]",
3503 "-cl",
3504 "acdabcdabccd",
3505
3506 // spanBack(longest match) will not span the whole string.
3507 "[c{ab}{bc}]",
3508 "-cl",
3509 "abc",
3510
3511 "[d{cd}{bcd}{ab}]",
3512 "-cl",
3513 "abbcdabcdabd",
3514
3515 // Test with non-ASCII set strings - test proper handling of surrogate pairs
3516 // and UTF-8 trail bytes.
3517 // Copies of above test sets and strings, but transliterated to have
3518 // different code points with similar trail units.
3519 // Previous: a b c d
3520 // Unicode: 042B 30AB 200AB 204AB
3521 // UTF-16: 042B 30AB D840 DCAB D841 DCAB
3522 // UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB
3523 "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3524 "-cl",
3525 "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3526
3527 "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3528 "-cl",
3529 "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3530
3531 // Stress bookkeeping and recursion.
3532 // The following strings are barely doable with the recursive
3533 // reference implementation.
3534 // The not-contained character at the end prevents an early exit from the span().
3535 "[b{bb}]",
3536 "-c",
3537 // test_string 0x33
3538 "bbbbbbbbbbbbbbbbbbbbbbbb-",
3539 // On complement sets, span() and spanBack() get different results
3540 // because b is not in the complement set and there is an odd number of b's
3541 // in the test string.
3542 "-bc",
3543 "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3544
3545 // Test with set strings with an initial or final code point span
3546 // longer than 254.
3547 "[a{" _64_a _64_a _64_a _64_a "b}"
3548 "{a" _64_b _64_b _64_b _64_b "}]",
3549 "-c",
3550 _64_a _64_a _64_a _63_a "b",
3551 _64_a _64_a _64_a _64_a "b",
3552 _64_a _64_a _64_a _64_a "aaaabbbb",
3553 "a" _64_b _64_b _64_b _63_b,
3554 "a" _64_b _64_b _64_b _64_b,
3555 "aaaabbbb" _64_b _64_b _64_b _64_b,
3556
3557 // Test with strings containing unpaired surrogates.
3558 // They are not representable in UTF-8, and a leading trail surrogate
3559 // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3560 // U+20001 == \\uD840\\uDC01
3561 // U+20400 == \\uD841\\uDC00
3562 "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3563 "-8cl",
3564 "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3565 };
3566 uint32_t whichSpans[96]={ SPAN_ALL };
3567 int32_t whichSpansCount=1;
3568
3569 UnicodeSet *sets[SET_COUNT]={ NULL };
3570 const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
3571
3572 char testName[1024];
3573 char *testNameLimit=testName;
3574
3575 int32_t i, j;
3576 for(i=0; i<LENGTHOF(testdata); ++i) {
3577 const char *s=testdata[i];
3578 if(s[0]=='[') {
3579 // Create new test sets from this pattern.
3580 for(j=0; j<SET_COUNT; ++j) {
3581 delete sets_with_str[j];
3582 delete sets[j];
3583 }
3584 UErrorCode errorCode=U_ZERO_ERROR;
3585 sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3586 if(U_FAILURE(errorCode)) {
3587 errln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
3588 break;
3589 }
3590 sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3591 sets[SLOW_NOT]->complement();
3592 // Intermediate set: Test cloning of a frozen set.
3593 UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3594 fast->freeze();
3595 sets[FAST]=(UnicodeSet *)fast->clone();
3596 delete fast;
3597 UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3598 fastNot->freeze();
3599 sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
3600 delete fastNot;
3601
3602 for(j=0; j<SET_COUNT; ++j) {
3603 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3604 }
3605
3606 strcpy(testName, s);
3607 testNameLimit=strchr(testName, 0);
3608 *testNameLimit++=':';
3609 *testNameLimit=0;
3610
3611 whichSpans[0]=SPAN_ALL;
3612 whichSpansCount=1;
3613 } else if(s[0]=='-') {
3614 whichSpans[0]=SPAN_ALL;
3615 whichSpansCount=1;
3616
3617 while(*++s!=0) {
3618 switch(*s) {
3619 case 'c':
3620 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3621 ~SPAN_POLARITY,
3622 SPAN_SET,
3623 SPAN_COMPLEMENT,
3624 0);
3625 break;
3626 case 'b':
3627 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3628 ~SPAN_DIRS,
3629 SPAN_FWD,
3630 SPAN_BACK,
3631 0);
3632 break;
3633 case 'l':
3634 // test USET_SPAN_CONTAINED FWD & BACK, and separately
3635 // USET_SPAN_SIMPLE only FWD, and separately
3636 // USET_SPAN_SIMPLE only BACK
3637 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3638 ~(SPAN_DIRS|SPAN_CONDITION),
3639 SPAN_DIRS|SPAN_CONTAINED,
3640 SPAN_FWD|SPAN_SIMPLE,
3641 SPAN_BACK|SPAN_SIMPLE);
3642 break;
3643 case '8':
3644 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3645 ~SPAN_UTFS,
3646 SPAN_UTF16,
3647 SPAN_UTF8,
3648 0);
3649 break;
3650 default:
3651 errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3652 break;
3653 }
3654 }
3655 } else if(0==strcmp(s, "*")) {
3656 strcpy(testNameLimit, "bad_string");
3657 for(j=0; j<whichSpansCount; ++j) {
3658 if(whichSpansCount>1) {
3659 sprintf(testNameLimit+10 /* strlen("bad_string") */,
3660 "%%0x%3x",
3661 whichSpans[j]);
3662 }
3663 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3664 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3665 }
3666
3667 strcpy(testNameLimit, "contents");
3668 for(j=0; j<whichSpansCount; ++j) {
3669 if(whichSpansCount>1) {
3670 sprintf(testNameLimit+8 /* strlen("contents") */,
3671 "%%0x%3x",
3672 whichSpans[j]);
3673 }
3674 testSpanContents(sets_with_str, whichSpans[j], testName);
3675 }
3676 } else {
3677 UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3678 strcpy(testNameLimit, "test_string");
3679 for(j=0; j<whichSpansCount; ++j) {
3680 if(whichSpansCount>1) {
3681 sprintf(testNameLimit+11 /* strlen("test_string") */,
3682 "%%0x%3x",
3683 whichSpans[j]);
3684 }
3685 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3686 }
3687 }
3688 }
3689 for(j=0; j<SET_COUNT; ++j) {
3690 delete sets_with_str[j];
3691 delete sets[j];
3692 }
3693}
3694
3695// Test select patterns and strings, and test USET_SPAN_SIMPLE.
3696void UnicodeSetTest::TestStringSpan() {
3697 static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3698 static const char *const string=
3699 "xx"
3700 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3701 "xx"
3702 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3703 "xx"
3704 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3705 "aaaa";
3706
3707 UErrorCode errorCode=U_ZERO_ERROR;
3708 UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3709 UnicodeSet set(pattern16, errorCode);
3710 if(U_FAILURE(errorCode)) {
3711 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3712 return;
3713 }
3714
3715 UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3716
3717 if(set.containsAll(string16)) {
3718 errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
3719 }
3720
3721 // Remove trailing "aaaa".
3722 string16.truncate(string16.length()-4);
3723 if(!set.containsAll(string16)) {
3724 errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
3725 }
3726
3727 string16=UNICODE_STRING_SIMPLE("byayaxya");
3728 const UChar *s16=string16.getBuffer();
3729 int32_t length16=string16.length();
3730 if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3731 set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3732 set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3733 set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3734 set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3735 set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3736 ) {
3737 errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3738 }
3739
3740 pattern="[a{ab}{abc}{cd}]";
3741 pattern16=UnicodeString(pattern, -1, US_INV);
3742 set.applyPattern(pattern16, errorCode);
3743 if(U_FAILURE(errorCode)) {
3744 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3745 return;
3746 }
3747 string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
3748 s16=string16.getBuffer();
3749 length16=string16.length();
3750 if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3751 set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3752 set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3753 ) {
3754 errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3755 }
3756
3757 pattern="[d{cd}{bcd}{ab}]";
3758 pattern16=UnicodeString(pattern, -1, US_INV);
3759 set.applyPattern(pattern16, errorCode).freeze();
3760 if(U_FAILURE(errorCode)) {
3761 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3762 return;
3763 }
3764 string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
3765 s16=string16.getBuffer();
3766 length16=string16.length();
3767 if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3768 set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3769 set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3770 ) {
3771 errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3772 }
3773}