icuSources/test/intltest/usettest.cpp

   1 /*
   2 ********************************************************************************
   3 *   Copyright (C) 1999-2014 International Business Machines Corporation and
   4 *   others. All Rights Reserved.
   5 ********************************************************************************
   6 *   Date        Name        Description
   7 *   10/20/99    alan        Creation.
   8 *   03/22/2000  Madhu       Added additional tests
   9 ********************************************************************************
  10 */
  11
  12 #include <stdio.h>
  13
  14 #include <string.h>
  15 #include "unicode/utypes.h"
  16 #include "usettest.h"
  17 #include "unicode/ucnv.h"
  18 #include "unicode/uniset.h"
  19 #include "unicode/uchar.h"
  20 #include "unicode/usetiter.h"
  21 #include "unicode/ustring.h"
  22 #include "unicode/parsepos.h"
  23 #include "unicode/symtable.h"
  24 #include "unicode/uversion.h"
  25 #include "hash.h"
  26
  27 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
  28     dataerrln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \
  29     u_errorName(status));}}
  30
  31 #define TEST_ASSERT(expr) {if (!(expr)) { \
  32     dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); }}
  33
  34 UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
  35     UnicodeString pat;
  36     set.toPattern(pat);
  37     return left + UnicodeSetTest::escape(pat);
  38 }
  39
  40 #define CASE(id,test) case id:                          \
  41                           name = #test;                 \
  42                           if (exec) {                   \
  43                               logln(#test "---");       \
  44                               logln();                  \
  45                               test();                   \
  46                           }                             \
  47                           break
  48
  49 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
  50 }
  51
  52 UConverter *UnicodeSetTest::openUTF8Converter() {
  53     if(utf8Cnv==NULL) {
  54         UErrorCode errorCode=U_ZERO_ERROR;
  55         utf8Cnv=ucnv_open("UTF-8", &errorCode);
  56     }
  57     return utf8Cnv;
  58 }
  59
  60 UnicodeSetTest::~UnicodeSetTest() {
  61     ucnv_close(utf8Cnv);
  62 }
  63
  64 void
  65 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
  66                                const char* &name, char* /*par*/) {
  67     // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
  68     switch (index) {
  69         CASE(0,TestPatterns);
  70         CASE(1,TestAddRemove);
  71         CASE(2,TestCategories);
  72         CASE(3,TestCloneEqualHash);
  73         CASE(4,TestMinimalRep);
  74         CASE(5,TestAPI);
  75         CASE(6,TestScriptSet);
  76         CASE(7,TestPropertySet);
  77         CASE(8,TestClone);
  78         CASE(9,TestExhaustive);
  79         CASE(10,TestToPattern);
  80         CASE(11,TestIndexOf);
  81         CASE(12,TestStrings);
  82         CASE(13,Testj2268);
  83         CASE(14,TestCloseOver);
  84         CASE(15,TestEscapePattern);
  85         CASE(16,TestInvalidCodePoint);
  86         CASE(17,TestSymbolTable);
  87         CASE(18,TestSurrogate);
  88         CASE(19,TestPosixClasses);
  89         CASE(20,TestIteration);
  90         CASE(21,TestFreezable);
  91         CASE(22,TestSpan);
  92         CASE(23,TestStringSpan);
  93         default: name = ""; break;
  94     }
  95 }
  96
  97 static const char NOT[] = "%%%%";
  98
  99 /**
 100  * UVector was improperly copying contents
 101  * This code will crash this is still true
 102  */
 103 void UnicodeSetTest::Testj2268() {
 104   UnicodeSet t;
 105   t.add(UnicodeString("abc"));
 106   UnicodeSet test(t);
 107   UnicodeString ustrPat;
 108   test.toPattern(ustrPat, TRUE);
 109 }
 110
 111 /**
 112  * Test toPattern().
 113  */
 114 void UnicodeSetTest::TestToPattern() {
 115     UErrorCode ec = U_ZERO_ERROR;
 116
 117     // Test that toPattern() round trips with syntax characters and
 118     // whitespace.
 119     {
 120         static const char* OTHER_TOPATTERN_TESTS[] = {
 121             "[[:latin:]&[:greek:]]",
 122             "[[:latin:]-[:greek:]]",
 123             "[:nonspacing mark:]",
 124             NULL
 125         };
 126
 127         for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
 128             ec = U_ZERO_ERROR;
 129             UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
 130             if (U_FAILURE(ec)) {
 131                 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
 132                 continue;
 133             }
 134             checkPat(OTHER_TOPATTERN_TESTS[j], s);
 135         }
 136
 137         for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
 138             if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
 139
 140                 // check various combinations to make sure they all work.
 141                 if (i != 0 && !toPatternAux(i, i)){
 142                     continue;
 143                 }
 144                 if (!toPatternAux(0, i)){
 145                     continue;
 146                 }
 147                 if (!toPatternAux(i, 0xFFFF)){
 148                     continue;
 149                 }
 150             }
 151         }
 152     }
 153
 154     // Test pattern behavior of multicharacter strings.
 155     {
 156         ec = U_ZERO_ERROR;
 157         UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
 158
 159         // This loop isn't a loop.  It's here to make the compiler happy.
 160         // If you're curious, try removing it and changing the 'break'
 161         // statements (except for the last) to goto's.
 162         for (;;) {
 163             if (U_FAILURE(ec)) break;
 164             const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
 165             expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
 166
 167             s->add("ac");
 168             const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
 169             expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
 170
 171             s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
 172             if (U_FAILURE(ec)) break;
 173             const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
 174             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
 175
 176             s->add("[]");
 177             const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
 178             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
 179
 180             s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
 181             if (U_FAILURE(ec)) break;
 182             const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
 183             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
 184
 185             // j2189
 186             s->clear();
 187             s->add(UnicodeString("abc", ""));
 188             s->add(UnicodeString("abc", ""));
 189             const char* exp6[] = {"abc", NOT, "ab", NULL};
 190             expectToPattern(*s, "[{abc}]", exp6);
 191
 192             break;
 193         }
 194
 195         if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
 196         delete s;
 197     }
 198
 199     // JB#3400: For 2 character ranges prefer [ab] to [a-b]
 200     UnicodeSet s;
 201     s.add((UChar)97, (UChar)98); // 'a', 'b'
 202     expectToPattern(s, "[ab]", NULL);
 203 }
 204
 205 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
 206
 207     // use Integer.toString because Utility.hex doesn't handle ints
 208     UnicodeString pat = "";
 209     // TODO do these in hex
 210     //String source = "0x" + Integer.toString(start,16).toUpperCase();
 211     //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
 212     UnicodeString source;
 213     source = source + (uint32_t)start;
 214     if (start != end)
 215         source = source + ".." + (uint32_t)end;
 216     UnicodeSet testSet;
 217     testSet.add(start, end);
 218     return checkPat(source, testSet);
 219 }
 220
 221 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
 222                                const UnicodeSet& testSet) {
 223     // What we want to make sure of is that a pattern generated
 224     // by toPattern(), with or without escaped unprintables, can
 225     // be passed back into the UnicodeSet constructor.
 226     UnicodeString pat0;
 227
 228     testSet.toPattern(pat0, TRUE);
 229
 230     if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
 231
 232     //String pat1 = unescapeLeniently(pat0);
 233     //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
 234
 235     UnicodeString pat2;
 236     testSet.toPattern(pat2, FALSE);
 237     if (!checkPat(source, testSet, pat2)) return FALSE;
 238
 239     //String pat3 = unescapeLeniently(pat2);
 240     // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
 241
 242     //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
 243     logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
 244     return TRUE;
 245 }
 246
 247 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
 248                                const UnicodeSet& testSet,
 249                                const UnicodeString& pat) {
 250     UErrorCode ec = U_ZERO_ERROR;
 251     UnicodeSet testSet2(pat, ec);
 252     if (testSet2 != testSet) {
 253         errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
 254         return FALSE;
 255     }
 256     return TRUE;
 257 }
 258
 259 void
 260 UnicodeSetTest::TestPatterns(void) {
 261     UnicodeSet set;
 262     expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""),  "km");
 263     expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""),  "aczz");
 264     expectPattern(set, UnicodeString("[a\\-z]", ""),  "--aazz");
 265     expectPattern(set, UnicodeString("[-az]", ""),  "--aazz");
 266     expectPattern(set, UnicodeString("[az-]", ""),  "--aazz");
 267     expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
 268
 269     // Throw in a test of complement
 270     set.complement();
 271     UnicodeString exp;
 272     exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
 273     expectPairs(set, exp);
 274 }
 275
 276 void
 277 UnicodeSetTest::TestCategories(void) {
 278     UErrorCode status = U_ZERO_ERROR;
 279     const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
 280     UnicodeSet set(pat, status);
 281     if (U_FAILURE(status)) {
 282         dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
 283         return;
 284     } else {
 285         expectContainment(set, pat, "ABC", "abc");
 286     }
 287
 288     UChar32 i;
 289     int32_t failures = 0;
 290     // Make sure generation of L doesn't pollute cached Lu set
 291     // First generate L, then Lu
 292     set.applyPattern("[:L:]", status);
 293     if (U_FAILURE(status)) { errln("FAIL"); return; }
 294     for (i=0; i<0x200; ++i) {
 295         UBool l = u_isalpha((UChar)i);
 296         if (l != set.contains(i)) {
 297             errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
 298                   set.contains(i));
 299             if (++failures == 10) break;
 300         }
 301     }
 302
 303     set.applyPattern("[:Lu:]", status);
 304     if (U_FAILURE(status)) { errln("FAIL"); return; }
 305     for (i=0; i<0x200; ++i) {
 306         UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
 307         if (lu != set.contains(i)) {
 308             errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
 309                   set.contains(i));
 310             if (++failures == 20) break;
 311         }
 312     }
 313 }
 314 void
 315 UnicodeSetTest::TestCloneEqualHash(void) {
 316     UErrorCode status = U_ZERO_ERROR;
 317     // set1 and set2 used to be built with the obsolete constructor taking
 318     // UCharCategory values; replaced with pattern constructors
 319     // markus 20030502
 320     UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); //  :Ll: Letter, lowercase
 321     UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); //  Letter, lowercase
 322     if (U_FAILURE(status)){
 323         dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
 324         return;
 325     }
 326     UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status);   //Number, Decimal digit
 327     UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status);   //Number, Decimal digit
 328     if (U_FAILURE(status)){
 329         errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
 330         return;
 331     }
 332
 333     if (*set1 != *set1a) {
 334         errln("FAIL: category constructor for Ll broken");
 335     }
 336     if (*set2 != *set2a) {
 337         errln("FAIL: category constructor for Nd broken");
 338     }
 339     delete set1a;
 340     delete set2a;
 341
 342     logln("Testing copy construction");
 343     UnicodeSet *set1copy=new UnicodeSet(*set1);
 344     if(*set1 != *set1copy || *set1 == *set2 ||
 345         getPairs(*set1) != getPairs(*set1copy) ||
 346         set1->hashCode() != set1copy->hashCode()){
 347         errln("FAIL : Error in copy construction");
 348         return;
 349     }
 350
 351     logln("Testing =operator");
 352     UnicodeSet set1equal=*set1;
 353     UnicodeSet set2equal=*set2;
 354     if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
 355         set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
 356         errln("FAIL: Error in =operator");
 357     }
 358
 359     logln("Testing clone()");
 360     UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
 361     UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
 362     if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
 363         *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
 364         *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
 365         errln("FAIL: Error in clone");
 366     }
 367
 368     logln("Testing hashcode");
 369     if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
 370         set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
 371         set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
 372         set1->hashCode() == set2->hashCode()  || set1copy->hashCode() == set2->hashCode() ||
 373         set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
 374         errln("FAIL: Error in hashCode()");
 375     }
 376
 377     delete set1;
 378     delete set1copy;
 379     delete set2;
 380     delete set1clone;
 381     delete set2clone;
 382
 383
 384 }
 385 void
 386 UnicodeSetTest::TestAddRemove(void) {
 387     UnicodeSet set; // Construct empty set
 388     doAssert(set.isEmpty() == TRUE, "set should be empty");
 389     doAssert(set.size() == 0, "size should be 0");
 390     set.complement();
 391     doAssert(set.size() == 0x110000, "size should be 0x110000");
 392     set.clear();
 393     set.add(0x0061, 0x007a);
 394     expectPairs(set, "az");
 395     doAssert(set.isEmpty() == FALSE, "set should not be empty");
 396     doAssert(set.size() != 0, "size should not be equal to 0");
 397     doAssert(set.size() == 26, "size should be equal to 26");
 398     set.remove(0x006d, 0x0070);
 399     expectPairs(set, "alqz");
 400     doAssert(set.size() == 22, "size should be equal to 22");
 401     set.remove(0x0065, 0x0067);
 402     expectPairs(set, "adhlqz");
 403     doAssert(set.size() == 19, "size should be equal to 19");
 404     set.remove(0x0064, 0x0069);
 405     expectPairs(set, "acjlqz");
 406     doAssert(set.size() == 16, "size should be equal to 16");
 407     set.remove(0x0063, 0x0072);
 408     expectPairs(set, "absz");
 409     doAssert(set.size() == 10, "size should be equal to 10");
 410     set.add(0x0066, 0x0071);
 411     expectPairs(set, "abfqsz");
 412     doAssert(set.size() == 22, "size should be equal to 22");
 413     set.remove(0x0061, 0x0067);
 414     expectPairs(set, "hqsz");
 415     set.remove(0x0061, 0x007a);
 416     expectPairs(set, "");
 417     doAssert(set.isEmpty() == TRUE, "set should be empty");
 418     doAssert(set.size() == 0, "size should be 0");
 419     set.add(0x0061);
 420     doAssert(set.isEmpty() == FALSE, "set should not be empty");
 421     doAssert(set.size() == 1, "size should not be equal to 1");
 422     set.add(0x0062);
 423     set.add(0x0063);
 424     expectPairs(set, "ac");
 425     doAssert(set.size() == 3, "size should not be equal to 3");
 426     set.add(0x0070);
 427     set.add(0x0071);
 428     expectPairs(set, "acpq");
 429     doAssert(set.size() == 5, "size should not be equal to 5");
 430     set.clear();
 431     expectPairs(set, "");
 432     doAssert(set.isEmpty() == TRUE, "set should be empty");
 433     doAssert(set.size() == 0, "size should be 0");
 434
 435     // Try removing an entire set from another set
 436     expectPattern(set, "[c-x]", "cx");
 437     UnicodeSet set2;
 438     expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
 439     set.removeAll(set2);
 440     expectPairs(set, "deluxx");
 441
 442     // Try adding an entire set to another set
 443     expectPattern(set, "[jackiemclean]", "aacceein");
 444     expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
 445     set.addAll(set2);
 446     expectPairs(set, "aacehort");
 447     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
 448
 449     // Try retaining an set of elements contained in another set (intersection)
 450     UnicodeSet set3;
 451     expectPattern(set3, "[a-c]", "ac");
 452     doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
 453     set3.remove(0x0062);
 454     expectPairs(set3, "aacc");
 455     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
 456     set.retainAll(set3);
 457     expectPairs(set, "aacc");
 458     doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
 459     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
 460     set.clear();
 461     doAssert(set.size() != set3.size(), "set.size() != set3.size()");
 462
 463     // Test commutativity
 464     expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
 465     expectPattern(set2, "[jackiemclean]", "aacceein");
 466     set.addAll(set2);
 467     expectPairs(set, "aacehort");
 468     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
 469
 470
 471
 472
 473 }
 474
 475 /**
 476  * Make sure minimal representation is maintained.
 477  */
 478 void UnicodeSetTest::TestMinimalRep() {
 479     UErrorCode status = U_ZERO_ERROR;
 480     // This is pretty thoroughly tested by checkCanonicalRep()
 481     // run against the exhaustive operation results.  Use the code
 482     // here for debugging specific spot problems.
 483
 484     // 1 overlap against 2
 485     UnicodeSet set("[h-km-q]", status);
 486     if (U_FAILURE(status)) { errln("FAIL"); return; }
 487     UnicodeSet set2("[i-o]", status);
 488     if (U_FAILURE(status)) { errln("FAIL"); return; }
 489     set.addAll(set2);
 490     expectPairs(set, "hq");
 491     // right
 492     set.applyPattern("[a-m]", status);
 493     if (U_FAILURE(status)) { errln("FAIL"); return; }
 494     set2.applyPattern("[e-o]", status);
 495     if (U_FAILURE(status)) { errln("FAIL"); return; }
 496     set.addAll(set2);
 497     expectPairs(set, "ao");
 498     // left
 499     set.applyPattern("[e-o]", status);
 500     if (U_FAILURE(status)) { errln("FAIL"); return; }
 501     set2.applyPattern("[a-m]", status);
 502     if (U_FAILURE(status)) { errln("FAIL"); return; }
 503     set.addAll(set2);
 504     expectPairs(set, "ao");
 505     // 1 overlap against 3
 506     set.applyPattern("[a-eg-mo-w]", status);
 507     if (U_FAILURE(status)) { errln("FAIL"); return; }
 508     set2.applyPattern("[d-q]", status);
 509     if (U_FAILURE(status)) { errln("FAIL"); return; }
 510     set.addAll(set2);
 511     expectPairs(set, "aw");
 512 }
 513
 514 void UnicodeSetTest::TestAPI() {
 515     UErrorCode status = U_ZERO_ERROR;
 516     // default ct
 517     UnicodeSet set;
 518     if (!set.isEmpty() || set.getRangeCount() != 0) {
 519         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
 520               set);
 521     }
 522
 523     // clear(), isEmpty()
 524     set.add(0x0061);
 525     if (set.isEmpty()) {
 526         errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
 527               set);
 528     }
 529     set.clear();
 530     if (!set.isEmpty()) {
 531         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
 532               set);
 533     }
 534
 535     // size()
 536     set.clear();
 537     if (set.size() != 0) {
 538         errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
 539               ": " + set);
 540     }
 541     set.add(0x0061);
 542     if (set.size() != 1) {
 543         errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
 544               ": " + set);
 545     }
 546     set.add(0x0031, 0x0039);
 547     if (set.size() != 10) {
 548         errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
 549               ": " + set);
 550     }
 551
 552     // contains(first, last)
 553     set.clear();
 554     set.applyPattern("[A-Y 1-8 b-d l-y]", status);
 555     if (U_FAILURE(status)) { errln("FAIL"); return; }
 556     for (int32_t i = 0; i<set.getRangeCount(); ++i) {
 557         UChar32 a = set.getRangeStart(i);
 558         UChar32 b = set.getRangeEnd(i);
 559         if (!set.contains(a, b)) {
 560             errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
 561                   " but doesn't: " + set);
 562         }
 563         if (set.contains((UChar32)(a-1), b)) {
 564             errln((UnicodeString)"FAIL, shouldn't contain " +
 565                   (unsigned short)(a-1) + '-' + (unsigned short)b +
 566                   " but does: " + set);
 567         }
 568         if (set.contains(a, (UChar32)(b+1))) {
 569             errln((UnicodeString)"FAIL, shouldn't contain " +
 570                   (unsigned short)a + '-' + (unsigned short)(b+1) +
 571                   " but does: " + set);
 572         }
 573     }
 574
 575     // Ported InversionList test.
 576     UnicodeSet a((UChar32)3,(UChar32)10);
 577     UnicodeSet b((UChar32)7,(UChar32)15);
 578     UnicodeSet c;
 579
 580     logln((UnicodeString)"a [3-10]: " + a);
 581     logln((UnicodeString)"b [7-15]: " + b);
 582     c = a;
 583     c.addAll(b);
 584     UnicodeSet exp((UChar32)3,(UChar32)15);
 585     if (c == exp) {
 586         logln((UnicodeString)"c.set(a).add(b): " + c);
 587     } else {
 588         errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
 589     }
 590     c.complement();
 591     exp.set((UChar32)0, (UChar32)2);
 592     exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
 593     if (c == exp) {
 594         logln((UnicodeString)"c.complement(): " + c);
 595     } else {
 596         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
 597     }
 598     c.complement();
 599     exp.set((UChar32)3, (UChar32)15);
 600     if (c == exp) {
 601         logln((UnicodeString)"c.complement(): " + c);
 602     } else {
 603         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
 604     }
 605     c = a;
 606     c.complementAll(b);
 607     exp.set((UChar32)3,(UChar32)6);
 608     exp.add((UChar32)11,(UChar32) 15);
 609     if (c == exp) {
 610         logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
 611     } else {
 612         errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
 613     }
 614
 615     exp = c;
 616     bitsToSet(setToBits(c), c);
 617     if (c == exp) {
 618         logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
 619     } else {
 620         errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
 621     }
 622
 623     // Additional tests for coverage JB#2118
 624     //UnicodeSet::complement(class UnicodeString const &)
 625     //UnicodeSet::complementAll(class UnicodeString const &)
 626     //UnicodeSet::containsNone(class UnicodeSet const &)
 627     //UnicodeSet::containsNone(long,long)
 628     //UnicodeSet::containsSome(class UnicodeSet const &)
 629     //UnicodeSet::containsSome(long,long)
 630     //UnicodeSet::removeAll(class UnicodeString const &)
 631     //UnicodeSet::retain(long)
 632     //UnicodeSet::retainAll(class UnicodeString const &)
 633     //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
 634     //UnicodeSetIterator::getString(void)
 635     set.clear();
 636     set.complement("ab");
 637     exp.applyPattern("[{ab}]", status);
 638     if (U_FAILURE(status)) { errln("FAIL"); return; }
 639     if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
 640
 641     UnicodeSetIterator iset(set);
 642     if (!iset.next() || !iset.isString()) {
 643         errln("FAIL: UnicodeSetIterator::next/isString");
 644     } else if (iset.getString() != "ab") {
 645         errln("FAIL: UnicodeSetIterator::getString");
 646     }
 647
 648     set.add((UChar32)0x61, (UChar32)0x7A);
 649     set.complementAll("alan");
 650     exp.applyPattern("[{ab}b-kmo-z]", status);
 651     if (U_FAILURE(status)) { errln("FAIL"); return; }
 652     if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
 653
 654     exp.applyPattern("[a-z]", status);
 655     if (U_FAILURE(status)) { errln("FAIL"); return; }
 656     if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
 657     if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
 658     exp.applyPattern("[aln]", status);
 659     if (U_FAILURE(status)) { errln("FAIL"); return; }
 660     if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
 661     if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
 662
 663     if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
 664         errln("FAIL: containsNone(UChar32, UChar32)");
 665     }
 666     if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
 667         errln("FAIL: containsSome(UChar32, UChar32)");
 668     }
 669     if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
 670         errln("FAIL: containsNone(UChar32, UChar32)");
 671     }
 672     if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
 673         errln("FAIL: containsSome(UChar32, UChar32)");
 674     }
 675
 676     set.removeAll("liu");
 677     exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
 678     if (U_FAILURE(status)) { errln("FAIL"); return; }
 679     if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
 680
 681     set.retainAll("star");
 682     exp.applyPattern("[rst]", status);
 683     if (U_FAILURE(status)) { errln("FAIL"); return; }
 684     if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
 685
 686     set.retain((UChar32)0x73);
 687     exp.applyPattern("[s]", status);
 688     if (U_FAILURE(status)) { errln("FAIL"); return; }
 689     if (set != exp) { errln("FAIL: retain('s')"); return; }
 690
 691     uint16_t buf[32];
 692     int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status);
 693     if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
 694     if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
 695         errln("FAIL: serialize");
 696         return;
 697     }
 698
 699     // Conversions to and from USet
 700     UnicodeSet *uniset = &set;
 701     USet *uset = uniset->toUSet();
 702     TEST_ASSERT((void *)uset == (void *)uniset);
 703     UnicodeSet *setx = UnicodeSet::fromUSet(uset);
 704     TEST_ASSERT((void *)setx == (void *)uset);
 705     const UnicodeSet *constSet = uniset;
 706     const USet *constUSet = constSet->toUSet();
 707     TEST_ASSERT((void *)constUSet == (void *)constSet);
 708     const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
 709     TEST_ASSERT((void *)constSetx == (void *)constUSet);
 710
 711     // span(UnicodeString) and spanBack(UnicodeString) convenience methods
 712     UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
 713     UnicodeSet ac(0x61, 0x63);
 714     ac.remove(0x62).freeze();
 715     if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
 716         ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
 717         ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
 718         ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
 719         ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
 720         ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
 721         ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
 722         ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
 723         ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
 724         ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
 725     ) {
 726         errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
 727     }
 728     if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
 729         ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
 730         ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
 731         ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
 732         ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
 733         ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
 734         ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
 735         ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
 736         ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
 737         ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
 738     ) {
 739         errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
 740     }
 741 }
 742
 743 void UnicodeSetTest::TestIteration() {
 744     UErrorCode ec = U_ZERO_ERROR;
 745     int i = 0;
 746     int outerLoop;
 747
 748     // 6 code points, 3 ranges, 2 strings, 8 total elements
 749     //   Iteration will access them in sorted order -  a, b, c, y, z, U0001abcd, "str1", "str2"
 750     UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
 751     TEST_ASSERT_SUCCESS(ec);
 752     UnicodeSetIterator it(set);
 753
 754     for (outerLoop=0; outerLoop<3; outerLoop++) {
 755         // Run the test multiple times, to check that iterator.reset() is working.
 756         for (i=0; i<10; i++) {
 757             UBool         nextv        = it.next();
 758             UBool         isString     = it.isString();
 759             int32_t       codePoint    = it.getCodepoint();
 760             //int32_t       codePointEnd = it.getCodepointEnd();
 761             UnicodeString s   = it.getString();
 762             switch (i) {
 763             case 0:
 764                 TEST_ASSERT(nextv == TRUE);
 765                 TEST_ASSERT(isString == FALSE);
 766                 TEST_ASSERT(codePoint==0x61);
 767                 TEST_ASSERT(s == "a");
 768                 break;
 769             case 1:
 770                 TEST_ASSERT(nextv == TRUE);
 771                 TEST_ASSERT(isString == FALSE);
 772                 TEST_ASSERT(codePoint==0x62);
 773                 TEST_ASSERT(s == "b");
 774                 break;
 775             case 2:
 776                 TEST_ASSERT(nextv == TRUE);
 777                 TEST_ASSERT(isString == FALSE);
 778                 TEST_ASSERT(codePoint==0x63);
 779                 TEST_ASSERT(s == "c");
 780                 break;
 781             case 3:
 782                 TEST_ASSERT(nextv == TRUE);
 783                 TEST_ASSERT(isString == FALSE);
 784                 TEST_ASSERT(codePoint==0x79);
 785                 TEST_ASSERT(s == "y");
 786                 break;
 787             case 4:
 788                 TEST_ASSERT(nextv == TRUE);
 789                 TEST_ASSERT(isString == FALSE);
 790                 TEST_ASSERT(codePoint==0x7a);
 791                 TEST_ASSERT(s == "z");
 792                 break;
 793             case 5:
 794                 TEST_ASSERT(nextv == TRUE);
 795                 TEST_ASSERT(isString == FALSE);
 796                 TEST_ASSERT(codePoint==0x1abcd);
 797                 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
 798                 break;
 799             case 6:
 800                 TEST_ASSERT(nextv == TRUE);
 801                 TEST_ASSERT(isString == TRUE);
 802                 TEST_ASSERT(s == "str1");
 803                 break;
 804             case 7:
 805                 TEST_ASSERT(nextv == TRUE);
 806                 TEST_ASSERT(isString == TRUE);
 807                 TEST_ASSERT(s == "str2");
 808                 break;
 809             case 8:
 810                 TEST_ASSERT(nextv == FALSE);
 811                 break;
 812             case 9:
 813                 TEST_ASSERT(nextv == FALSE);
 814                 break;
 815             }
 816         }
 817         it.reset();  // prepare to run the iteration again.
 818     }
 819 }
 820
 821
 822
 823
 824 void UnicodeSetTest::TestStrings() {
 825     UErrorCode ec = U_ZERO_ERROR;
 826
 827     UnicodeSet* testList[] = {
 828         UnicodeSet::createFromAll("abc"),
 829         new UnicodeSet("[a-c]", ec),
 830
 831         &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
 832         new UnicodeSet("[{ll}{ch}a-z]", ec),
 833
 834         UnicodeSet::createFrom("ab}c"),
 835         new UnicodeSet("[{ab\\}c}]", ec),
 836
 837         &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
 838         new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
 839
 840         NULL
 841     };
 842
 843     if (U_FAILURE(ec)) {
 844         errln("FAIL: couldn't construct test sets");
 845     }
 846
 847     for (int32_t i = 0; testList[i] != NULL; i+=2) {
 848         if (U_SUCCESS(ec)) {
 849             UnicodeString pat0, pat1;
 850             testList[i]->toPattern(pat0, TRUE);
 851             testList[i+1]->toPattern(pat1, TRUE);
 852             if (*testList[i] == *testList[i+1]) {
 853                 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
 854             } else {
 855                 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
 856             }
 857         }
 858         delete testList[i];
 859         delete testList[i+1];
 860     }
 861 }
 862
 863 /**
 864  * Test the [:Latin:] syntax.
 865  */
 866 void UnicodeSetTest::TestScriptSet() {
 867     expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
 868
 869     expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
 870
 871     /* Jitterbug 1423 */
 872     expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
 873
 874 }
 875
 876 /**
 877  * Test the [:Latin:] syntax.
 878  */
 879 void UnicodeSetTest::TestPropertySet() {
 880     static const char* const DATA[] = {
 881         // Pattern, Chars IN, Chars NOT in
 882
 883         "[:Latin:]",
 884         "aA",
 885         "\\u0391\\u03B1",
 886
 887         "[\\p{Greek}]",
 888         "\\u0391\\u03B1",
 889         "aA",
 890
 891         "\\P{ GENERAL Category = upper case letter }",
 892         "abc",
 893         "ABC",
 894
 895 #if !UCONFIG_NO_NORMALIZATION
 896         // Combining class: @since ICU 2.2
 897         // Check both symbolic and numeric
 898         "\\p{ccc=Nukta}",
 899         "\\u0ABC",
 900         "abc",
 901
 902         "\\p{Canonical Combining Class = 11}",
 903         "\\u05B1",
 904         "\\u05B2",
 905
 906         "[:c c c = iota subscript :]",
 907         "\\u0345",
 908         "xyz",
 909 #endif
 910
 911         // Bidi class: @since ICU 2.2
 912         "\\p{bidiclass=lefttoright}",
 913         "abc",
 914         "\\u0671\\u0672",
 915
 916         // Binary properties: @since ICU 2.2
 917         "\\p{ideographic}",
 918         "\\u4E0A",
 919         "x",
 920
 921         "[:math=false:]",
 922         "q)*(",
 923         // weiv: )(and * were removed from math in Unicode 4.0.1
 924         //"(*+)",
 925         "+<>^",
 926
 927         // JB#1767 \N{}, \p{ASCII}
 928         "[:Ascii:]",
 929         "abc\\u0000\\u007F",
 930         "\\u0080\\u4E00",
 931
 932         "[\\N{ latin small letter  a  }[:name= latin small letter z:]]",
 933         "az",
 934         "qrs",
 935
 936         // JB#2015
 937         "[:any:]",
 938         "a\\U0010FFFF",
 939         "",
 940
 941         "[:nv=0.5:]",
 942         "\\u00BD\\u0F2A",
 943         "\\u00BC",
 944
 945         // JB#2653: Age
 946         "[:Age=1.1:]",
 947         "\\u03D6", // 1.1
 948         "\\u03D8\\u03D9", // 3.2
 949
 950         "[:Age=3.1:]",
 951         "\\u1800\\u3400\\U0002f800",
 952         "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
 953
 954         // JB#2350: Case_Sensitive
 955         "[:Case Sensitive:]",
 956         "A\\u1FFC\\U00010410",
 957         ";\\u00B4\\U00010500",
 958
 959         // JB#2832: C99-compatibility props
 960         "[:blank:]",
 961         " \\u0009",
 962         "1-9A-Z",
 963
 964         "[:graph:]",
 965         "19AZ",
 966         " \\u0003\\u0007\\u0009\\u000A\\u000D",
 967
 968         "[:punct:]",
 969         "!@#%&*()[]{}-_\\/;:,.?'\"",
 970         "09azAZ",
 971
 972         "[:xdigit:]",
 973         "09afAF",
 974         "gG!",
 975
 976         // Regex compatibility test
 977         "[-b]", // leading '-' is literal
 978         "-b",
 979         "ac",
 980
 981         "[^-b]", // leading '-' is literal
 982         "ac",
 983         "-b",
 984
 985         "[b-]", // trailing '-' is literal
 986         "-b",
 987         "ac",
 988
 989         "[^b-]", // trailing '-' is literal
 990         "ac",
 991         "-b",
 992
 993         "[a-b-]", // trailing '-' is literal
 994         "ab-",
 995         "c=",
 996
 997         "[[a-q]&[p-z]-]", // trailing '-' is literal
 998         "pq-",
 999         "or=",
1000
1001         "[\\s|\\)|:|$|\\>]", // from regex tests
1002         "s|):$>",
1003         "abc",
1004
1005         "[\\uDC00cd]", // JB#2906: isolated trail at start
1006         "cd\\uDC00",
1007         "ab\\uD800\\U00010000",
1008
1009         "[ab\\uD800]", // JB#2906: isolated trail at start
1010         "ab\\uD800",
1011         "cd\\uDC00\\U00010000",
1012
1013         "[ab\\uD800cd]", // JB#2906: isolated lead in middle
1014         "abcd\\uD800",
1015         "ef\\uDC00\\U00010000",
1016
1017         "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
1018         "abcd\\uDC00",
1019         "ef\\uD800\\U00010000",
1020
1021 #if !UCONFIG_NO_NORMALIZATION
1022         "[:^lccc=0:]", // Lead canonical class
1023         "\\u0300\\u0301",
1024         "abcd\\u00c0\\u00c5",
1025
1026         "[:^tccc=0:]", // Trail canonical class
1027         "\\u0300\\u0301\\u00c0\\u00c5",
1028         "abcd",
1029
1030         "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
1031         "\\u0300\\u0301\\u00c0\\u00c5",
1032         "abcd",
1033
1034         "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
1035         "",
1036         "abcd\\u0300\\u0301\\u00c0\\u00c5",
1037
1038         "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
1039         "\\u0F73\\u0F75\\u0F81",
1040         "abcd\\u0300\\u0301\\u00c0\\u00c5",
1041 #endif /* !UCONFIG_NO_NORMALIZATION */
1042
1043         "[:Assigned:]",
1044         "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
1045         "\\u0888\\uFDD3\\uFFFE\\U00050005",
1046
1047         // Script_Extensions, new in Unicode 6.0
1048         "[:scx=Arab:]",
1049         "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
1050         "\\u061D\\uFDEF\\uFDFE",
1051
1052         // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
1053         // so scx-sc is missing U+FDF2.
1054         "[[:Script_Extensions=Arabic:]-[:Arab:]]",
1055         "\\u0640\\u064B\\u0650\\u0655",
1056         "\\uFDF2"
1057     };
1058
1059     static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);
1060
1061     for (int32_t i=0; i<DATA_LEN; i+=3) {
1062         expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
1063                           CharsToUnicodeString(DATA[i+2]));
1064     }
1065 }
1066
1067 /**
1068   * Test that Posix style character classes [:digit:], etc.
1069   *   have the Unicode definitions from TR 18.
1070   */
1071 void UnicodeSetTest::TestPosixClasses() {
1072     {
1073         UErrorCode status = U_ZERO_ERROR;
1074         UnicodeSet s1("[:alpha:]", status);
1075         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
1076         TEST_ASSERT_SUCCESS(status);
1077         TEST_ASSERT(s1==s2);
1078     }
1079     {
1080         UErrorCode status = U_ZERO_ERROR;
1081         UnicodeSet s1("[:lower:]", status);
1082         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
1083         TEST_ASSERT_SUCCESS(status);
1084         TEST_ASSERT(s1==s2);
1085     }
1086     {
1087         UErrorCode status = U_ZERO_ERROR;
1088         UnicodeSet s1("[:upper:]", status);
1089         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
1090         TEST_ASSERT_SUCCESS(status);
1091         TEST_ASSERT(s1==s2);
1092     }
1093     {
1094         UErrorCode status = U_ZERO_ERROR;
1095         UnicodeSet s1("[:punct:]", status);
1096         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
1097         TEST_ASSERT_SUCCESS(status);
1098         TEST_ASSERT(s1==s2);
1099     }
1100     {
1101         UErrorCode status = U_ZERO_ERROR;
1102         UnicodeSet s1("[:digit:]", status);
1103         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
1104         TEST_ASSERT_SUCCESS(status);
1105         TEST_ASSERT(s1==s2);
1106     }
1107     {
1108         UErrorCode status = U_ZERO_ERROR;
1109         UnicodeSet s1("[:xdigit:]", status);
1110         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
1111         TEST_ASSERT_SUCCESS(status);
1112         TEST_ASSERT(s1==s2);
1113     }
1114     {
1115         UErrorCode status = U_ZERO_ERROR;
1116         UnicodeSet s1("[:alnum:]", status);
1117         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
1118         TEST_ASSERT_SUCCESS(status);
1119         TEST_ASSERT(s1==s2);
1120     }
1121     {
1122         UErrorCode status = U_ZERO_ERROR;
1123         UnicodeSet s1("[:space:]", status);
1124         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
1125         TEST_ASSERT_SUCCESS(status);
1126         TEST_ASSERT(s1==s2);
1127     }
1128     {
1129         UErrorCode status = U_ZERO_ERROR;
1130         UnicodeSet s1("[:blank:]", status);
1131         TEST_ASSERT_SUCCESS(status);
1132         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
1133             status);
1134         TEST_ASSERT_SUCCESS(status);
1135         TEST_ASSERT(s1==s2);
1136     }
1137     {
1138         UErrorCode status = U_ZERO_ERROR;
1139         UnicodeSet s1("[:cntrl:]", status);
1140         TEST_ASSERT_SUCCESS(status);
1141         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
1142         TEST_ASSERT_SUCCESS(status);
1143         TEST_ASSERT(s1==s2);
1144     }
1145     {
1146         UErrorCode status = U_ZERO_ERROR;
1147         UnicodeSet s1("[:graph:]", status);
1148         TEST_ASSERT_SUCCESS(status);
1149         UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
1150         TEST_ASSERT_SUCCESS(status);
1151         TEST_ASSERT(s1==s2);
1152     }
1153     {
1154         UErrorCode status = U_ZERO_ERROR;
1155         UnicodeSet s1("[:print:]", status);
1156         TEST_ASSERT_SUCCESS(status);
1157         UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
1158         TEST_ASSERT_SUCCESS(status);
1159         TEST_ASSERT(s1==s2);
1160     }
1161 }
1162 /**
1163  * Test cloning of UnicodeSet.  For C++, we test the copy constructor.
1164  */
1165 void UnicodeSetTest::TestClone() {
1166     UErrorCode ec = U_ZERO_ERROR;
1167     UnicodeSet s("[abcxyz]", ec);
1168     UnicodeSet t(s);
1169     expectContainment(t, "abc", "def");
1170 }
1171
1172 /**
1173  * Test the indexOf() and charAt() methods.
1174  */
1175 void UnicodeSetTest::TestIndexOf() {
1176     UErrorCode ec = U_ZERO_ERROR;
1177     UnicodeSet set("[a-cx-y3578]", ec);
1178     if (U_FAILURE(ec)) {
1179         errln("FAIL: UnicodeSet constructor");
1180         return;
1181     }
1182     for (int32_t i=0; i<set.size(); ++i) {
1183         UChar32 c = set.charAt(i);
1184         if (set.indexOf(c) != i) {
1185             errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1186                 i, c, set.indexOf(c));
1187         }
1188     }
1189     UChar32 c = set.charAt(set.size());
1190     if (c != -1) {
1191         errln("FAIL: charAt(<out of range>) = %X", c);
1192     }
1193     int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
1194     if (j != -1) {
1195         errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1196     }
1197 }
1198
1199 /**
1200  * Test closure API.
1201  */
1202 void UnicodeSetTest::TestCloseOver() {
1203     UErrorCode ec = U_ZERO_ERROR;
1204
1205     char CASE[] = {(char)USET_CASE_INSENSITIVE};
1206     char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1207     const char* DATA[] = {
1208         // selector, input, output
1209         CASE,
1210         "[aq\\u00DF{Bc}{bC}{Fi}]",
1211         "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]",  // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
1212
1213         CASE,
1214         "[\\u01F1]", // 'DZ'
1215         "[\\u01F1\\u01F2\\u01F3]",
1216
1217         CASE,
1218         "[\\u1FB4]",
1219         "[\\u1FB4{\\u03AC\\u03B9}]",
1220
1221         CASE,
1222         "[{F\\uFB01}]",
1223         "[\\uFB03{ffi}]",
1224
1225         CASE, // make sure binary search finds limits
1226         "[a\\uFF3A]",
1227         "[aA\\uFF3A\\uFF5A]",
1228
1229         CASE,
1230         "[a-z]","[A-Za-z\\u017F\\u212A]",
1231         CASE,
1232         "[abc]","[A-Ca-c]",
1233         CASE,
1234         "[ABC]","[A-Ca-c]",
1235
1236         CASE, "[i]", "[iI]",
1237
1238         CASE, "[\\u0130]",          "[\\u0130{i\\u0307}]", // dotted I
1239         CASE, "[{i\\u0307}]",       "[\\u0130{i\\u0307}]", // i with dot
1240
1241         CASE, "[\\u0131]",          "[\\u0131]", // dotless i
1242
1243         CASE, "[\\u0390]",          "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1244
1245         CASE, "[\\u03c2]",          "[\\u03a3\\u03c2\\u03c3]", // sigmas
1246
1247         CASE, "[\\u03f2]",          "[\\u03f2\\u03f9]", // lunate sigmas
1248
1249         CASE, "[\\u03f7]",          "[\\u03f7\\u03f8]",
1250
1251         CASE, "[\\u1fe3]",          "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1252
1253         CASE, "[\\ufb05]",          "[\\ufb05\\ufb06{st}]",
1254         CASE, "[{st}]",             "[\\ufb05\\ufb06{st}]",
1255
1256         CASE, "[\\U0001044F]",      "[\\U00010427\\U0001044F]",
1257
1258         CASE, "[{a\\u02BE}]",       "[\\u1E9A{a\\u02BE}]", // first in sorted table
1259
1260         CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1261
1262 #if !UCONFIG_NO_FILE_IO
1263         CASE_MAPPINGS,
1264         "[aq\\u00DF{Bc}{bC}{Fi}]",
1265         "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1266 #endif
1267
1268         CASE_MAPPINGS,
1269         "[\\u01F1]", // 'DZ'
1270         "[\\u01F1\\u01F2\\u01F3]",
1271
1272         CASE_MAPPINGS,
1273         "[a-z]",
1274         "[A-Za-z]",
1275
1276         NULL
1277     };
1278
1279     UnicodeSet s;
1280     UnicodeSet t;
1281     UnicodeString buf;
1282     for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1283         int32_t selector = DATA[i][0];
1284         UnicodeString pat(DATA[i+1], -1, US_INV);
1285         UnicodeString exp(DATA[i+2], -1, US_INV);
1286         s.applyPattern(pat, ec);
1287         s.closeOver(selector);
1288         t.applyPattern(exp, ec);
1289         if (U_FAILURE(ec)) {
1290             errln("FAIL: applyPattern failed");
1291             continue;
1292         }
1293         if (s == t) {
1294             logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1295         } else {
1296             dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1297                   s.toPattern(buf, TRUE) + ", expected " + exp);
1298         }
1299     }
1300
1301 #if 0
1302     /*
1303      * Unused test code.
1304      * This was used to compare the old implementation (using USET_CASE)
1305      * with the new one (using 0x100 temporarily)
1306      * while transitioning from hardcoded case closure tables in uniset.cpp
1307      * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1308      * and using ucase.c functions for closure.
1309      * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1310      *
1311      * Note: The old and new implementation never fully matched because
1312      * the old implementation turned out to not map U+0130 and U+0131 correctly
1313      * (dotted I and dotless i) and because the old implementation's data tables
1314      * were outdated compared to Unicode 4.0.1 at the time of the change to the
1315      * new implementation. (So sigmas and some other characters were not handled
1316      * according to the newer Unicode version.)
1317      */
1318     UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1319     UnicodeSetIterator si(sens);
1320     UnicodeString str, buf2;
1321     const UnicodeString *pStr;
1322     UChar32 c;
1323     while(si.next()) {
1324         if(!si.isString()) {
1325             c=si.getCodepoint();
1326             s.clear();
1327             s.add(c);
1328
1329             str.setTo(c);
1330             str.foldCase();
1331             sens2.add(str);
1332
1333             t=s;
1334             s.closeOver(USET_CASE);
1335             t.closeOver(0x100);
1336             if(s!=t) {
1337                 errln("FAIL: closeOver(U+%04x) differs: ", c);
1338                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1339             }
1340         }
1341     }
1342     // remove all code points
1343     // should contain all full case folding mapping strings
1344     sens2.remove(0, 0x10ffff);
1345     si.reset(sens2);
1346     while(si.next()) {
1347         if(si.isString()) {
1348             pStr=&si.getString();
1349             s.clear();
1350             s.add(*pStr);
1351             t=s2=s;
1352             s.closeOver(USET_CASE);
1353             t.closeOver(0x100);
1354             if(s!=t) {
1355                 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1356                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1357             }
1358         }
1359     }
1360 #endif
1361
1362     // Test the pattern API
1363     s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1364     if (U_FAILURE(ec)) {
1365         errln("FAIL: applyPattern failed");
1366     } else {
1367         expectContainment(s, "abcABC", "defDEF");
1368     }
1369     UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1370     if (U_FAILURE(ec)) {
1371         errln("FAIL: constructor failed");
1372     } else {
1373         expectContainment(v, "defDEF", "abcABC");
1374     }
1375     UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1376     if (U_FAILURE(ec)) {
1377         errln("FAIL: construct w/case mappings failed");
1378     } else {
1379         expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1380     }
1381 }
1382
1383 void UnicodeSetTest::TestEscapePattern() {
1384     const char pattern[] =
1385         "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1386     const char exp[] =
1387         "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1388     // We test this with two passes; in the second pass we
1389     // pre-unescape the pattern.  Since U+200E is Pattern_White_Space,
1390     // this fails -- which is what we expect.
1391     for (int32_t pass=1; pass<=2; ++pass) {
1392         UErrorCode ec = U_ZERO_ERROR;
1393         UnicodeString pat(pattern, -1, US_INV);
1394         if (pass==2) {
1395             pat = pat.unescape();
1396         }
1397         // Pattern is only good for pass 1
1398         UBool isPatternValid = (pass==1);
1399
1400         UnicodeSet set(pat, ec);
1401         if (U_SUCCESS(ec) != isPatternValid){
1402             errln((UnicodeString)"FAIL: applyPattern(" +
1403                   escape(pat) + ") => " +
1404                   u_errorName(ec));
1405             continue;
1406         }
1407         if (U_FAILURE(ec)) {
1408             continue;
1409         }
1410         if (set.contains((UChar)0x0644)){
1411             errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1412         }
1413
1414         UnicodeString newpat;
1415         set.toPattern(newpat, TRUE);
1416         if (newpat == UnicodeString(exp, -1, US_INV)) {
1417             logln(escape(pat) + " => " + newpat);
1418         } else {
1419             errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1420         }
1421
1422         for (int32_t i=0; i<set.getRangeCount(); ++i) {
1423             UnicodeString str("Range ");
1424             str.append((UChar)(0x30 + i))
1425                 .append(": ")
1426                 .append((UChar32)set.getRangeStart(i))
1427                 .append(" - ")
1428                 .append((UChar32)set.getRangeEnd(i));
1429             str = str + " (" + set.getRangeStart(i) + " - " +
1430                 set.getRangeEnd(i) + ")";
1431             if (set.getRangeStart(i) < 0) {
1432                 errln((UnicodeString)"FAIL: " + escape(str));
1433             } else {
1434                 logln(escape(str));
1435             }
1436         }
1437     }
1438 }
1439
1440 void UnicodeSetTest::expectRange(const UnicodeString& label,
1441                                  const UnicodeSet& set,
1442                                  UChar32 start, UChar32 end) {
1443     UnicodeSet exp(start, end);
1444     UnicodeString pat;
1445     if (set == exp) {
1446         logln(label + " => " + set.toPattern(pat, TRUE));
1447     } else {
1448         UnicodeString xpat;
1449         errln((UnicodeString)"FAIL: " + label + " => " +
1450               set.toPattern(pat, TRUE) +
1451               ", expected " + exp.toPattern(xpat, TRUE));
1452     }
1453 }
1454
1455 void UnicodeSetTest::TestInvalidCodePoint() {
1456
1457     const UChar32 DATA[] = {
1458         // Test range             Expected range
1459         0, 0x10FFFF,              0, 0x10FFFF,
1460         (UChar32)-1, 8,           0, 8,
1461         8, 0x110000,              8, 0x10FFFF
1462     };
1463     const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]);
1464
1465     UnicodeString pat;
1466     int32_t i;
1467
1468     for (i=0; i<DATA_LENGTH; i+=4) {
1469         UChar32 start  = DATA[i];
1470         UChar32 end    = DATA[i+1];
1471         UChar32 xstart = DATA[i+2];
1472         UChar32 xend   = DATA[i+3];
1473
1474         // Try various API using the test code points
1475
1476         UnicodeSet set(start, end);
1477         expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1478                     set, xstart, xend);
1479
1480         set.clear();
1481         set.set(start, end);
1482         expectRange((UnicodeString)"set(" + start + "," + end + ")",
1483                     set, xstart, xend);
1484
1485         UBool b = set.contains(start);
1486         b = set.contains(start, end);
1487         b = set.containsNone(start, end);
1488         b = set.containsSome(start, end);
1489         (void)b;   // Suppress set but not used warning.
1490
1491         /*int32_t index = set.indexOf(start);*/
1492
1493         set.clear();
1494         set.add(start);
1495         set.add(start, end);
1496         expectRange((UnicodeString)"add(" + start + "," + end + ")",
1497                     set, xstart, xend);
1498
1499         set.set(0, 0x10FFFF);
1500         set.retain(start, end);
1501         expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1502                     set, xstart, xend);
1503         set.retain(start);
1504
1505         set.set(0, 0x10FFFF);
1506         set.remove(start);
1507         set.remove(start, end);
1508         set.complement();
1509         expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1510                     set, xstart, xend);
1511
1512         set.set(0, 0x10FFFF);
1513         set.complement(start, end);
1514         set.complement();
1515         expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1516                     set, xstart, xend);
1517         set.complement(start);
1518     }
1519
1520     const UChar32 DATA2[] = {
1521         0,
1522         0x10FFFF,
1523         (UChar32)-1,
1524         0x110000
1525     };
1526     const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]);
1527
1528     for (i=0; i<DATA2_LENGTH; ++i) {
1529         UChar32 c = DATA2[i], end = 0x10FFFF;
1530         UBool valid = (c >= 0 && c <= 0x10FFFF);
1531
1532         UnicodeSet set(0, 0x10FFFF);
1533
1534         // For single-codepoint contains, invalid codepoints are NOT contained
1535         UBool b = set.contains(c);
1536         if (b == valid) {
1537             logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1538                   ") = " + b);
1539         } else {
1540             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1541                   ") = " + b);
1542         }
1543
1544         // For codepoint range contains, containsNone, and containsSome,
1545         // invalid or empty (start > end) ranges have UNDEFINED behavior.
1546         b = set.contains(c, end);
1547         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1548               "," + end + ") = " + b);
1549
1550         b = set.containsNone(c, end);
1551         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1552               "," + end + ") = " + b);
1553
1554         b = set.containsSome(c, end);
1555         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1556               "," + end + ") = " + b);
1557
1558         int32_t index = set.indexOf(c);
1559         if ((index >= 0) == valid) {
1560             logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1561                   ") = " + index);
1562         } else {
1563             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1564                   ") = " + index);
1565         }
1566     }
1567 }
1568
1569 // Used by TestSymbolTable
1570 class TokenSymbolTable : public SymbolTable {
1571 public:
1572     Hashtable contents;
1573
1574     TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1575         contents.setValueDeleter(uprv_deleteUObject);
1576     }
1577
1578     ~TokenSymbolTable() {}
1579
1580     /**
1581      * (Non-SymbolTable API) Add the given variable and value to
1582      * the table.  Variable should NOT contain leading '$'.
1583      */
1584     void add(const UnicodeString& var, const UnicodeString& value,
1585              UErrorCode& ec) {
1586         if (U_SUCCESS(ec)) {
1587             contents.put(var, new UnicodeString(value), ec);
1588         }
1589     }
1590
1591     /**
1592      * SymbolTable API
1593      */
1594     virtual const UnicodeString* lookup(const UnicodeString& s) const {
1595         return (const UnicodeString*) contents.get(s);
1596     }
1597
1598     /**
1599      * SymbolTable API
1600      */
1601     virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1602         return NULL;
1603     }
1604
1605     /**
1606      * SymbolTable API
1607      */
1608     virtual UnicodeString parseReference(const UnicodeString& text,
1609                                          ParsePosition& pos, int32_t limit) const {
1610         int32_t start = pos.getIndex();
1611         int32_t i = start;
1612         UnicodeString result;
1613         while (i < limit) {
1614             UChar c = text.charAt(i);
1615             if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1616                 break;
1617             }
1618             ++i;
1619         }
1620         if (i == start) { // No valid name chars
1621             return result; // Indicate failure with empty string
1622         }
1623         pos.setIndex(i);
1624         text.extractBetween(start, i, result);
1625         return result;
1626     }
1627 };
1628
1629 void UnicodeSetTest::TestSymbolTable() {
1630     // Multiple test cases can be set up here.  Each test case
1631     // is terminated by null:
1632     // var, value, var, value,..., input pat., exp. output pat., null
1633     const char* DATA[] = {
1634         "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1635         "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1636         "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1637         NULL
1638     };
1639
1640     for (int32_t i=0; DATA[i]!=NULL; ++i) {
1641         UErrorCode ec = U_ZERO_ERROR;
1642         TokenSymbolTable sym(ec);
1643         if (U_FAILURE(ec)) {
1644             errln("FAIL: couldn't construct TokenSymbolTable");
1645             continue;
1646         }
1647
1648         // Set up variables
1649         while (DATA[i+2] != NULL) {
1650             sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
1651             if (U_FAILURE(ec)) {
1652                 errln("FAIL: couldn't add to TokenSymbolTable");
1653                 continue;
1654             }
1655             i += 2;
1656         }
1657
1658         // Input pattern and expected output pattern
1659         UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
1660         i += 2;
1661
1662         ParsePosition pos(0);
1663         UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1664         if (U_FAILURE(ec)) {
1665             errln("FAIL: couldn't construct UnicodeSet");
1666             continue;
1667         }
1668
1669         // results
1670         if (pos.getIndex() != inpat.length()) {
1671             errln((UnicodeString)"Failed to read to end of string \""
1672                   + inpat + "\": read to "
1673                   + pos.getIndex() + ", length is "
1674                   + inpat.length());
1675         }
1676
1677         UnicodeSet us2(exppat, ec);
1678         if (U_FAILURE(ec)) {
1679             errln("FAIL: couldn't construct expected UnicodeSet");
1680             continue;
1681         }
1682
1683         UnicodeString a, b;
1684         if (us != us2) {
1685             errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1686                   ", expected " + us2.toPattern(b, TRUE));
1687         } else {
1688             logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1689         }
1690     }
1691 }
1692
1693 void UnicodeSetTest::TestSurrogate() {
1694     const char* DATA[] = {
1695         // These should all behave identically
1696         "[abc\\uD800\\uDC00]",
1697         // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1698         "[abc\\U00010000]",
1699         0
1700     };
1701     for (int i=0; DATA[i] != 0; ++i) {
1702         UErrorCode ec = U_ZERO_ERROR;
1703         logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
1704         UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
1705         UnicodeSet set(str, ec);
1706         if (U_FAILURE(ec)) {
1707             errln("FAIL: UnicodeSet constructor");
1708             continue;
1709         }
1710         expectContainment(set,
1711                           CharsToUnicodeString("abc\\U00010000"),
1712                           CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1713         if (set.size() != 4) {
1714             errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
1715                   set.size() + ", expected 4");
1716         }
1717     }
1718 }
1719
1720 void UnicodeSetTest::TestExhaustive() {
1721     // exhaustive tests. Simulate UnicodeSets with integers.
1722     // That gives us very solid tests (except for large memory tests).
1723
1724     int32_t limit = 128;
1725
1726     UnicodeSet x, y, z, aa;
1727
1728     for (int32_t i = 0; i < limit; ++i) {
1729         bitsToSet(i, x);
1730         logln((UnicodeString)"Testing " + i + ", " + x);
1731         _testComplement(i, x, y);
1732
1733         // AS LONG AS WE ARE HERE, check roundtrip
1734         checkRoundTrip(bitsToSet(i, aa));
1735
1736         for (int32_t j = 0; j < limit; ++j) {
1737             _testAdd(i,j,  x,y,z);
1738             _testXor(i,j,  x,y,z);
1739             _testRetain(i,j,  x,y,z);
1740             _testRemove(i,j,  x,y,z);
1741         }
1742     }
1743 }
1744
1745 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1746     bitsToSet(a, x);
1747     z = x;
1748     z.complement();
1749     int32_t c = setToBits(z);
1750     if (c != (~a)) {
1751         errln((UnicodeString)"FAILED: add: ~" + x +  " != " + z);
1752         errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1753     }
1754     checkCanonicalRep(z, (UnicodeString)"complement " + a);
1755 }
1756
1757 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1758     bitsToSet(a, x);
1759     bitsToSet(b, y);
1760     z = x;
1761     z.addAll(y);
1762     int32_t c = setToBits(z);
1763     if (c != (a | b)) {
1764         errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1765         errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1766     }
1767     checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1768 }
1769
1770 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1771     bitsToSet(a, x);
1772     bitsToSet(b, y);
1773     z = x;
1774     z.retainAll(y);
1775     int32_t c = setToBits(z);
1776     if (c != (a & b)) {
1777         errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1778         errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1779     }
1780     checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1781 }
1782
1783 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1784     bitsToSet(a, x);
1785     bitsToSet(b, y);
1786     z = x;
1787     z.removeAll(y);
1788     int32_t c = setToBits(z);
1789     if (c != (a &~ b)) {
1790         errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1791         errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1792     }
1793     checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1794 }
1795
1796 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1797     bitsToSet(a, x);
1798     bitsToSet(b, y);
1799     z = x;
1800     z.complementAll(y);
1801     int32_t c = setToBits(z);
1802     if (c != (a ^ b)) {
1803         errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1804         errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1805     }
1806     checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1807 }
1808
1809 /**
1810  * Check that ranges are monotonically increasing and non-
1811  * overlapping.
1812  */
1813 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1814     int32_t n = set.getRangeCount();
1815     if (n < 0) {
1816         errln((UnicodeString)"FAIL result of " + msg +
1817               ": range count should be >= 0 but is " +
1818               n /*+ " for " + set.toPattern())*/);
1819         return;
1820     }
1821     UChar32 last = 0;
1822     for (int32_t i=0; i<n; ++i) {
1823         UChar32 start = set.getRangeStart(i);
1824         UChar32 end = set.getRangeEnd(i);
1825         if (start > end) {
1826             errln((UnicodeString)"FAIL result of " + msg +
1827                   ": range " + (i+1) +
1828                   " start > end: " + (int)start + ", " + (int)end +
1829                   " for " + set);
1830         }
1831         if (i > 0 && start <= last) {
1832             errln((UnicodeString)"FAIL result of " + msg +
1833                   ": range " + (i+1) +
1834                   " overlaps previous range: " + (int)start + ", " + (int)end +
1835                   " for " + set);
1836         }
1837         last = end;
1838     }
1839 }
1840
1841 /**
1842  * Convert a bitmask to a UnicodeSet.
1843  */
1844 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1845     result.clear();
1846     for (UChar32 i = 0; i < 32; ++i) {
1847         if ((a & (1<<i)) != 0) {
1848             result.add(i);
1849         }
1850     }
1851     return result;
1852 }
1853
1854 /**
1855  * Convert a UnicodeSet to a bitmask.  Only the characters
1856  * U+0000 to U+0020 are represented in the bitmask.
1857  */
1858 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1859     int32_t result = 0;
1860     for (int32_t i = 0; i < 32; ++i) {
1861         if (x.contains((UChar32)i)) {
1862             result |= (1<<i);
1863         }
1864     }
1865     return result;
1866 }
1867
1868 /**
1869  * Return the representation of an inversion list based UnicodeSet
1870  * as a pairs list.  Ranges are listed in ascending Unicode order.
1871  * For example, the set [a-zA-M3] is represented as "33AMaz".
1872  */
1873 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1874     UnicodeString pairs;
1875     for (int32_t i=0; i<set.getRangeCount(); ++i) {
1876         UChar32 start = set.getRangeStart(i);
1877         UChar32 end = set.getRangeEnd(i);
1878         if (end > 0xFFFF) {
1879             end = 0xFFFF;
1880             i = set.getRangeCount(); // Should be unnecessary
1881         }
1882         pairs.append((UChar)start).append((UChar)end);
1883     }
1884     return pairs;
1885 }
1886
1887 /**
1888  * Basic consistency check for a few items.
1889  * That the iterator works, and that we can create a pattern and
1890  * get the same thing back
1891  */
1892 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1893     UErrorCode ec = U_ZERO_ERROR;
1894
1895     UnicodeSet t(s);
1896     checkEqual(s, t, "copy ct");
1897
1898     t = s;
1899     checkEqual(s, t, "operator=");
1900
1901     copyWithIterator(t, s, FALSE);
1902     checkEqual(s, t, "iterator roundtrip");
1903
1904     copyWithIterator(t, s, TRUE); // try range
1905     checkEqual(s, t, "iterator roundtrip");
1906
1907     UnicodeString pat; s.toPattern(pat, FALSE);
1908     t.applyPattern(pat, ec);
1909     if (U_FAILURE(ec)) {
1910         errln("FAIL: applyPattern");
1911         return;
1912     } else {
1913         checkEqual(s, t, "toPattern(false)");
1914     }
1915
1916     s.toPattern(pat, TRUE);
1917     t.applyPattern(pat, ec);
1918     if (U_FAILURE(ec)) {
1919         errln("FAIL: applyPattern");
1920         return;
1921     } else {
1922         checkEqual(s, t, "toPattern(true)");
1923     }
1924 }
1925
1926 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
1927     t.clear();
1928     UnicodeSetIterator it(s);
1929     if (withRange) {
1930         while (it.nextRange()) {
1931             if (it.isString()) {
1932                 t.add(it.getString());
1933             } else {
1934                 t.add(it.getCodepoint(), it.getCodepointEnd());
1935             }
1936         }
1937     } else {
1938         while (it.next()) {
1939             if (it.isString()) {
1940                 t.add(it.getString());
1941             } else {
1942                 t.add(it.getCodepoint());
1943             }
1944         }
1945     }
1946 }
1947
1948 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
1949     UnicodeString source; s.toPattern(source, TRUE);
1950     UnicodeString result; t.toPattern(result, TRUE);
1951     if (s != t) {
1952         errln((UnicodeString)"FAIL: " + message
1953               + "; source = " + source
1954               + "; result = " + result
1955               );
1956         return FALSE;
1957     } else {
1958         logln((UnicodeString)"Ok: " + message
1959               + "; source = " + source
1960               + "; result = " + result
1961               );
1962     }
1963     return TRUE;
1964 }
1965
1966 void
1967 UnicodeSetTest::expectContainment(const UnicodeString& pat,
1968                                   const UnicodeString& charsIn,
1969                                   const UnicodeString& charsOut) {
1970     UErrorCode ec = U_ZERO_ERROR;
1971     UnicodeSet set(pat, ec);
1972     if (U_FAILURE(ec)) {
1973         dataerrln((UnicodeString)"FAIL: pattern \"" +
1974               pat + "\" => " + u_errorName(ec));
1975         return;
1976     }
1977     expectContainment(set, pat, charsIn, charsOut);
1978 }
1979
1980 void
1981 UnicodeSetTest::expectContainment(const UnicodeSet& set,
1982                                   const UnicodeString& charsIn,
1983                                   const UnicodeString& charsOut) {
1984     UnicodeString pat;
1985     set.toPattern(pat);
1986     expectContainment(set, pat, charsIn, charsOut);
1987 }
1988
1989 void
1990 UnicodeSetTest::expectContainment(const UnicodeSet& set,
1991                                   const UnicodeString& setName,
1992                                   const UnicodeString& charsIn,
1993                                   const UnicodeString& charsOut) {
1994     UnicodeString bad;
1995     UChar32 c;
1996     int32_t i;
1997
1998     for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
1999         c = charsIn.char32At(i);
2000         if (!set.contains(c)) {
2001             bad.append(c);
2002         }
2003     }
2004     if (bad.length() > 0) {
2005         errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
2006               ", expected containment of " + prettify(charsIn));
2007     } else {
2008         logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
2009     }
2010
2011     bad.truncate(0);
2012     for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
2013         c = charsOut.char32At(i);
2014         if (set.contains(c)) {
2015             bad.append(c);
2016         }
2017     }
2018     if (bad.length() > 0) {
2019         errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
2020               ", expected non-containment of " + prettify(charsOut));
2021     } else {
2022         logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
2023     }
2024 }
2025
2026 void
2027 UnicodeSetTest::expectPattern(UnicodeSet& set,
2028                               const UnicodeString& pattern,
2029                               const UnicodeString& expectedPairs){
2030     UErrorCode status = U_ZERO_ERROR;
2031     set.applyPattern(pattern, status);
2032     if (U_FAILURE(status)) {
2033         errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2034               "\") failed");
2035         return;
2036     } else {
2037         if (getPairs(set) != expectedPairs ) {
2038             errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2039                   "\") => pairs \"" +
2040                   escape(getPairs(set)) + "\", expected \"" +
2041                   escape(expectedPairs) + "\"");
2042         } else {
2043             logln(UnicodeString("Ok:   applyPattern(\"") + pattern +
2044                   "\") => pairs \"" +
2045                   escape(getPairs(set)) + "\"");
2046         }
2047     }
2048     // the result of calling set.toPattern(), which is the string representation of
2049     // this set(set), is passed to a  UnicodeSet constructor, and tested that it
2050     // will produce another set that is equal to this one.
2051     UnicodeString temppattern;
2052     set.toPattern(temppattern);
2053     UnicodeSet *tempset=new UnicodeSet(temppattern, status);
2054     if (U_FAILURE(status)) {
2055         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
2056         return;
2057     }
2058     if(*tempset != set || getPairs(*tempset) != getPairs(set)){
2059         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
2060             escape(getPairs(set)) + "\""));
2061     } else{
2062         logln(UnicodeString("Ok:   applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2063     }
2064
2065     delete tempset;
2066
2067 }
2068
2069 void
2070 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2071     if (getPairs(set) != expectedPairs) {
2072         errln(UnicodeString("FAIL: Expected pair list \"") +
2073               escape(expectedPairs) + "\", got \"" +
2074               escape(getPairs(set)) + "\"");
2075     }
2076 }
2077
2078 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2079                                      const UnicodeString& expPat,
2080                                      const char** expStrings) {
2081     UnicodeString pat;
2082     set.toPattern(pat, TRUE);
2083     if (pat == expPat) {
2084         logln((UnicodeString)"Ok:   toPattern() => \"" + pat + "\"");
2085     } else {
2086         errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2087         return;
2088     }
2089     if (expStrings == NULL) {
2090         return;
2091     }
2092     UBool in = TRUE;
2093     for (int32_t i=0; expStrings[i] != NULL; ++i) {
2094         if (expStrings[i] == NOT) { // sic; pointer comparison
2095             in = FALSE;
2096             continue;
2097         }
2098         UnicodeString s = CharsToUnicodeString(expStrings[i]);
2099         UBool contained = set.contains(s);
2100         if (contained == in) {
2101             logln((UnicodeString)"Ok: " + expPat +
2102                   (contained ? " contains {" : " does not contain {") +
2103                   escape(expStrings[i]) + "}");
2104         } else {
2105             errln((UnicodeString)"FAIL: " + expPat +
2106                   (contained ? " contains {" : " does not contain {") +
2107                   escape(expStrings[i]) + "}");
2108         }
2109     }
2110 }
2111
2112 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
2113
2114 void
2115 UnicodeSetTest::doAssert(UBool condition, const char *message)
2116 {
2117     if (!condition) {
2118         errln(UnicodeString("ERROR : ") + message);
2119     }
2120 }
2121
2122 UnicodeString
2123 UnicodeSetTest::escape(const UnicodeString& s) {
2124     UnicodeString buf;
2125     for (int32_t i=0; i<s.length(); )
2126     {
2127         UChar32 c = s.char32At(i);
2128         if (0x0020 <= c && c <= 0x007F) {
2129             buf += c;
2130         } else {
2131             if (c <= 0xFFFF) {
2132                 buf += (UChar)0x5c; buf += (UChar)0x75;
2133             } else {
2134                 buf += (UChar)0x5c; buf += (UChar)0x55;
2135                 buf += toHexString((c & 0xF0000000) >> 28);
2136                 buf += toHexString((c & 0x0F000000) >> 24);
2137                 buf += toHexString((c & 0x00F00000) >> 20);
2138                 buf += toHexString((c & 0x000F0000) >> 16);
2139             }
2140             buf += toHexString((c & 0xF000) >> 12);
2141             buf += toHexString((c & 0x0F00) >> 8);
2142             buf += toHexString((c & 0x00F0) >> 4);
2143             buf += toHexString(c & 0x000F);
2144         }
2145         i += U16_LENGTH(c);
2146     }
2147     return buf;
2148 }
2149
2150 void UnicodeSetTest::TestFreezable() {
2151     UErrorCode errorCode=U_ZERO_ERROR;
2152     UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
2153     UnicodeSet idSet(idPattern, errorCode);
2154     if(U_FAILURE(errorCode)) {
2155         dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
2156         return;
2157     }
2158
2159     UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
2160     UnicodeSet wsSet(wsPattern, errorCode);
2161     if(U_FAILURE(errorCode)) {
2162         dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
2163         return;
2164     }
2165
2166     idSet.add(idPattern);
2167     UnicodeSet frozen(idSet);
2168     frozen.freeze();
2169
2170     if(idSet.isFrozen() || !frozen.isFrozen()) {
2171         errln("FAIL: isFrozen() is wrong");
2172     }
2173     if(frozen!=idSet || !(frozen==idSet)) {
2174         errln("FAIL: a copy-constructed frozen set differs from its original");
2175     }
2176
2177     frozen=wsSet;
2178     if(frozen!=idSet || !(frozen==idSet)) {
2179         errln("FAIL: a frozen set was modified by operator=");
2180     }
2181
2182     UnicodeSet frozen2(frozen);
2183     if(frozen2!=frozen || frozen2!=idSet) {
2184         errln("FAIL: a copied frozen set differs from its frozen original");
2185     }
2186     if(!frozen2.isFrozen()) {
2187         errln("FAIL: copy-constructing a frozen set results in a thawed one");
2188     }
2189     UnicodeSet frozen3(5, 55);  // Set to some values to really test assignment below, not copy construction.
2190     if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
2191         errln("FAIL: UnicodeSet(5, 55) failed");
2192     }
2193     frozen3=frozen;
2194     if(!frozen3.isFrozen()) {
2195         errln("FAIL: copying a frozen set results in a thawed one");
2196     }
2197
2198     UnicodeSet *cloned=(UnicodeSet *)frozen.clone();
2199     if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
2200         errln("FAIL: clone() failed");
2201     }
2202     cloned->add(0xd802, 0xd805);
2203     if(cloned->containsSome(0xd802, 0xd805)) {
2204         errln("FAIL: unable to modify clone");
2205     }
2206     delete cloned;
2207
2208     UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed();
2209     if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
2210         errln("FAIL: cloneAsThawed() failed");
2211     }
2212     thawed->add(0xd802, 0xd805);
2213     if(!thawed->contains(0xd802, 0xd805)) {
2214         errln("FAIL: unable to modify thawed clone");
2215     }
2216     delete thawed;
2217
2218     frozen.set(5, 55);
2219     if(frozen!=idSet || !(frozen==idSet)) {
2220         errln("FAIL: UnicodeSet::set() modified a frozen set");
2221     }
2222
2223     frozen.clear();
2224     if(frozen!=idSet || !(frozen==idSet)) {
2225         errln("FAIL: UnicodeSet::clear() modified a frozen set");
2226     }
2227
2228     frozen.closeOver(USET_CASE_INSENSITIVE);
2229     if(frozen!=idSet || !(frozen==idSet)) {
2230         errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
2231     }
2232
2233     frozen.compact();
2234     if(frozen!=idSet || !(frozen==idSet)) {
2235         errln("FAIL: UnicodeSet::compact() modified a frozen set");
2236     }
2237
2238     ParsePosition pos;
2239     frozen.
2240         applyPattern(wsPattern, errorCode).
2241         applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
2242         applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
2243         applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
2244         applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
2245     if(frozen!=idSet || !(frozen==idSet)) {
2246         errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
2247     }
2248
2249     frozen.
2250         add(0xd800).
2251         add(0xd802, 0xd805).
2252         add(wsPattern).
2253         addAll(idPattern).
2254         addAll(wsSet);
2255     if(frozen!=idSet || !(frozen==idSet)) {
2256         errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
2257     }
2258
2259     frozen.
2260         retain(0x62).
2261         retain(0x64, 0x69).
2262         retainAll(wsPattern).
2263         retainAll(wsSet);
2264     if(frozen!=idSet || !(frozen==idSet)) {
2265         errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
2266     }
2267
2268     frozen.
2269         remove(0x62).
2270         remove(0x64, 0x69).
2271         remove(idPattern).
2272         removeAll(idPattern).
2273         removeAll(idSet);
2274     if(frozen!=idSet || !(frozen==idSet)) {
2275         errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
2276     }
2277
2278     frozen.
2279         complement().
2280         complement(0x62).
2281         complement(0x64, 0x69).
2282         complement(idPattern).
2283         complementAll(idPattern).
2284         complementAll(idSet);
2285     if(frozen!=idSet || !(frozen==idSet)) {
2286         errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
2287     }
2288 }
2289
2290 // Test span() etc. -------------------------------------------------------- ***
2291
2292 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2293 static int32_t
2294 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
2295     UErrorCode errorCode=U_ZERO_ERROR;
2296     int32_t length8=0;
2297     u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2298     if(U_SUCCESS(errorCode)) {
2299         return length8;
2300     } else {
2301         // The string contains an unpaired surrogate.
2302         // Ignore this string.
2303         return 0;
2304     }
2305 }
2306
2307 class UnicodeSetWithStringsIterator;
2308
2309 // Make the strings in a UnicodeSet easily accessible.
2310 class UnicodeSetWithStrings {
2311 public:
2312     UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2313             set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
2314         int32_t size=set.size();
2315         if(size>0 && set.charAt(size-1)<0) {
2316             // If a set's last element is not a code point, then it must contain strings.
2317             // Iterate over the set, skip all code point ranges, and cache the strings.
2318             // Convert them to UTF-8 for spanUTF8().
2319             UnicodeSetIterator iter(set);
2320             const UnicodeString *s;
2321             char *s8=utf8;
2322             int32_t length8, utf8Count=0;
2323             while(iter.nextRange() && stringsLength<UPRV_LENGTHOF(strings)) {
2324                 if(iter.isString()) {
2325                     // Store the pointer to the set's string element
2326                     // which we happen to know is a stable pointer.
2327                     strings[stringsLength]=s=&iter.getString();
2328                     utf8Count+=
2329                         utf8Lengths[stringsLength]=length8=
2330                         appendUTF8(s->getBuffer(), s->length(),
2331                                    s8, (int32_t)(sizeof(utf8)-utf8Count));
2332                     if(length8==0) {
2333                         hasSurrogates=TRUE;  // Contains unpaired surrogates.
2334                     }
2335                     s8+=length8;
2336                     ++stringsLength;
2337                 }
2338             }
2339         }
2340     }
2341
2342     const UnicodeSet &getSet() const {
2343         return set;
2344     }
2345
2346     UBool hasStrings() const {
2347         return (UBool)(stringsLength>0);
2348     }
2349
2350     UBool hasStringsWithSurrogates() const {
2351         return hasSurrogates;
2352     }
2353
2354 private:
2355     friend class UnicodeSetWithStringsIterator;
2356
2357     const UnicodeSet &set;
2358
2359     const UnicodeString *strings[20];
2360     int32_t stringsLength;
2361     UBool hasSurrogates;
2362
2363     char utf8[1024];
2364     int32_t utf8Lengths[20];
2365 };
2366
2367 class UnicodeSetWithStringsIterator {
2368 public:
2369     UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2370             fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2371     }
2372
2373     void reset() {
2374         nextStringIndex=nextUTF8Start=0;
2375     }
2376
2377     const UnicodeString *nextString() {
2378         if(nextStringIndex<fSet.stringsLength) {
2379             return fSet.strings[nextStringIndex++];
2380         } else {
2381             return NULL;
2382         }
2383     }
2384
2385     // Do not mix with calls to nextString().
2386     const char *nextUTF8(int32_t &length) {
2387         if(nextStringIndex<fSet.stringsLength) {
2388             const char *s8=fSet.utf8+nextUTF8Start;
2389             nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2390             return s8;
2391         } else {
2392             length=0;
2393             return NULL;
2394         }
2395     }
2396
2397 private:
2398     const UnicodeSetWithStrings &fSet;
2399     int32_t nextStringIndex;
2400     int32_t nextUTF8Start;
2401 };
2402
2403 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
2404 // at code point boundaries.
2405 // That is, each edge of a match must not be in the middle of a surrogate pair.
2406 static inline UBool
2407 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
2408     s+=start;
2409     limit-=start;
2410     int32_t length=t.length();
2411     return 0==t.compare(s, length) &&
2412            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2413            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2414 }
2415
2416 // Implement span() with contains() for comparison.
2417 static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2418                                  USetSpanCondition spanCondition) {
2419     const UnicodeSet &realSet(set.getSet());
2420     if(!set.hasStrings()) {
2421         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2422             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2423         }
2424
2425         UChar32 c;
2426         int32_t start=0, prev;
2427         while((prev=start)<length) {
2428             U16_NEXT(s, start, length, c);
2429             if(realSet.contains(c)!=spanCondition) {
2430                 break;
2431             }
2432         }
2433         return prev;
2434     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2435         UnicodeSetWithStringsIterator iter(set);
2436         UChar32 c;
2437         int32_t start, next;
2438         for(start=next=0; start<length;) {
2439             U16_NEXT(s, next, length, c);
2440             if(realSet.contains(c)) {
2441                 break;
2442             }
2443             const UnicodeString *str;
2444             iter.reset();
2445             while((str=iter.nextString())!=NULL) {
2446                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2447                     // spanNeedsStrings=TRUE;
2448                     return start;
2449                 }
2450             }
2451             start=next;
2452         }
2453         return start;
2454     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2455         UnicodeSetWithStringsIterator iter(set);
2456         UChar32 c;
2457         int32_t start, next, maxSpanLimit=0;
2458         for(start=next=0; start<length;) {
2459             U16_NEXT(s, next, length, c);
2460             if(!realSet.contains(c)) {
2461                 next=start;  // Do not span this single, not-contained code point.
2462             }
2463             const UnicodeString *str;
2464             iter.reset();
2465             while((str=iter.nextString())!=NULL) {
2466                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2467                     // spanNeedsStrings=TRUE;
2468                     int32_t matchLimit=start+str->length();
2469                     if(matchLimit==length) {
2470                         return length;
2471                     }
2472                     if(spanCondition==USET_SPAN_CONTAINED) {
2473                         // Iterate for the shortest match at each position.
2474                         // Recurse for each but the shortest match.
2475                         if(next==start) {
2476                             next=matchLimit;  // First match from start.
2477                         } else {
2478                             if(matchLimit<next) {
2479                                 // Remember shortest match from start for iteration.
2480                                 int32_t temp=next;
2481                                 next=matchLimit;
2482                                 matchLimit=temp;
2483                             }
2484                             // Recurse for non-shortest match from start.
2485                             int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
2486                                                                  USET_SPAN_CONTAINED);
2487                             if((matchLimit+spanLength)>maxSpanLimit) {
2488                                 maxSpanLimit=matchLimit+spanLength;
2489                                 if(maxSpanLimit==length) {
2490                                     return length;
2491                                 }
2492                             }
2493                         }
2494                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2495                         if(matchLimit>next) {
2496                             // Remember longest match from start.
2497                             next=matchLimit;
2498                         }
2499                     }
2500                 }
2501             }
2502             if(next==start) {
2503                 break;  // No match from start.
2504             }
2505             start=next;
2506         }
2507         if(start>maxSpanLimit) {
2508             return start;
2509         } else {
2510             return maxSpanLimit;
2511         }
2512     }
2513 }
2514
2515 static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2516                                      USetSpanCondition spanCondition) {
2517     if(length==0) {
2518         return 0;
2519     }
2520     const UnicodeSet &realSet(set.getSet());
2521     if(!set.hasStrings()) {
2522         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2523             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2524         }
2525
2526         UChar32 c;
2527         int32_t prev=length;
2528         do {
2529             U16_PREV(s, 0, length, c);
2530             if(realSet.contains(c)!=spanCondition) {
2531                 break;
2532             }
2533         } while((prev=length)>0);
2534         return prev;
2535     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2536         UnicodeSetWithStringsIterator iter(set);
2537         UChar32 c;
2538         int32_t prev=length, length0=length;
2539         do {
2540             U16_PREV(s, 0, length, c);
2541             if(realSet.contains(c)) {
2542                 break;
2543             }
2544             const UnicodeString *str;
2545             iter.reset();
2546             while((str=iter.nextString())!=NULL) {
2547                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2548                     // spanNeedsStrings=TRUE;
2549                     return prev;
2550                 }
2551             }
2552         } while((prev=length)>0);
2553         return prev;
2554     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2555         UnicodeSetWithStringsIterator iter(set);
2556         UChar32 c;
2557         int32_t prev=length, minSpanStart=length, length0=length;
2558         do {
2559             U16_PREV(s, 0, length, c);
2560             if(!realSet.contains(c)) {
2561                 length=prev;  // Do not span this single, not-contained code point.
2562             }
2563             const UnicodeString *str;
2564             iter.reset();
2565             while((str=iter.nextString())!=NULL) {
2566                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2567                     // spanNeedsStrings=TRUE;
2568                     int32_t matchStart=prev-str->length();
2569                     if(matchStart==0) {
2570                         return 0;
2571                     }
2572                     if(spanCondition==USET_SPAN_CONTAINED) {
2573                         // Iterate for the shortest match at each position.
2574                         // Recurse for each but the shortest match.
2575                         if(length==prev) {
2576                             length=matchStart;  // First match from prev.
2577                         } else {
2578                             if(matchStart>length) {
2579                                 // Remember shortest match from prev for iteration.
2580                                 int32_t temp=length;
2581                                 length=matchStart;
2582                                 matchStart=temp;
2583                             }
2584                             // Recurse for non-shortest match from prev.
2585                             int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
2586                                                                     USET_SPAN_CONTAINED);
2587                             if(spanStart<minSpanStart) {
2588                                 minSpanStart=spanStart;
2589                                 if(minSpanStart==0) {
2590                                     return 0;
2591                                 }
2592                             }
2593                         }
2594                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2595                         if(matchStart<length) {
2596                             // Remember longest match from prev.
2597                             length=matchStart;
2598                         }
2599                     }
2600                 }
2601             }
2602             if(length==prev) {
2603                 break;  // No match from prev.
2604             }
2605         } while((prev=length)>0);
2606         if(prev<minSpanStart) {
2607             return prev;
2608         } else {
2609             return minSpanStart;
2610         }
2611     }
2612 }
2613
2614 static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2615                                 USetSpanCondition spanCondition) {
2616     const UnicodeSet &realSet(set.getSet());
2617     if(!set.hasStrings()) {
2618         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2619             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2620         }
2621
2622         UChar32 c;
2623         int32_t start=0, prev;
2624         while((prev=start)<length) {
2625             U8_NEXT_OR_FFFD(s, start, length, c);
2626             if(realSet.contains(c)!=spanCondition) {
2627                 break;
2628             }
2629         }
2630         return prev;
2631     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2632         UnicodeSetWithStringsIterator iter(set);
2633         UChar32 c;
2634         int32_t start, next;
2635         for(start=next=0; start<length;) {
2636             U8_NEXT_OR_FFFD(s, next, length, c);
2637             if(realSet.contains(c)) {
2638                 break;
2639             }
2640             const char *s8;
2641             int32_t length8;
2642             iter.reset();
2643             while((s8=iter.nextUTF8(length8))!=NULL) {
2644                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2645                     // spanNeedsStrings=TRUE;
2646                     return start;
2647                 }
2648             }
2649             start=next;
2650         }
2651         return start;
2652     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2653         UnicodeSetWithStringsIterator iter(set);
2654         UChar32 c;
2655         int32_t start, next, maxSpanLimit=0;
2656         for(start=next=0; start<length;) {
2657             U8_NEXT_OR_FFFD(s, next, length, c);
2658             if(!realSet.contains(c)) {
2659                 next=start;  // Do not span this single, not-contained code point.
2660             }
2661             const char *s8;
2662             int32_t length8;
2663             iter.reset();
2664             while((s8=iter.nextUTF8(length8))!=NULL) {
2665                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2666                     // spanNeedsStrings=TRUE;
2667                     int32_t matchLimit=start+length8;
2668                     if(matchLimit==length) {
2669                         return length;
2670                     }
2671                     if(spanCondition==USET_SPAN_CONTAINED) {
2672                         // Iterate for the shortest match at each position.
2673                         // Recurse for each but the shortest match.
2674                         if(next==start) {
2675                             next=matchLimit;  // First match from start.
2676                         } else {
2677                             if(matchLimit<next) {
2678                                 // Remember shortest match from start for iteration.
2679                                 int32_t temp=next;
2680                                 next=matchLimit;
2681                                 matchLimit=temp;
2682                             }
2683                             // Recurse for non-shortest match from start.
2684                             int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
2685                                                                 USET_SPAN_CONTAINED);
2686                             if((matchLimit+spanLength)>maxSpanLimit) {
2687                                 maxSpanLimit=matchLimit+spanLength;
2688                                 if(maxSpanLimit==length) {
2689                                     return length;
2690                                 }
2691                             }
2692                         }
2693                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2694                         if(matchLimit>next) {
2695                             // Remember longest match from start.
2696                             next=matchLimit;
2697                         }
2698                     }
2699                 }
2700             }
2701             if(next==start) {
2702                 break;  // No match from start.
2703             }
2704             start=next;
2705         }
2706         if(start>maxSpanLimit) {
2707             return start;
2708         } else {
2709             return maxSpanLimit;
2710         }
2711     }
2712 }
2713
2714 static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2715                                     USetSpanCondition spanCondition) {
2716     if(length==0) {
2717         return 0;
2718     }
2719     const UnicodeSet &realSet(set.getSet());
2720     if(!set.hasStrings()) {
2721         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2722             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2723         }
2724
2725         UChar32 c;
2726         int32_t prev=length;
2727         do {
2728             U8_PREV_OR_FFFD(s, 0, length, c);
2729             if(realSet.contains(c)!=spanCondition) {
2730                 break;
2731             }
2732         } while((prev=length)>0);
2733         return prev;
2734     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2735         UnicodeSetWithStringsIterator iter(set);
2736         UChar32 c;
2737         int32_t prev=length;
2738         do {
2739             U8_PREV_OR_FFFD(s, 0, length, c);
2740             if(realSet.contains(c)) {
2741                 break;
2742             }
2743             const char *s8;
2744             int32_t length8;
2745             iter.reset();
2746             while((s8=iter.nextUTF8(length8))!=NULL) {
2747                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2748                     // spanNeedsStrings=TRUE;
2749                     return prev;
2750                 }
2751             }
2752         } while((prev=length)>0);
2753         return prev;
2754     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2755         UnicodeSetWithStringsIterator iter(set);
2756         UChar32 c;
2757         int32_t prev=length, minSpanStart=length;
2758         do {
2759             U8_PREV_OR_FFFD(s, 0, length, c);
2760             if(!realSet.contains(c)) {
2761                 length=prev;  // Do not span this single, not-contained code point.
2762             }
2763             const char *s8;
2764             int32_t length8;
2765             iter.reset();
2766             while((s8=iter.nextUTF8(length8))!=NULL) {
2767                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2768                     // spanNeedsStrings=TRUE;
2769                     int32_t matchStart=prev-length8;
2770                     if(matchStart==0) {
2771                         return 0;
2772                     }
2773                     if(spanCondition==USET_SPAN_CONTAINED) {
2774                         // Iterate for the shortest match at each position.
2775                         // Recurse for each but the shortest match.
2776                         if(length==prev) {
2777                             length=matchStart;  // First match from prev.
2778                         } else {
2779                             if(matchStart>length) {
2780                                 // Remember shortest match from prev for iteration.
2781                                 int32_t temp=length;
2782                                 length=matchStart;
2783                                 matchStart=temp;
2784                             }
2785                             // Recurse for non-shortest match from prev.
2786                             int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
2787                                                                    USET_SPAN_CONTAINED);
2788                             if(spanStart<minSpanStart) {
2789                                 minSpanStart=spanStart;
2790                                 if(minSpanStart==0) {
2791                                     return 0;
2792                                 }
2793                             }
2794                         }
2795                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2796                         if(matchStart<length) {
2797                             // Remember longest match from prev.
2798                             length=matchStart;
2799                         }
2800                     }
2801                 }
2802             }
2803             if(length==prev) {
2804                 break;  // No match from prev.
2805             }
2806         } while((prev=length)>0);
2807         if(prev<minSpanStart) {
2808             return prev;
2809         } else {
2810             return minSpanStart;
2811         }
2812     }
2813 }
2814
// Bit flags selecting the spans to be performed and compared.
// Each group has one bit per choice plus a mask that combines the group.
enum {
    // Text encoding form.
    SPAN_UTF16          =1,
    SPAN_UTF8           =2,
    SPAN_UTFS           =SPAN_UTF16|SPAN_UTF8,

    // Original set and/or its complement.
    SPAN_SET            =4,
    SPAN_COMPLEMENT     =8,
    SPAN_POLARITY       =SPAN_SET|SPAN_COMPLEMENT,

    // Iteration direction.
    SPAN_FWD            =0x10,
    SPAN_BACK           =0x20,
    SPAN_DIRS           =SPAN_FWD|SPAN_BACK,

    // Span condition used for "contained" spans.
    SPAN_CONTAINED      =0x100,
    SPAN_SIMPLE         =0x200,
    SPAN_CONDITION      =SPAN_CONTAINED|SPAN_SIMPLE,

    // Everything at once.
    SPAN_ALL            =SPAN_UTFS|SPAN_POLARITY|SPAN_DIRS|SPAN_CONDITION
};
2835
2836 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
2837     return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
2838 }
2839
2840 static inline int32_t slen(const void *s, UBool isUTF16) {
2841     return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
2842 }
2843
2844 /*
2845  * Count spans on a string with the method according to type and set the span limits.
2846  * The set may be the complement of the original.
2847  * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
2848  * according to the expected number of spans.
2849  * Sets typeName to an empty string if there is no such type.
2850  * Returns -1 if the span option is filtered out.
2851  */
2852 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
2853                         const void *s, int32_t length, UBool isUTF16,
2854                         uint32_t whichSpans,
2855                         int type, const char *&typeName,
2856                         int32_t limits[], int32_t limitsCapacity,
2857                         int32_t expectCount) {
2858     const UnicodeSet &realSet(set.getSet());
2859     int32_t start, count;
2860     USetSpanCondition spanCondition, firstSpanCondition, contained;
2861     UBool isForward;
2862
2863     if(type<0 || 7<type) {
2864         typeName="";
2865         return 0;
2866     }
2867
2868     static const char *const typeNames16[]={
2869         "contains", "contains(LM)",
2870         "span", "span(LM)",
2871         "containsBack", "containsBack(LM)",
2872         "spanBack", "spanBack(LM)"
2873     };
2874
2875     static const char *const typeNames8[]={
2876         "containsUTF8", "containsUTF8(LM)",
2877         "spanUTF8", "spanUTF8(LM)",
2878         "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
2879         "spanBackUTF8", "spanBackUTF8(LM)"
2880     };
2881
2882     typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
2883
2884     // filter span options
2885     if(type<=3) {
2886         // span forward
2887         if((whichSpans&SPAN_FWD)==0) {
2888             return -1;
2889         }
2890         isForward=TRUE;
2891     } else {
2892         // span backward
2893         if((whichSpans&SPAN_BACK)==0) {
2894             return -1;
2895         }
2896         isForward=FALSE;
2897     }
2898     if((type&1)==0) {
2899         // use USET_SPAN_CONTAINED
2900         if((whichSpans&SPAN_CONTAINED)==0) {
2901             return -1;
2902         }
2903         contained=USET_SPAN_CONTAINED;
2904     } else {
2905         // use USET_SPAN_SIMPLE
2906         if((whichSpans&SPAN_SIMPLE)==0) {
2907             return -1;
2908         }
2909         contained=USET_SPAN_SIMPLE;
2910     }
2911
2912     // Default first span condition for going forward with an uncomplemented set.
2913     spanCondition=USET_SPAN_NOT_CONTAINED;
2914     if(isComplement) {
2915         spanCondition=invertSpanCondition(spanCondition, contained);
2916     }
2917
2918     // First span condition for span(), used to terminate the spanBack() iteration.
2919     firstSpanCondition=spanCondition;
2920
2921     // spanBack(): Its initial span condition is span()'s last span condition,
2922     // which is the opposite of span()'s first span condition
2923     // if we expect an even number of spans.
2924     // (The loop inverts spanCondition (expectCount-1) times
2925     // before the expectCount'th span() call.)
2926     // If we do not compare forward and backward directions, then we do not have an
2927     // expectCount and just start with firstSpanCondition.
2928     if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
2929         spanCondition=invertSpanCondition(spanCondition, contained);
2930     }
2931
2932     count=0;
2933     switch(type) {
2934     case 0:
2935     case 1:
2936         start=0;
2937         if(length<0) {
2938             length=slen(s, isUTF16);
2939         }
2940         for(;;) {
2941             start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
2942                               containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
2943             if(count<limitsCapacity) {
2944                 limits[count]=start;
2945             }
2946             ++count;
2947             if(start>=length) {
2948                 break;
2949             }
2950             spanCondition=invertSpanCondition(spanCondition, contained);
2951         }
2952         break;
2953     case 2:
2954     case 3:
2955         start=0;
2956         for(;;) {
2957             start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
2958                               realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
2959             if(count<limitsCapacity) {
2960                 limits[count]=start;
2961             }
2962             ++count;
2963             if(length>=0 ? start>=length :
2964                            isUTF16 ? ((const UChar *)s)[start]==0 :
2965                                      ((const char *)s)[start]==0
2966             ) {
2967                 break;
2968             }
2969             spanCondition=invertSpanCondition(spanCondition, contained);
2970         }
2971         break;
2972     case 4:
2973     case 5:
2974         if(length<0) {
2975             length=slen(s, isUTF16);
2976         }
2977         for(;;) {
2978             ++count;
2979             if(count<=limitsCapacity) {
2980                 limits[limitsCapacity-count]=length;
2981             }
2982             length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
2983                               containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
2984             if(length==0 && spanCondition==firstSpanCondition) {
2985                 break;
2986             }
2987             spanCondition=invertSpanCondition(spanCondition, contained);
2988         }
2989         if(count<limitsCapacity) {
2990             memmove(limits, limits+(limitsCapacity-count), count*4);
2991         }
2992         break;
2993     case 6:
2994     case 7:
2995         for(;;) {
2996             ++count;
2997             if(count<=limitsCapacity) {
2998                 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
2999             }
3000             // Note: Length<0 is tested only for the first spanBack().
3001             // If we wanted to keep length<0 for all spanBack()s, we would have to
3002             // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
3003             length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
3004                               realSet.spanBackUTF8((const char *)s, length, spanCondition);
3005             if(length==0 && spanCondition==firstSpanCondition) {
3006                 break;
3007             }
3008             spanCondition=invertSpanCondition(spanCondition, contained);
3009         }
3010         if(count<limitsCapacity) {
3011             memmove(limits, limits+(limitsCapacity-count), count*4);
3012         }
3013         break;
3014     default:
3015         typeName="";
3016         return -1;
3017     }
3018
3019     return count;
3020 }
3021
// Indexes of the sets to be tested.
// An odd index (bit 0 set) marks the complement of the set at the preceding even index.
enum {
    SLOW        =0,
    SLOW_NOT    =1,
    FAST        =2,
    FAST_NOT    =3,
    SET_COUNT   =4
};

// Names of the sets for failure messages, indexed by the constants above.
static const char *const setNames[SET_COUNT]={
    "slow",         // SLOW
    "slow.not",     // SLOW_NOT
    "fast",         // FAST
    "fast.not"      // FAST_NOT
};
3037
3038 /*
3039  * Verify that we get the same results whether we look at text with contains(),
3040  * span() or spanBack(), using unfrozen or frozen versions of the set,
3041  * and using the set or its complement (switching the spanConditions accordingly).
3042  * The latter verifies that
3043  *   set.span(spanCondition) == set.complement().span(!spanCondition).
3044  *
3045  * The expectLimits[] are either provided by the caller (with expectCount>=0)
3046  * or returned to the caller (with an input expectCount<0).
3047  */
3048 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3049                               const void *s, int32_t length, UBool isUTF16,
3050                               uint32_t whichSpans,
3051                               int32_t expectLimits[], int32_t &expectCount,
3052                               const char *testName, int32_t index) {
3053     int32_t limits[500];
3054     int32_t limitsCount;
3055     int i, j;
3056
3057     const char *typeName;
3058     int type;
3059
3060     for(i=0; i<SET_COUNT; ++i) {
3061         if((i&1)==0) {
3062             // Even-numbered sets are original, uncomplemented sets.
3063             if((whichSpans&SPAN_SET)==0) {
3064                 continue;
3065             }
3066         } else {
3067             // Odd-numbered sets are complemented.
3068             if((whichSpans&SPAN_COMPLEMENT)==0) {
3069                 continue;
3070             }
3071         }
3072         for(type=0;; ++type) {
3073             limitsCount=getSpans(*sets[i], (UBool)(i&1),
3074                                  s, length, isUTF16,
3075                                  whichSpans,
3076                                  type, typeName,
3077                                  limits, UPRV_LENGTHOF(limits), expectCount);
3078             if(typeName[0]==0) {
3079                 break; // All types tried.
3080             }
3081             if(limitsCount<0) {
3082                 continue; // Span option filtered out.
3083             }
3084             if(expectCount<0) {
3085                 expectCount=limitsCount;
3086                 if(limitsCount>UPRV_LENGTHOF(limits)) {
3087                     errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3088                           testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)UPRV_LENGTHOF(limits));
3089                     return;
3090                 }
3091                 memcpy(expectLimits, limits, limitsCount*4);
3092             } else if(limitsCount!=expectCount) {
3093                 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3094                       testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
3095             } else {
3096                 for(j=0; j<limitsCount; ++j) {
3097                     if(limits[j]!=expectLimits[j]) {
3098                         errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3099                               testName, (long)index, setNames[i], typeName, (long)limitsCount,
3100                               j, (long)limits[j], (long)expectLimits[j]);
3101                         break;
3102                     }
3103                 }
3104             }
3105         }
3106     }
3107
3108     // Compare span() with containsAll()/containsNone(),
3109     // but only if we have expectLimits[] from the uncomplemented set.
3110     if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
3111         const UChar *s16=(const UChar *)s;
3112         UnicodeString string;
3113         int32_t prev=0, limit, length;
3114         for(i=0; i<expectCount; ++i) {
3115             limit=expectLimits[i];
3116             length=limit-prev;
3117             if(length>0) {
3118                 string.setTo(FALSE, s16+prev, length);  // read-only alias
3119                 if(i&1) {
3120                     if(!sets[SLOW]->getSet().containsAll(string)) {
3121                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3122                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3123                         return;
3124                     }
3125                     if(!sets[FAST]->getSet().containsAll(string)) {
3126                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3127                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3128                         return;
3129                     }
3130                 } else {
3131                     if(!sets[SLOW]->getSet().containsNone(string)) {
3132                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3133                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3134                         return;
3135                     }
3136                     if(!sets[FAST]->getSet().containsNone(string)) {
3137                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3138                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3139                         return;
3140                     }
3141                 }
3142             }
3143             prev=limit;
3144         }
3145     }
3146 }
3147
3148 // Specifically test either UTF-16 or UTF-8.
3149 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3150                               const void *s, int32_t length, UBool isUTF16,
3151                               uint32_t whichSpans,
3152                               const char *testName, int32_t index) {
3153     int32_t expectLimits[500];
3154     int32_t expectCount=-1;
3155     testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3156 }
3157
3158 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
3159     UChar c, c2;
3160
3161     if(length>=0) {
3162         while(length>0) {
3163             c=*s++;
3164             --length;
3165             if(0xd800<=c && c<0xe000) {
3166                 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3167                     return TRUE;
3168                 }
3169                 --length;
3170             }
3171         }
3172     } else {
3173         while((c=*s++)!=0) {
3174             if(0xd800<=c && c<0xe000) {
3175                 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3176                     return TRUE;
3177                 }
3178             }
3179         }
3180     }
3181     return FALSE;
3182 }
3183
3184 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3185 // unless either UTF is turned off in whichSpans.
3186 // Testing UTF-16 and UTF-8 together requires that surrogate code points
3187 // have the same contains(c) value as U+FFFD.
3188 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3189                                       const UChar *s16, int32_t length16,
3190                                       uint32_t whichSpans,
3191                                       const char *testName, int32_t index) {
3192     int32_t expectLimits[500];
3193     int32_t expectCount;
3194
3195     expectCount=-1;  // Get expectLimits[] from testSpan().
3196
3197     if((whichSpans&SPAN_UTF16)!=0) {
3198         testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
3199     }
3200     if((whichSpans&SPAN_UTF8)==0) {
3201         return;
3202     }
3203
3204     // Convert s16[] and expectLimits[] to UTF-8.
3205     uint8_t s8[3000];
3206     int32_t offsets[3000];
3207
3208     const UChar *s16Limit=s16+length16;
3209     char *t=(char *)s8;
3210     char *tLimit=t+sizeof(s8);
3211     int32_t *o=offsets;
3212     UErrorCode errorCode=U_ZERO_ERROR;
3213
3214     // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3215     ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
3216     if(U_FAILURE(errorCode)) {
3217         errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3218               testName, (long)index, u_errorName(errorCode));
3219         ucnv_resetFromUnicode(utf8Cnv);
3220         return;
3221     }
3222     int32_t length8=(int32_t)(t-(char *)s8);
3223
3224     // Convert expectLimits[].
3225     int32_t i, j, expect;
3226     for(i=j=0; i<expectCount; ++i) {
3227         expect=expectLimits[i];
3228         if(expect==length16) {
3229             expectLimits[i]=length8;
3230         } else {
3231             while(offsets[j]<expect) {
3232                 ++j;
3233             }
3234             expectLimits[i]=j;
3235         }
3236     }
3237
3238     testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
3239 }
3240
3241 static UChar32 nextCodePoint(UChar32 c) {
3242     // Skip some large and boring ranges.
3243     switch(c) {
3244     case 0x3441:
3245         return 0x4d7f;
3246     case 0x5100:
3247         return 0x9f00;
3248     case 0xb040:
3249         return 0xd780;
3250     case 0xe041:
3251         return 0xf8fe;
3252     case 0x10100:
3253         return 0x20000;
3254     case 0x20041:
3255         return 0xe0000;
3256     case 0xe0101:
3257         return 0x10fffd;
3258     default:
3259         return c+1;
3260     }
3261 }
3262
3263 // Verify that all implementations represent the same set.
3264 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3265     // contains(U+FFFD) is inconsistent with contains(some surrogates),
3266     // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3267     // Skip the UTF-8 part of the test - if the string contains surrogates -
3268     // because it is likely to produce a different result.
3269     UBool inconsistentSurrogates=
3270             (!(sets[0]->getSet().contains(0xfffd) ?
3271                sets[0]->getSet().contains(0xd800, 0xdfff) :
3272                sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3273              sets[0]->hasStringsWithSurrogates());
3274
3275     UChar s[1000];
3276     int32_t length=0;
3277     uint32_t localWhichSpans;
3278
3279     UChar32 c, first;
3280     for(first=c=0;; c=nextCodePoint(c)) {
3281         if(c>0x10ffff || length>(UPRV_LENGTHOF(s)-U16_MAX_LENGTH)) {
3282             localWhichSpans=whichSpans;
3283             if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3284                 localWhichSpans&=~SPAN_UTF8;
3285             }
3286             testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3287             if(c>0x10ffff) {
3288                 break;
3289             }
3290             length=0;
3291             first=c;
3292         }
3293         U16_APPEND_UNSAFE(s, length, c);
3294     }
3295 }
3296
3297 // Test with a particular, interesting string.
3298 // Specify length and try NUL-termination.
3299 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3300     static const UChar s[]={
3301         0x61, 0x62, 0x20,                       // Latin, space
3302         0x3b1, 0x3b2, 0x3b3,                    // Greek
3303         0xd900,                                 // lead surrogate
3304         0x3000, 0x30ab, 0x30ad,                 // wide space, Katakana
3305         0xdc05,                                 // trail surrogate
3306         0xa0, 0xac00, 0xd7a3,                   // nbsp, Hangul
3307         0xd900, 0xdc05,                         // unassigned supplementary
3308         0xd840, 0xdfff, 0xd860, 0xdffe,         // Han supplementary
3309         0xd7a4, 0xdc05, 0xd900, 0x2028,         // unassigned, surrogates in wrong order, LS
3310         0                                       // NUL
3311     };
3312
3313     if((whichSpans&SPAN_UTF16)==0) {
3314         return;
3315     }
3316     testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
3317     testSpan(sets, s, UPRV_LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
3318 }
3319
3320 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3321     static const char s[]={
3322         "abc"                                   // Latin
3323
3324         /* trail byte in lead position */
3325         "\x80"
3326
3327         " "                                     // space
3328
3329         /* truncated multi-byte sequences */
3330         "\xd0"
3331         "\xe0"
3332         "\xe1"
3333         "\xed"
3334         "\xee"
3335         "\xf0"
3336         "\xf1"
3337         "\xf4"
3338         "\xf8"
3339         "\xfc"
3340
3341         "\xCE\xB1\xCE\xB2\xCE\xB3"              // Greek
3342
3343         /* trail byte in lead position */
3344         "\x80"
3345
3346         "\xe0\x80"
3347         "\xe0\xa0"
3348         "\xe1\x80"
3349         "\xed\x80"
3350         "\xed\xa0"
3351         "\xee\x80"
3352         "\xf0\x80"
3353         "\xf0\x90"
3354         "\xf1\x80"
3355         "\xf4\x80"
3356         "\xf4\x90"
3357         "\xf8\x80"
3358         "\xfc\x80"
3359
3360         "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD"  // wide space, Katakana
3361
3362         /* trail byte in lead position */
3363         "\x80"
3364
3365         "\xf0\x80\x80"
3366         "\xf0\x90\x80"
3367         "\xf1\x80\x80"
3368         "\xf4\x80\x80"
3369         "\xf4\x90\x80"
3370         "\xf8\x80\x80"
3371         "\xfc\x80\x80"
3372
3373         "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3"      // nbsp, Hangul
3374
3375         /* trail byte in lead position */
3376         "\x80"
3377
3378         "\xf8\x80\x80\x80"
3379         "\xfc\x80\x80\x80"
3380
3381         "\xF1\x90\x80\x85"                      // unassigned supplementary
3382
3383         /* trail byte in lead position */
3384         "\x80"
3385
3386         "\xfc\x80\x80\x80\x80"
3387
3388         "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE"      // Han supplementary
3389
3390         /* trail byte in lead position */
3391         "\x80"
3392
3393         /* complete sequences but non-shortest forms or out of range etc. */
3394         "\xc0\x80"
3395         "\xe0\x80\x80"
3396         "\xed\xa0\x80"
3397         "\xf0\x80\x80\x80"
3398         "\xf4\x90\x80\x80"
3399         "\xf8\x80\x80\x80\x80"
3400         "\xfc\x80\x80\x80\x80\x80"
3401         "\xfe"
3402         "\xff"
3403
3404         /* trail byte in lead position */
3405         "\x80"
3406
3407         "\xED\x9E\xA4\xE2\x80\xA8"              // unassigned, LS, NUL-terminated
3408     };
3409
3410     if((whichSpans&SPAN_UTF8)==0) {
3411         return;
3412     }
3413     testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
3414     testSpan(sets, s, UPRV_LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
3415 }
3416
// Multiply a list of span option sets so that each resulting portion
// carries exactly one of the alternatives a, b and c.
//
// Every existing entry is first reduced with mask. Then:
// - entry i becomes (entry&mask)|a,
// - if b!=0, a copy with |b is stored at whichSpansCount+i,
// - if b!=0 and c!=0, a copy with |c is stored at 2*whichSpansCount+i.
// c is ignored when b==0.
//
// The caller must provide room for the additional portions.
// Returns the new number of entries: 1, 2 or 3 times whichSpansCount.
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    // How many portions there will be per original entry.
    int32_t portions=1;
    if(b!=0) {
        portions= (c!=0) ? 3 : 2;
    }

    for(int32_t idx=0; idx<whichSpansCount; ++idx) {
        uint32_t *slot=whichSpans+idx;
        const uint32_t kept=*slot&mask;
        *slot=kept|a;
        if(portions>=2) {
            slot[whichSpansCount]=kept|b;
        }
        if(portions==3) {
            slot[2*whichSpansCount]=kept|c;
        }
    }
    return portions*whichSpansCount;
}
3439
// Runs of 63 and 64 copies of 'a' and 'b' for building long test strings
// via string literal concatenation.
// Written as adjacent literals in groups of 8 so that the lengths are easy to check:
// _64_x is 8 groups of 8; _63_x is 7 groups of 8 followed by a group of 7.
#define _63_a "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaa"
#define _64_a "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa"
#define _63_b "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbb"
#define _64_b "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb"
3444
3445 void UnicodeSetTest::TestSpan() {
3446     // "[...]" is a UnicodeSet pattern.
3447     // "*" performs tests on all Unicode code points and on a selection of
3448     //   malformed UTF-8/16 strings.
3449     // "-options" limits the scope of testing for the current set.
3450     //   By default, the test verifies that equivalent boundaries are found
3451     //   for UTF-16 and UTF-8, going forward and backward,
3452     //   alternating USET_SPAN_NOT_CONTAINED with
3453     //   either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3454     //   Single-character options:
3455     //     8 -- UTF-16 and UTF-8 boundaries may differ.
3456     //          Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3457     //          or the set contains strings with unpaired surrogates
3458     //          which do not translate to valid UTF-8.
3459     //     c -- set.span() and set.complement().span() boundaries may differ.
3460     //          Cause: Set strings are not complemented.
3461     //     b -- span() and spanBack() boundaries may differ.
3462     //          Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3463     //          and spanBack(USET_SPAN_SIMPLE) are defined to
3464     //          match with non-overlapping substrings.
3465     //          For example, with a set containing "ab" and "ba",
3466     //          span() of "aba" yields boundaries { 0, 2, 3 }
3467     //          because the initial "ab" matches from 0 to 2,
3468     //          while spanBack() yields boundaries { 0, 1, 3 }
3469     //          because the final "ba" matches from 1 to 3.
3470     //     l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3471     //          Cause: Strings in the set overlap, and a longer match may
3472     //          require a sequence including non-longest substrings.
3473     //          For example, with a set containing "ab", "abc" and "cd",
3474     //          span(contained) of "abcd" spans the entire string
3475     //          but span(longest match) only spans the first 3 characters.
3476     //   Each "-options" first resets all options and then applies the specified options.
3477     //   A "-" without options resets the options.
3478     //   The options are also reset for each new set.
3479     // Other strings will be spanned.
3480     static const char *const testdata[]={
3481         "[:ID_Continue:]",
3482         "*",
3483         "[:White_Space:]",
3484         "*",
3485         "[]",
3486         "*",
3487         "[\\u0000-\\U0010FFFF]",
3488         "*",
3489         "[\\u0000\\u0080\\u0800\\U00010000]",
3490         "*",
3491         "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3492         "*",
3493         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3494         "-c",
3495         "*",
3496         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3497         "-c",
3498         "*",
3499
3500         // Overlapping strings cause overlapping attempts to match.
3501         "[x{xy}{xya}{axy}{ax}]",
3502         "-cl",
3503
3504         // More repetitions of "xya" would take too long with the recursive
3505         // reference implementation.
3506         // containsAll()=FALSE
3507         // test_string 0x14
3508         "xx"
3509         "xyaxyaxyaxya"  // set.complement().span(longest match) will stop here.
3510         "xx"            // set.complement().span(contained) will stop between the two 'x'es.
3511         "xyaxyaxyaxya"
3512         "xx"
3513         "xyaxyaxyaxya"  // span() ends here.
3514         "aaa",
3515
3516         // containsAll()=TRUE
3517         // test_string 0x15
3518         "xx"
3519         "xyaxyaxyaxya"
3520         "xx"
3521         "xyaxyaxyaxya"
3522         "xx"
3523         "xyaxyaxyaxy",
3524
3525         "-bc",
3526         // test_string 0x17
3527         "byayaxya",  // span() -> { 4, 7, 8 }  spanBack() -> { 5, 8 }
3528         "-c",
3529         "byayaxy",   // span() -> { 4, 7 }     complement.span() -> { 7 }
3530         "byayax",    // span() -> { 4, 6 }     complement.span() -> { 6 }
3531         "-",
3532         "byaya",     // span() -> { 5 }
3533         "byay",      // span() -> { 4 }
3534         "bya",       // span() -> { 3 }
3535
3536         // span(longest match) will not span the whole string.
3537         "[a{ab}{bc}]",
3538         "-cl",
3539         // test_string 0x21
3540         "abc",
3541
3542         "[a{ab}{abc}{cd}]",
3543         "-cl",
3544         "acdabcdabccd",
3545
3546         // spanBack(longest match) will not span the whole string.
3547         "[c{ab}{bc}]",
3548         "-cl",
3549         "abc",
3550
3551         "[d{cd}{bcd}{ab}]",
3552         "-cl",
3553         "abbcdabcdabd",
3554
3555         // Test with non-ASCII set strings - test proper handling of surrogate pairs
3556         // and UTF-8 trail bytes.
3557         // Copies of above test sets and strings, but transliterated to have
3558         // different code points with similar trail units.
3559         // Previous: a      b         c            d
3560         // Unicode:  042B   30AB      200AB        204AB
3561         // UTF-16:   042B   30AB      D840 DCAB    D841 DCAB
3562         // UTF-8:    D0 AB  E3 82 AB  F0 A0 82 AB  F0 A0 92 AB
3563         "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3564         "-cl",
3565         "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3566
3567         "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3568         "-cl",
3569         "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3570
3571         // Stress bookkeeping and recursion.
3572         // The following strings are barely doable with the recursive
3573         // reference implementation.
3574         // The not-contained character at the end prevents an early exit from the span().
3575         "[b{bb}]",
3576         "-c",
3577         // test_string 0x33
3578         "bbbbbbbbbbbbbbbbbbbbbbbb-",
3579         // On complement sets, span() and spanBack() get different results
3580         // because b is not in the complement set and there is an odd number of b's
3581         // in the test string.
3582         "-bc",
3583         "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3584
3585         // Test with set strings with an initial or final code point span
3586         // longer than 254.
3587         "[a{" _64_a _64_a _64_a _64_a "b}"
3588           "{a" _64_b _64_b _64_b _64_b "}]",
3589         "-c",
3590         _64_a _64_a _64_a _63_a "b",
3591         _64_a _64_a _64_a _64_a "b",
3592         _64_a _64_a _64_a _64_a "aaaabbbb",
3593         "a" _64_b _64_b _64_b _63_b,
3594         "a" _64_b _64_b _64_b _64_b,
3595         "aaaabbbb" _64_b _64_b _64_b _64_b,
3596
3597         // Test with strings containing unpaired surrogates.
3598         // They are not representable in UTF-8, and a leading trail surrogate
3599         // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3600         // U+20001 == \\uD840\\uDC01
3601         // U+20400 == \\uD841\\uDC00
3602         "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3603         "-8cl",
3604         "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3605     };
3606     uint32_t whichSpans[96]={ SPAN_ALL };
3607     int32_t whichSpansCount=1;
3608
3609     UnicodeSet *sets[SET_COUNT]={ NULL };
3610     const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
3611
3612     char testName[1024];
3613     char *testNameLimit=testName;
3614
3615     int32_t i, j;
3616     for(i=0; i<UPRV_LENGTHOF(testdata); ++i) {
3617         const char *s=testdata[i];
3618         if(s[0]=='[') {
3619             // Create new test sets from this pattern.
3620             for(j=0; j<SET_COUNT; ++j) {
3621                 delete sets_with_str[j];
3622                 delete sets[j];
3623             }
3624             UErrorCode errorCode=U_ZERO_ERROR;
3625             sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3626             if(U_FAILURE(errorCode)) {
3627                 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
3628                 break;
3629             }
3630             sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3631             sets[SLOW_NOT]->complement();
3632             // Intermediate set: Test cloning of a frozen set.
3633             UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3634             fast->freeze();
3635             sets[FAST]=(UnicodeSet *)fast->clone();
3636             delete fast;
3637             UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3638             fastNot->freeze();
3639             sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
3640             delete fastNot;
3641
3642             for(j=0; j<SET_COUNT; ++j) {
3643                 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3644             }
3645
3646             strcpy(testName, s);
3647             testNameLimit=strchr(testName, 0);
3648             *testNameLimit++=':';
3649             *testNameLimit=0;
3650
3651             whichSpans[0]=SPAN_ALL;
3652             whichSpansCount=1;
3653         } else if(s[0]=='-') {
3654             whichSpans[0]=SPAN_ALL;
3655             whichSpansCount=1;
3656
3657             while(*++s!=0) {
3658                 switch(*s) {
3659                 case 'c':
3660                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3661                                                    ~SPAN_POLARITY,
3662                                                    SPAN_SET,
3663                                                    SPAN_COMPLEMENT,
3664                                                    0);
3665                     break;
3666                 case 'b':
3667                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3668                                                    ~SPAN_DIRS,
3669                                                    SPAN_FWD,
3670                                                    SPAN_BACK,
3671                                                    0);
3672                     break;
3673                 case 'l':
3674                     // test USET_SPAN_CONTAINED FWD & BACK, and separately
3675                     // USET_SPAN_SIMPLE only FWD, and separately
3676                     // USET_SPAN_SIMPLE only BACK
3677                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3678                                                    ~(SPAN_DIRS|SPAN_CONDITION),
3679                                                    SPAN_DIRS|SPAN_CONTAINED,
3680                                                    SPAN_FWD|SPAN_SIMPLE,
3681                                                    SPAN_BACK|SPAN_SIMPLE);
3682                     break;
3683                 case '8':
3684                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3685                                                    ~SPAN_UTFS,
3686                                                    SPAN_UTF16,
3687                                                    SPAN_UTF8,
3688                                                    0);
3689                     break;
3690                 default:
3691                     errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3692                     break;
3693                 }
3694             }
3695         } else if(0==strcmp(s, "*")) {
3696             strcpy(testNameLimit, "bad_string");
3697             for(j=0; j<whichSpansCount; ++j) {
3698                 if(whichSpansCount>1) {
3699                     sprintf(testNameLimit+10 /* strlen("bad_string") */,
3700                             "%%0x%3x",
3701                             whichSpans[j]);
3702                 }
3703                 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3704                 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3705             }
3706
3707             strcpy(testNameLimit, "contents");
3708             for(j=0; j<whichSpansCount; ++j) {
3709                 if(whichSpansCount>1) {
3710                     sprintf(testNameLimit+8 /* strlen("contents") */,
3711                             "%%0x%3x",
3712                             whichSpans[j]);
3713                 }
3714                 testSpanContents(sets_with_str, whichSpans[j], testName);
3715             }
3716         } else {
3717             UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3718             strcpy(testNameLimit, "test_string");
3719             for(j=0; j<whichSpansCount; ++j) {
3720                 if(whichSpansCount>1) {
3721                     sprintf(testNameLimit+11 /* strlen("test_string") */,
3722                             "%%0x%3x",
3723                             whichSpans[j]);
3724                 }
3725                 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3726             }
3727         }
3728     }
3729     for(j=0; j<SET_COUNT; ++j) {
3730         delete sets_with_str[j];
3731         delete sets[j];
3732     }
3733 }
3734
3735 // Test select patterns and strings, and test USET_SPAN_SIMPLE.
3736 void UnicodeSetTest::TestStringSpan() {
3737     static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3738     static const char *const string=
3739         "xx"
3740         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3741         "xx"
3742         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3743         "xx"
3744         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3745         "aaaa";
3746
3747     UErrorCode errorCode=U_ZERO_ERROR;
3748     UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3749     UnicodeSet set(pattern16, errorCode);
3750     if(U_FAILURE(errorCode)) {
3751         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3752         return;
3753     }
3754
3755     UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3756
3757     if(set.containsAll(string16)) {
3758         errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
3759     }
3760
3761     // Remove trailing "aaaa".
3762     string16.truncate(string16.length()-4);
3763     if(!set.containsAll(string16)) {
3764         errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
3765     }
3766
3767     string16=UNICODE_STRING_SIMPLE("byayaxya");
3768     const UChar *s16=string16.getBuffer();
3769     int32_t length16=string16.length();
3770     (void)length16;   // Suppress set but not used warning.
3771     if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3772         set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3773         set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3774         set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3775         set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3776         set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3777     ) {
3778         errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3779     }
3780
3781     pattern="[a{ab}{abc}{cd}]";
3782     pattern16=UnicodeString(pattern, -1, US_INV);
3783     set.applyPattern(pattern16, errorCode);
3784     if(U_FAILURE(errorCode)) {
3785         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3786         return;
3787     }
3788     string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
3789     s16=string16.getBuffer();
3790     length16=string16.length();
3791     if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3792         set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3793         set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3794     ) {
3795         errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3796     }
3797
3798     pattern="[d{cd}{bcd}{ab}]";
3799     pattern16=UnicodeString(pattern, -1, US_INV);
3800     set.applyPattern(pattern16, errorCode).freeze();
3801     if(U_FAILURE(errorCode)) {
3802         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3803         return;
3804     }
3805     string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
3806     s16=string16.getBuffer();
3807     length16=string16.length();
3808     if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3809         set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3810         set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3811     ) {
3812         errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3813     }
3814 }