icuSources/test/intltest/usettest.cpp

   1 /*
   2 ********************************************************************************
   3 *   Copyright (C) 1999-2012 International Business Machines Corporation and
   4 *   others. All Rights Reserved.
   5 ********************************************************************************
   6 *   Date        Name        Description
   7 *   10/20/99    alan        Creation.
   8 *   03/22/2000  Madhu       Added additional tests
   9 ********************************************************************************
  10 */
  11
  12 #include <stdio.h>
  13
  14 #include <string.h>
  15 #include "unicode/utypes.h"
  16 #include "usettest.h"
  17 #include "unicode/ucnv.h"
  18 #include "unicode/uniset.h"
  19 #include "unicode/uchar.h"
  20 #include "unicode/usetiter.h"
  21 #include "unicode/ustring.h"
  22 #include "unicode/parsepos.h"
  23 #include "unicode/symtable.h"
  24 #include "unicode/uversion.h"
  25 #include "hash.h"
  26
  27 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
  28
  29 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
  30     dataerrln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \
  31     u_errorName(status));}}
  32
  33 #define TEST_ASSERT(expr) {if (!(expr)) { \
  34     dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); }}
  35
  36 UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
  37     UnicodeString pat;
  38     set.toPattern(pat);
  39     return left + UnicodeSetTest::escape(pat);
  40 }
  41
  42 #define CASE(id,test) case id:                          \
  43                           name = #test;                 \
  44                           if (exec) {                   \
  45                               logln(#test "---");       \
  46                               logln();                  \
  47                               test();                   \
  48                           }                             \
  49                           break
  50
  51 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
  52 }
  53
  54 UConverter *UnicodeSetTest::openUTF8Converter() {
  55     if(utf8Cnv==NULL) {
  56         UErrorCode errorCode=U_ZERO_ERROR;
  57         utf8Cnv=ucnv_open("UTF-8", &errorCode);
  58     }
  59     return utf8Cnv;
  60 }
  61
  62 UnicodeSetTest::~UnicodeSetTest() {
  63     ucnv_close(utf8Cnv);
  64 }
  65
  66 void
  67 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
  68                                const char* &name, char* /*par*/) {
  69     // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
  70     switch (index) {
  71         CASE(0,TestPatterns);
  72         CASE(1,TestAddRemove);
  73         CASE(2,TestCategories);
  74         CASE(3,TestCloneEqualHash);
  75         CASE(4,TestMinimalRep);
  76         CASE(5,TestAPI);
  77         CASE(6,TestScriptSet);
  78         CASE(7,TestPropertySet);
  79         CASE(8,TestClone);
  80         CASE(9,TestExhaustive);
  81         CASE(10,TestToPattern);
  82         CASE(11,TestIndexOf);
  83         CASE(12,TestStrings);
  84         CASE(13,Testj2268);
  85         CASE(14,TestCloseOver);
  86         CASE(15,TestEscapePattern);
  87         CASE(16,TestInvalidCodePoint);
  88         CASE(17,TestSymbolTable);
  89         CASE(18,TestSurrogate);
  90         CASE(19,TestPosixClasses);
  91         CASE(20,TestIteration);
  92         CASE(21,TestFreezable);
  93         CASE(22,TestSpan);
  94         CASE(23,TestStringSpan);
  95         default: name = ""; break;
  96     }
  97 }
  98
  99 static const char NOT[] = "%%%%";
 100
 101 /**
 102  * UVector was improperly copying contents
 103  * This code will crash this is still true
 104  */
 105 void UnicodeSetTest::Testj2268() {
 106   UnicodeSet t;
 107   t.add(UnicodeString("abc"));
 108   UnicodeSet test(t);
 109   UnicodeString ustrPat;
 110   test.toPattern(ustrPat, TRUE);
 111 }
 112
 113 /**
 114  * Test toPattern().
 115  */
 116 void UnicodeSetTest::TestToPattern() {
 117     UErrorCode ec = U_ZERO_ERROR;
 118
 119     // Test that toPattern() round trips with syntax characters and
 120     // whitespace.
 121     {
 122         static const char* OTHER_TOPATTERN_TESTS[] = {
 123             "[[:latin:]&[:greek:]]",
 124             "[[:latin:]-[:greek:]]",
 125             "[:nonspacing mark:]",
 126             NULL
 127         };
 128
 129         for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
 130             ec = U_ZERO_ERROR;
 131             UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
 132             if (U_FAILURE(ec)) {
 133                 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
 134                 continue;
 135             }
 136             checkPat(OTHER_TOPATTERN_TESTS[j], s);
 137         }
 138
 139         for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
 140             if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
 141
 142                 // check various combinations to make sure they all work.
 143                 if (i != 0 && !toPatternAux(i, i)){
 144                     continue;
 145                 }
 146                 if (!toPatternAux(0, i)){
 147                     continue;
 148                 }
 149                 if (!toPatternAux(i, 0xFFFF)){
 150                     continue;
 151                 }
 152             }
 153         }
 154     }
 155
 156     // Test pattern behavior of multicharacter strings.
 157     {
 158         ec = U_ZERO_ERROR;
 159         UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
 160
 161         // This loop isn't a loop.  It's here to make the compiler happy.
 162         // If you're curious, try removing it and changing the 'break'
 163         // statements (except for the last) to goto's.
 164         for (;;) {
 165             if (U_FAILURE(ec)) break;
 166             const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
 167             expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
 168
 169             s->add("ac");
 170             const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
 171             expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
 172
 173             s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
 174             if (U_FAILURE(ec)) break;
 175             const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
 176             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
 177
 178             s->add("[]");
 179             const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
 180             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
 181
 182             s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
 183             if (U_FAILURE(ec)) break;
 184             const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
 185             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
 186
 187             // j2189
 188             s->clear();
 189             s->add(UnicodeString("abc", ""));
 190             s->add(UnicodeString("abc", ""));
 191             const char* exp6[] = {"abc", NOT, "ab", NULL};
 192             expectToPattern(*s, "[{abc}]", exp6);
 193
 194             break;
 195         }
 196
 197         if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
 198         delete s;
 199     }
 200
 201     // JB#3400: For 2 character ranges prefer [ab] to [a-b]
 202     UnicodeSet s;
 203     s.add((UChar)97, (UChar)98); // 'a', 'b'
 204     expectToPattern(s, "[ab]", NULL);
 205 }
 206
 207 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
 208
 209     // use Integer.toString because Utility.hex doesn't handle ints
 210     UnicodeString pat = "";
 211     // TODO do these in hex
 212     //String source = "0x" + Integer.toString(start,16).toUpperCase();
 213     //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
 214     UnicodeString source;
 215     source = source + (uint32_t)start;
 216     if (start != end)
 217         source = source + ".." + (uint32_t)end;
 218     UnicodeSet testSet;
 219     testSet.add(start, end);
 220     return checkPat(source, testSet);
 221 }
 222
 223 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
 224                                const UnicodeSet& testSet) {
 225     // What we want to make sure of is that a pattern generated
 226     // by toPattern(), with or without escaped unprintables, can
 227     // be passed back into the UnicodeSet constructor.
 228     UnicodeString pat0;
 229
 230     testSet.toPattern(pat0, TRUE);
 231
 232     if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
 233
 234     //String pat1 = unescapeLeniently(pat0);
 235     //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
 236
 237     UnicodeString pat2;
 238     testSet.toPattern(pat2, FALSE);
 239     if (!checkPat(source, testSet, pat2)) return FALSE;
 240
 241     //String pat3 = unescapeLeniently(pat2);
 242     // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
 243
 244     //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
 245     logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
 246     return TRUE;
 247 }
 248
 249 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
 250                                const UnicodeSet& testSet,
 251                                const UnicodeString& pat) {
 252     UErrorCode ec = U_ZERO_ERROR;
 253     UnicodeSet testSet2(pat, ec);
 254     if (testSet2 != testSet) {
 255         errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
 256         return FALSE;
 257     }
 258     return TRUE;
 259 }
 260
 261 void
 262 UnicodeSetTest::TestPatterns(void) {
 263     UnicodeSet set;
 264     expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""),  "km");
 265     expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""),  "aczz");
 266     expectPattern(set, UnicodeString("[a\\-z]", ""),  "--aazz");
 267     expectPattern(set, UnicodeString("[-az]", ""),  "--aazz");
 268     expectPattern(set, UnicodeString("[az-]", ""),  "--aazz");
 269     expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
 270
 271     // Throw in a test of complement
 272     set.complement();
 273     UnicodeString exp;
 274     exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
 275     expectPairs(set, exp);
 276 }
 277
 278 void
 279 UnicodeSetTest::TestCategories(void) {
 280     UErrorCode status = U_ZERO_ERROR;
 281     const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
 282     UnicodeSet set(pat, status);
 283     if (U_FAILURE(status)) {
 284         dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
 285         return;
 286     } else {
 287         expectContainment(set, pat, "ABC", "abc");
 288     }
 289
 290     UChar32 i;
 291     int32_t failures = 0;
 292     // Make sure generation of L doesn't pollute cached Lu set
 293     // First generate L, then Lu
 294     set.applyPattern("[:L:]", status);
 295     if (U_FAILURE(status)) { errln("FAIL"); return; }
 296     for (i=0; i<0x200; ++i) {
 297         UBool l = u_isalpha((UChar)i);
 298         if (l != set.contains(i)) {
 299             errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
 300                   set.contains(i));
 301             if (++failures == 10) break;
 302         }
 303     }
 304
 305     set.applyPattern("[:Lu:]", status);
 306     if (U_FAILURE(status)) { errln("FAIL"); return; }
 307     for (i=0; i<0x200; ++i) {
 308         UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
 309         if (lu != set.contains(i)) {
 310             errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
 311                   set.contains(i));
 312             if (++failures == 20) break;
 313         }
 314     }
 315 }
 316 void
 317 UnicodeSetTest::TestCloneEqualHash(void) {
 318     UErrorCode status = U_ZERO_ERROR;
 319     // set1 and set2 used to be built with the obsolete constructor taking
 320     // UCharCategory values; replaced with pattern constructors
 321     // markus 20030502
 322     UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); //  :Ll: Letter, lowercase
 323     UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); //  Letter, lowercase
 324     if (U_FAILURE(status)){
 325         dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
 326         return;
 327     }
 328     UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status);   //Number, Decimal digit
 329     UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status);   //Number, Decimal digit
 330     if (U_FAILURE(status)){
 331         errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
 332         return;
 333     }
 334
 335     if (*set1 != *set1a) {
 336         errln("FAIL: category constructor for Ll broken");
 337     }
 338     if (*set2 != *set2a) {
 339         errln("FAIL: category constructor for Nd broken");
 340     }
 341     delete set1a;
 342     delete set2a;
 343
 344     logln("Testing copy construction");
 345     UnicodeSet *set1copy=new UnicodeSet(*set1);
 346     if(*set1 != *set1copy || *set1 == *set2 ||
 347         getPairs(*set1) != getPairs(*set1copy) ||
 348         set1->hashCode() != set1copy->hashCode()){
 349         errln("FAIL : Error in copy construction");
 350         return;
 351     }
 352
 353     logln("Testing =operator");
 354     UnicodeSet set1equal=*set1;
 355     UnicodeSet set2equal=*set2;
 356     if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
 357         set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
 358         errln("FAIL: Error in =operator");
 359     }
 360
 361     logln("Testing clone()");
 362     UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
 363     UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
 364     if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
 365         *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
 366         *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
 367         errln("FAIL: Error in clone");
 368     }
 369
 370     logln("Testing hashcode");
 371     if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
 372         set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
 373         set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
 374         set1->hashCode() == set2->hashCode()  || set1copy->hashCode() == set2->hashCode() ||
 375         set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
 376         errln("FAIL: Error in hashCode()");
 377     }
 378
 379     delete set1;
 380     delete set1copy;
 381     delete set2;
 382     delete set1clone;
 383     delete set2clone;
 384
 385
 386 }
 387 void
 388 UnicodeSetTest::TestAddRemove(void) {
 389     UnicodeSet set; // Construct empty set
 390     doAssert(set.isEmpty() == TRUE, "set should be empty");
 391     doAssert(set.size() == 0, "size should be 0");
 392     set.complement();
 393     doAssert(set.size() == 0x110000, "size should be 0x110000");
 394     set.clear();
 395     set.add(0x0061, 0x007a);
 396     expectPairs(set, "az");
 397     doAssert(set.isEmpty() == FALSE, "set should not be empty");
 398     doAssert(set.size() != 0, "size should not be equal to 0");
 399     doAssert(set.size() == 26, "size should be equal to 26");
 400     set.remove(0x006d, 0x0070);
 401     expectPairs(set, "alqz");
 402     doAssert(set.size() == 22, "size should be equal to 22");
 403     set.remove(0x0065, 0x0067);
 404     expectPairs(set, "adhlqz");
 405     doAssert(set.size() == 19, "size should be equal to 19");
 406     set.remove(0x0064, 0x0069);
 407     expectPairs(set, "acjlqz");
 408     doAssert(set.size() == 16, "size should be equal to 16");
 409     set.remove(0x0063, 0x0072);
 410     expectPairs(set, "absz");
 411     doAssert(set.size() == 10, "size should be equal to 10");
 412     set.add(0x0066, 0x0071);
 413     expectPairs(set, "abfqsz");
 414     doAssert(set.size() == 22, "size should be equal to 22");
 415     set.remove(0x0061, 0x0067);
 416     expectPairs(set, "hqsz");
 417     set.remove(0x0061, 0x007a);
 418     expectPairs(set, "");
 419     doAssert(set.isEmpty() == TRUE, "set should be empty");
 420     doAssert(set.size() == 0, "size should be 0");
 421     set.add(0x0061);
 422     doAssert(set.isEmpty() == FALSE, "set should not be empty");
 423     doAssert(set.size() == 1, "size should not be equal to 1");
 424     set.add(0x0062);
 425     set.add(0x0063);
 426     expectPairs(set, "ac");
 427     doAssert(set.size() == 3, "size should not be equal to 3");
 428     set.add(0x0070);
 429     set.add(0x0071);
 430     expectPairs(set, "acpq");
 431     doAssert(set.size() == 5, "size should not be equal to 5");
 432     set.clear();
 433     expectPairs(set, "");
 434     doAssert(set.isEmpty() == TRUE, "set should be empty");
 435     doAssert(set.size() == 0, "size should be 0");
 436
 437     // Try removing an entire set from another set
 438     expectPattern(set, "[c-x]", "cx");
 439     UnicodeSet set2;
 440     expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
 441     set.removeAll(set2);
 442     expectPairs(set, "deluxx");
 443
 444     // Try adding an entire set to another set
 445     expectPattern(set, "[jackiemclean]", "aacceein");
 446     expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
 447     set.addAll(set2);
 448     expectPairs(set, "aacehort");
 449     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
 450
 451     // Try retaining an set of elements contained in another set (intersection)
 452     UnicodeSet set3;
 453     expectPattern(set3, "[a-c]", "ac");
 454     doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
 455     set3.remove(0x0062);
 456     expectPairs(set3, "aacc");
 457     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
 458     set.retainAll(set3);
 459     expectPairs(set, "aacc");
 460     doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
 461     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
 462     set.clear();
 463     doAssert(set.size() != set3.size(), "set.size() != set3.size()");
 464
 465     // Test commutativity
 466     expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
 467     expectPattern(set2, "[jackiemclean]", "aacceein");
 468     set.addAll(set2);
 469     expectPairs(set, "aacehort");
 470     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
 471
 472
 473
 474
 475 }
 476
 477 /**
 478  * Make sure minimal representation is maintained.
 479  */
 480 void UnicodeSetTest::TestMinimalRep() {
 481     UErrorCode status = U_ZERO_ERROR;
 482     // This is pretty thoroughly tested by checkCanonicalRep()
 483     // run against the exhaustive operation results.  Use the code
 484     // here for debugging specific spot problems.
 485
 486     // 1 overlap against 2
 487     UnicodeSet set("[h-km-q]", status);
 488     if (U_FAILURE(status)) { errln("FAIL"); return; }
 489     UnicodeSet set2("[i-o]", status);
 490     if (U_FAILURE(status)) { errln("FAIL"); return; }
 491     set.addAll(set2);
 492     expectPairs(set, "hq");
 493     // right
 494     set.applyPattern("[a-m]", status);
 495     if (U_FAILURE(status)) { errln("FAIL"); return; }
 496     set2.applyPattern("[e-o]", status);
 497     if (U_FAILURE(status)) { errln("FAIL"); return; }
 498     set.addAll(set2);
 499     expectPairs(set, "ao");
 500     // left
 501     set.applyPattern("[e-o]", status);
 502     if (U_FAILURE(status)) { errln("FAIL"); return; }
 503     set2.applyPattern("[a-m]", status);
 504     if (U_FAILURE(status)) { errln("FAIL"); return; }
 505     set.addAll(set2);
 506     expectPairs(set, "ao");
 507     // 1 overlap against 3
 508     set.applyPattern("[a-eg-mo-w]", status);
 509     if (U_FAILURE(status)) { errln("FAIL"); return; }
 510     set2.applyPattern("[d-q]", status);
 511     if (U_FAILURE(status)) { errln("FAIL"); return; }
 512     set.addAll(set2);
 513     expectPairs(set, "aw");
 514 }
 515
 516 void UnicodeSetTest::TestAPI() {
 517     UErrorCode status = U_ZERO_ERROR;
 518     // default ct
 519     UnicodeSet set;
 520     if (!set.isEmpty() || set.getRangeCount() != 0) {
 521         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
 522               set);
 523     }
 524
 525     // clear(), isEmpty()
 526     set.add(0x0061);
 527     if (set.isEmpty()) {
 528         errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
 529               set);
 530     }
 531     set.clear();
 532     if (!set.isEmpty()) {
 533         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
 534               set);
 535     }
 536
 537     // size()
 538     set.clear();
 539     if (set.size() != 0) {
 540         errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
 541               ": " + set);
 542     }
 543     set.add(0x0061);
 544     if (set.size() != 1) {
 545         errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
 546               ": " + set);
 547     }
 548     set.add(0x0031, 0x0039);
 549     if (set.size() != 10) {
 550         errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
 551               ": " + set);
 552     }
 553
 554     // contains(first, last)
 555     set.clear();
 556     set.applyPattern("[A-Y 1-8 b-d l-y]", status);
 557     if (U_FAILURE(status)) { errln("FAIL"); return; }
 558     for (int32_t i = 0; i<set.getRangeCount(); ++i) {
 559         UChar32 a = set.getRangeStart(i);
 560         UChar32 b = set.getRangeEnd(i);
 561         if (!set.contains(a, b)) {
 562             errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
 563                   " but doesn't: " + set);
 564         }
 565         if (set.contains((UChar32)(a-1), b)) {
 566             errln((UnicodeString)"FAIL, shouldn't contain " +
 567                   (unsigned short)(a-1) + '-' + (unsigned short)b +
 568                   " but does: " + set);
 569         }
 570         if (set.contains(a, (UChar32)(b+1))) {
 571             errln((UnicodeString)"FAIL, shouldn't contain " +
 572                   (unsigned short)a + '-' + (unsigned short)(b+1) +
 573                   " but does: " + set);
 574         }
 575     }
 576
 577     // Ported InversionList test.
 578     UnicodeSet a((UChar32)3,(UChar32)10);
 579     UnicodeSet b((UChar32)7,(UChar32)15);
 580     UnicodeSet c;
 581
 582     logln((UnicodeString)"a [3-10]: " + a);
 583     logln((UnicodeString)"b [7-15]: " + b);
 584     c = a;
 585     c.addAll(b);
 586     UnicodeSet exp((UChar32)3,(UChar32)15);
 587     if (c == exp) {
 588         logln((UnicodeString)"c.set(a).add(b): " + c);
 589     } else {
 590         errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
 591     }
 592     c.complement();
 593     exp.set((UChar32)0, (UChar32)2);
 594     exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
 595     if (c == exp) {
 596         logln((UnicodeString)"c.complement(): " + c);
 597     } else {
 598         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
 599     }
 600     c.complement();
 601     exp.set((UChar32)3, (UChar32)15);
 602     if (c == exp) {
 603         logln((UnicodeString)"c.complement(): " + c);
 604     } else {
 605         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
 606     }
 607     c = a;
 608     c.complementAll(b);
 609     exp.set((UChar32)3,(UChar32)6);
 610     exp.add((UChar32)11,(UChar32) 15);
 611     if (c == exp) {
 612         logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
 613     } else {
 614         errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
 615     }
 616
 617     exp = c;
 618     bitsToSet(setToBits(c), c);
 619     if (c == exp) {
 620         logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
 621     } else {
 622         errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
 623     }
 624
 625     // Additional tests for coverage JB#2118
 626     //UnicodeSet::complement(class UnicodeString const &)
 627     //UnicodeSet::complementAll(class UnicodeString const &)
 628     //UnicodeSet::containsNone(class UnicodeSet const &)
 629     //UnicodeSet::containsNone(long,long)
 630     //UnicodeSet::containsSome(class UnicodeSet const &)
 631     //UnicodeSet::containsSome(long,long)
 632     //UnicodeSet::removeAll(class UnicodeString const &)
 633     //UnicodeSet::retain(long)
 634     //UnicodeSet::retainAll(class UnicodeString const &)
 635     //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
 636     //UnicodeSetIterator::getString(void)
 637     set.clear();
 638     set.complement("ab");
 639     exp.applyPattern("[{ab}]", status);
 640     if (U_FAILURE(status)) { errln("FAIL"); return; }
 641     if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
 642
 643     UnicodeSetIterator iset(set);
 644     if (!iset.next() || !iset.isString()) {
 645         errln("FAIL: UnicodeSetIterator::next/isString");
 646     } else if (iset.getString() != "ab") {
 647         errln("FAIL: UnicodeSetIterator::getString");
 648     }
 649
 650     set.add((UChar32)0x61, (UChar32)0x7A);
 651     set.complementAll("alan");
 652     exp.applyPattern("[{ab}b-kmo-z]", status);
 653     if (U_FAILURE(status)) { errln("FAIL"); return; }
 654     if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
 655
 656     exp.applyPattern("[a-z]", status);
 657     if (U_FAILURE(status)) { errln("FAIL"); return; }
 658     if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
 659     if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
 660     exp.applyPattern("[aln]", status);
 661     if (U_FAILURE(status)) { errln("FAIL"); return; }
 662     if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
 663     if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
 664
 665     if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
 666         errln("FAIL: containsNone(UChar32, UChar32)");
 667     }
 668     if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
 669         errln("FAIL: containsSome(UChar32, UChar32)");
 670     }
 671     if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
 672         errln("FAIL: containsNone(UChar32, UChar32)");
 673     }
 674     if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
 675         errln("FAIL: containsSome(UChar32, UChar32)");
 676     }
 677
 678     set.removeAll("liu");
 679     exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
 680     if (U_FAILURE(status)) { errln("FAIL"); return; }
 681     if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
 682
 683     set.retainAll("star");
 684     exp.applyPattern("[rst]", status);
 685     if (U_FAILURE(status)) { errln("FAIL"); return; }
 686     if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
 687
 688     set.retain((UChar32)0x73);
 689     exp.applyPattern("[s]", status);
 690     if (U_FAILURE(status)) { errln("FAIL"); return; }
 691     if (set != exp) { errln("FAIL: retain('s')"); return; }
 692
 693     uint16_t buf[32];
 694     int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status);
 695     if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
 696     if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
 697         errln("FAIL: serialize");
 698         return;
 699     }
 700
 701     // Conversions to and from USet
 702     UnicodeSet *uniset = &set;
 703     USet *uset = uniset->toUSet();
 704     TEST_ASSERT((void *)uset == (void *)uniset);
 705     UnicodeSet *setx = UnicodeSet::fromUSet(uset);
 706     TEST_ASSERT((void *)setx == (void *)uset);
 707     const UnicodeSet *constSet = uniset;
 708     const USet *constUSet = constSet->toUSet();
 709     TEST_ASSERT((void *)constUSet == (void *)constSet);
 710     const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
 711     TEST_ASSERT((void *)constSetx == (void *)constUSet);
 712
 713     // span(UnicodeString) and spanBack(UnicodeString) convenience methods
 714     UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
 715     UnicodeSet ac(0x61, 0x63);
 716     ac.remove(0x62).freeze();
 717     if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
 718         ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
 719         ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
 720         ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
 721         ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
 722         ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
 723         ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
 724         ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
 725         ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
 726         ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
 727     ) {
 728         errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
 729     }
 730     if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
 731         ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
 732         ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
 733         ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
 734         ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
 735         ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
 736         ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
 737         ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
 738         ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
 739         ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
 740     ) {
 741         errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
 742     }
 743 }
 744
 745 void UnicodeSetTest::TestIteration() {
 746     UErrorCode ec = U_ZERO_ERROR;
 747     int i = 0;
 748     int outerLoop;
 749
 750     // 6 code points, 3 ranges, 2 strings, 8 total elements
 751     //   Iteration will access them in sorted order -  a, b, c, y, z, U0001abcd, "str1", "str2"
 752     UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
 753     TEST_ASSERT_SUCCESS(ec);
 754     UnicodeSetIterator it(set);
 755
 756     for (outerLoop=0; outerLoop<3; outerLoop++) {
 757         // Run the test multiple times, to check that iterator.reset() is working.
 758         for (i=0; i<10; i++) {
 759             UBool         nextv        = it.next();
 760             UBool         isString     = it.isString();
 761             int32_t       codePoint    = it.getCodepoint();
 762             //int32_t       codePointEnd = it.getCodepointEnd();
 763             UnicodeString s   = it.getString();
 764             switch (i) {
 765             case 0:
 766                 TEST_ASSERT(nextv == TRUE);
 767                 TEST_ASSERT(isString == FALSE);
 768                 TEST_ASSERT(codePoint==0x61);
 769                 TEST_ASSERT(s == "a");
 770                 break;
 771             case 1:
 772                 TEST_ASSERT(nextv == TRUE);
 773                 TEST_ASSERT(isString == FALSE);
 774                 TEST_ASSERT(codePoint==0x62);
 775                 TEST_ASSERT(s == "b");
 776                 break;
 777             case 2:
 778                 TEST_ASSERT(nextv == TRUE);
 779                 TEST_ASSERT(isString == FALSE);
 780                 TEST_ASSERT(codePoint==0x63);
 781                 TEST_ASSERT(s == "c");
 782                 break;
 783             case 3:
 784                 TEST_ASSERT(nextv == TRUE);
 785                 TEST_ASSERT(isString == FALSE);
 786                 TEST_ASSERT(codePoint==0x79);
 787                 TEST_ASSERT(s == "y");
 788                 break;
 789             case 4:
 790                 TEST_ASSERT(nextv == TRUE);
 791                 TEST_ASSERT(isString == FALSE);
 792                 TEST_ASSERT(codePoint==0x7a);
 793                 TEST_ASSERT(s == "z");
 794                 break;
 795             case 5:
 796                 TEST_ASSERT(nextv == TRUE);
 797                 TEST_ASSERT(isString == FALSE);
 798                 TEST_ASSERT(codePoint==0x1abcd);
 799                 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
 800                 break;
 801             case 6:
 802                 TEST_ASSERT(nextv == TRUE);
 803                 TEST_ASSERT(isString == TRUE);
 804                 TEST_ASSERT(s == "str1");
 805                 break;
 806             case 7:
 807                 TEST_ASSERT(nextv == TRUE);
 808                 TEST_ASSERT(isString == TRUE);
 809                 TEST_ASSERT(s == "str2");
 810                 break;
 811             case 8:
 812                 TEST_ASSERT(nextv == FALSE);
 813                 break;
 814             case 9:
 815                 TEST_ASSERT(nextv == FALSE);
 816                 break;
 817             }
 818         }
 819         it.reset();  // prepare to run the iteration again.
 820     }
 821 }
 822
 823
 824
 825
 826 void UnicodeSetTest::TestStrings() {
 827     UErrorCode ec = U_ZERO_ERROR;
 828
 829     UnicodeSet* testList[] = {
 830         UnicodeSet::createFromAll("abc"),
 831         new UnicodeSet("[a-c]", ec),
 832
 833         &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
 834         new UnicodeSet("[{ll}{ch}a-z]", ec),
 835
 836         UnicodeSet::createFrom("ab}c"),
 837         new UnicodeSet("[{ab\\}c}]", ec),
 838
 839         &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
 840         new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
 841
 842         NULL
 843     };
 844
 845     if (U_FAILURE(ec)) {
 846         errln("FAIL: couldn't construct test sets");
 847     }
 848
 849     for (int32_t i = 0; testList[i] != NULL; i+=2) {
 850         if (U_SUCCESS(ec)) {
 851             UnicodeString pat0, pat1;
 852             testList[i]->toPattern(pat0, TRUE);
 853             testList[i+1]->toPattern(pat1, TRUE);
 854             if (*testList[i] == *testList[i+1]) {
 855                 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
 856             } else {
 857                 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
 858             }
 859         }
 860         delete testList[i];
 861         delete testList[i+1];
 862     }
 863 }
 864
 865 /**
 866  * Test the [:Latin:] syntax.
 867  */
 868 void UnicodeSetTest::TestScriptSet() {
 869     expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
 870
 871     expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
 872
 873     /* Jitterbug 1423 */
 874     expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
 875
 876 }
 877
 878 /**
 879  * Test the [:Latin:] syntax.
 880  */
 881 void UnicodeSetTest::TestPropertySet() {
 882     static const char* const DATA[] = {
 883         // Pattern, Chars IN, Chars NOT in
 884
 885         "[:Latin:]",
 886         "aA",
 887         "\\u0391\\u03B1",
 888
 889         "[\\p{Greek}]",
 890         "\\u0391\\u03B1",
 891         "aA",
 892
 893         "\\P{ GENERAL Category = upper case letter }",
 894         "abc",
 895         "ABC",
 896
 897 #if !UCONFIG_NO_NORMALIZATION
 898         // Combining class: @since ICU 2.2
 899         // Check both symbolic and numeric
 900         "\\p{ccc=Nukta}",
 901         "\\u0ABC",
 902         "abc",
 903
 904         "\\p{Canonical Combining Class = 11}",
 905         "\\u05B1",
 906         "\\u05B2",
 907
 908         "[:c c c = iota subscript :]",
 909         "\\u0345",
 910         "xyz",
 911 #endif
 912
 913         // Bidi class: @since ICU 2.2
 914         "\\p{bidiclass=lefttoright}",
 915         "abc",
 916         "\\u0671\\u0672",
 917
 918         // Binary properties: @since ICU 2.2
 919         "\\p{ideographic}",
 920         "\\u4E0A",
 921         "x",
 922
 923         "[:math=false:]",
 924         "q)*(",
 925         // weiv: )(and * were removed from math in Unicode 4.0.1
 926         //"(*+)",
 927         "+<>^",
 928
 929         // JB#1767 \N{}, \p{ASCII}
 930         "[:Ascii:]",
 931         "abc\\u0000\\u007F",
 932         "\\u0080\\u4E00",
 933
 934         "[\\N{ latin small letter  a  }[:name= latin small letter z:]]",
 935         "az",
 936         "qrs",
 937
 938         // JB#2015
 939         "[:any:]",
 940         "a\\U0010FFFF",
 941         "",
 942
 943         "[:nv=0.5:]",
 944         "\\u00BD\\u0F2A",
 945         "\\u00BC",
 946
 947         // JB#2653: Age
 948         "[:Age=1.1:]",
 949         "\\u03D6", // 1.1
 950         "\\u03D8\\u03D9", // 3.2
 951
 952         "[:Age=3.1:]",
 953         "\\u1800\\u3400\\U0002f800",
 954         "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
 955
 956         // JB#2350: Case_Sensitive
 957         "[:Case Sensitive:]",
 958         "A\\u1FFC\\U00010410",
 959         ";\\u00B4\\U00010500",
 960
 961         // JB#2832: C99-compatibility props
 962         "[:blank:]",
 963         " \\u0009",
 964         "1-9A-Z",
 965
 966         "[:graph:]",
 967         "19AZ",
 968         " \\u0003\\u0007\\u0009\\u000A\\u000D",
 969
 970         "[:punct:]",
 971         "!@#%&*()[]{}-_\\/;:,.?'\"",
 972         "09azAZ",
 973
 974         "[:xdigit:]",
 975         "09afAF",
 976         "gG!",
 977
 978         // Regex compatibility test
 979         "[-b]", // leading '-' is literal
 980         "-b",
 981         "ac",
 982
 983         "[^-b]", // leading '-' is literal
 984         "ac",
 985         "-b",
 986
 987         "[b-]", // trailing '-' is literal
 988         "-b",
 989         "ac",
 990
 991         "[^b-]", // trailing '-' is literal
 992         "ac",
 993         "-b",
 994
 995         "[a-b-]", // trailing '-' is literal
 996         "ab-",
 997         "c=",
 998
 999         "[[a-q]&[p-z]-]", // trailing '-' is literal
1000         "pq-",
1001         "or=",
1002
1003         "[\\s|\\)|:|$|\\>]", // from regex tests
1004         "s|):$>",
1005         "abc",
1006
1007         "[\\uDC00cd]", // JB#2906: isolated trail at start
1008         "cd\\uDC00",
1009         "ab\\uD800\\U00010000",
1010
1011         "[ab\\uD800]", // JB#2906: isolated trail at start
1012         "ab\\uD800",
1013         "cd\\uDC00\\U00010000",
1014
1015         "[ab\\uD800cd]", // JB#2906: isolated lead in middle
1016         "abcd\\uD800",
1017         "ef\\uDC00\\U00010000",
1018
1019         "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
1020         "abcd\\uDC00",
1021         "ef\\uD800\\U00010000",
1022
1023 #if !UCONFIG_NO_NORMALIZATION
1024         "[:^lccc=0:]", // Lead canonical class
1025         "\\u0300\\u0301",
1026         "abcd\\u00c0\\u00c5",
1027
1028         "[:^tccc=0:]", // Trail canonical class
1029         "\\u0300\\u0301\\u00c0\\u00c5",
1030         "abcd",
1031
1032         "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
1033         "\\u0300\\u0301\\u00c0\\u00c5",
1034         "abcd",
1035
1036         "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
1037         "",
1038         "abcd\\u0300\\u0301\\u00c0\\u00c5",
1039
1040         "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
1041         "\\u0F73\\u0F75\\u0F81",
1042         "abcd\\u0300\\u0301\\u00c0\\u00c5",
1043 #endif /* !UCONFIG_NO_NORMALIZATION */
1044
1045         "[:Assigned:]",
1046         "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
1047         "\\u0888\\uFDD3\\uFFFE\\U00050005",
1048
1049         // Script_Extensions, new in Unicode 6.0
1050         "[:scx=Arab:]",
1051         "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
1052         "\\u061D\\uFDEF\\uFDFE",
1053
1054         // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
1055         // so scx-sc is missing U+FDF2.
1056         "[[:Script_Extensions=Arabic:]-[:Arab:]]",
1057         "\\u0640\\u064B\\u0650\\u0655\\uFDFD",
1058         "\\uFDF2"
1059     };
1060
1061     static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);
1062
1063     for (int32_t i=0; i<DATA_LEN; i+=3) {
1064         expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
1065                           CharsToUnicodeString(DATA[i+2]));
1066     }
1067 }
1068
1069 /**
1070   * Test that Posix style character classes [:digit:], etc.
1071   *   have the Unicode definitions from TR 18.
1072   */
1073 void UnicodeSetTest::TestPosixClasses() {
1074     {
1075         UErrorCode status = U_ZERO_ERROR;
1076         UnicodeSet s1("[:alpha:]", status);
1077         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
1078         TEST_ASSERT_SUCCESS(status);
1079         TEST_ASSERT(s1==s2);
1080     }
1081     {
1082         UErrorCode status = U_ZERO_ERROR;
1083         UnicodeSet s1("[:lower:]", status);
1084         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
1085         TEST_ASSERT_SUCCESS(status);
1086         TEST_ASSERT(s1==s2);
1087     }
1088     {
1089         UErrorCode status = U_ZERO_ERROR;
1090         UnicodeSet s1("[:upper:]", status);
1091         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
1092         TEST_ASSERT_SUCCESS(status);
1093         TEST_ASSERT(s1==s2);
1094     }
1095     {
1096         UErrorCode status = U_ZERO_ERROR;
1097         UnicodeSet s1("[:punct:]", status);
1098         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
1099         TEST_ASSERT_SUCCESS(status);
1100         TEST_ASSERT(s1==s2);
1101     }
1102     {
1103         UErrorCode status = U_ZERO_ERROR;
1104         UnicodeSet s1("[:digit:]", status);
1105         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
1106         TEST_ASSERT_SUCCESS(status);
1107         TEST_ASSERT(s1==s2);
1108     }
1109     {
1110         UErrorCode status = U_ZERO_ERROR;
1111         UnicodeSet s1("[:xdigit:]", status);
1112         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
1113         TEST_ASSERT_SUCCESS(status);
1114         TEST_ASSERT(s1==s2);
1115     }
1116     {
1117         UErrorCode status = U_ZERO_ERROR;
1118         UnicodeSet s1("[:alnum:]", status);
1119         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
1120         TEST_ASSERT_SUCCESS(status);
1121         TEST_ASSERT(s1==s2);
1122     }
1123     {
1124         UErrorCode status = U_ZERO_ERROR;
1125         UnicodeSet s1("[:space:]", status);
1126         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
1127         TEST_ASSERT_SUCCESS(status);
1128         TEST_ASSERT(s1==s2);
1129     }
1130     {
1131         UErrorCode status = U_ZERO_ERROR;
1132         UnicodeSet s1("[:blank:]", status);
1133         TEST_ASSERT_SUCCESS(status);
1134         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
1135             status);
1136         TEST_ASSERT_SUCCESS(status);
1137         TEST_ASSERT(s1==s2);
1138     }
1139     {
1140         UErrorCode status = U_ZERO_ERROR;
1141         UnicodeSet s1("[:cntrl:]", status);
1142         TEST_ASSERT_SUCCESS(status);
1143         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
1144         TEST_ASSERT_SUCCESS(status);
1145         TEST_ASSERT(s1==s2);
1146     }
1147     {
1148         UErrorCode status = U_ZERO_ERROR;
1149         UnicodeSet s1("[:graph:]", status);
1150         TEST_ASSERT_SUCCESS(status);
1151         UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
1152         TEST_ASSERT_SUCCESS(status);
1153         TEST_ASSERT(s1==s2);
1154     }
1155     {
1156         UErrorCode status = U_ZERO_ERROR;
1157         UnicodeSet s1("[:print:]", status);
1158         TEST_ASSERT_SUCCESS(status);
1159         UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
1160         TEST_ASSERT_SUCCESS(status);
1161         TEST_ASSERT(s1==s2);
1162     }
1163 }
1164 /**
1165  * Test cloning of UnicodeSet.  For C++, we test the copy constructor.
1166  */
1167 void UnicodeSetTest::TestClone() {
1168     UErrorCode ec = U_ZERO_ERROR;
1169     UnicodeSet s("[abcxyz]", ec);
1170     UnicodeSet t(s);
1171     expectContainment(t, "abc", "def");
1172 }
1173
1174 /**
1175  * Test the indexOf() and charAt() methods.
1176  */
1177 void UnicodeSetTest::TestIndexOf() {
1178     UErrorCode ec = U_ZERO_ERROR;
1179     UnicodeSet set("[a-cx-y3578]", ec);
1180     if (U_FAILURE(ec)) {
1181         errln("FAIL: UnicodeSet constructor");
1182         return;
1183     }
1184     for (int32_t i=0; i<set.size(); ++i) {
1185         UChar32 c = set.charAt(i);
1186         if (set.indexOf(c) != i) {
1187             errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1188                 i, c, set.indexOf(c));
1189         }
1190     }
1191     UChar32 c = set.charAt(set.size());
1192     if (c != -1) {
1193         errln("FAIL: charAt(<out of range>) = %X", c);
1194     }
1195     int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
1196     if (j != -1) {
1197         errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1198     }
1199 }
1200
1201 /**
1202  * Test closure API.
1203  */
1204 void UnicodeSetTest::TestCloseOver() {
1205     UErrorCode ec = U_ZERO_ERROR;
1206
1207     char CASE[] = {(char)USET_CASE_INSENSITIVE};
1208     char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1209     const char* DATA[] = {
1210         // selector, input, output
1211         CASE,
1212         "[aq\\u00DF{Bc}{bC}{Fi}]",
1213         "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]",  // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
1214
1215         CASE,
1216         "[\\u01F1]", // 'DZ'
1217         "[\\u01F1\\u01F2\\u01F3]",
1218
1219         CASE,
1220         "[\\u1FB4]",
1221         "[\\u1FB4{\\u03AC\\u03B9}]",
1222
1223         CASE,
1224         "[{F\\uFB01}]",
1225         "[\\uFB03{ffi}]",
1226
1227         CASE, // make sure binary search finds limits
1228         "[a\\uFF3A]",
1229         "[aA\\uFF3A\\uFF5A]",
1230
1231         CASE,
1232         "[a-z]","[A-Za-z\\u017F\\u212A]",
1233         CASE,
1234         "[abc]","[A-Ca-c]",
1235         CASE,
1236         "[ABC]","[A-Ca-c]",
1237
1238         CASE, "[i]", "[iI]",
1239
1240         CASE, "[\\u0130]",          "[\\u0130{i\\u0307}]", // dotted I
1241         CASE, "[{i\\u0307}]",       "[\\u0130{i\\u0307}]", // i with dot
1242
1243         CASE, "[\\u0131]",          "[\\u0131]", // dotless i
1244
1245         CASE, "[\\u0390]",          "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1246
1247         CASE, "[\\u03c2]",          "[\\u03a3\\u03c2\\u03c3]", // sigmas
1248
1249         CASE, "[\\u03f2]",          "[\\u03f2\\u03f9]", // lunate sigmas
1250
1251         CASE, "[\\u03f7]",          "[\\u03f7\\u03f8]",
1252
1253         CASE, "[\\u1fe3]",          "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1254
1255         CASE, "[\\ufb05]",          "[\\ufb05\\ufb06{st}]",
1256         CASE, "[{st}]",             "[\\ufb05\\ufb06{st}]",
1257
1258         CASE, "[\\U0001044F]",      "[\\U00010427\\U0001044F]",
1259
1260         CASE, "[{a\\u02BE}]",       "[\\u1E9A{a\\u02BE}]", // first in sorted table
1261
1262         CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1263
1264 #if !UCONFIG_NO_FILE_IO
1265         CASE_MAPPINGS,
1266         "[aq\\u00DF{Bc}{bC}{Fi}]",
1267         "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1268 #endif
1269
1270         CASE_MAPPINGS,
1271         "[\\u01F1]", // 'DZ'
1272         "[\\u01F1\\u01F2\\u01F3]",
1273
1274         CASE_MAPPINGS,
1275         "[a-z]",
1276         "[A-Za-z]",
1277
1278         NULL
1279     };
1280
1281     UnicodeSet s;
1282     UnicodeSet t;
1283     UnicodeString buf;
1284     for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1285         int32_t selector = DATA[i][0];
1286         UnicodeString pat(DATA[i+1], -1, US_INV);
1287         UnicodeString exp(DATA[i+2], -1, US_INV);
1288         s.applyPattern(pat, ec);
1289         s.closeOver(selector);
1290         t.applyPattern(exp, ec);
1291         if (U_FAILURE(ec)) {
1292             errln("FAIL: applyPattern failed");
1293             continue;
1294         }
1295         if (s == t) {
1296             logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1297         } else {
1298             dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1299                   s.toPattern(buf, TRUE) + ", expected " + exp);
1300         }
1301     }
1302
1303 #if 0
1304     /*
1305      * Unused test code.
1306      * This was used to compare the old implementation (using USET_CASE)
1307      * with the new one (using 0x100 temporarily)
1308      * while transitioning from hardcoded case closure tables in uniset.cpp
1309      * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1310      * and using ucase.c functions for closure.
1311      * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1312      *
1313      * Note: The old and new implementation never fully matched because
1314      * the old implementation turned out to not map U+0130 and U+0131 correctly
1315      * (dotted I and dotless i) and because the old implementation's data tables
1316      * were outdated compared to Unicode 4.0.1 at the time of the change to the
1317      * new implementation. (So sigmas and some other characters were not handled
1318      * according to the newer Unicode version.)
1319      */
1320     UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1321     UnicodeSetIterator si(sens);
1322     UnicodeString str, buf2;
1323     const UnicodeString *pStr;
1324     UChar32 c;
1325     while(si.next()) {
1326         if(!si.isString()) {
1327             c=si.getCodepoint();
1328             s.clear();
1329             s.add(c);
1330
1331             str.setTo(c);
1332             str.foldCase();
1333             sens2.add(str);
1334
1335             t=s;
1336             s.closeOver(USET_CASE);
1337             t.closeOver(0x100);
1338             if(s!=t) {
1339                 errln("FAIL: closeOver(U+%04x) differs: ", c);
1340                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1341             }
1342         }
1343     }
1344     // remove all code points
1345     // should contain all full case folding mapping strings
1346     sens2.remove(0, 0x10ffff);
1347     si.reset(sens2);
1348     while(si.next()) {
1349         if(si.isString()) {
1350             pStr=&si.getString();
1351             s.clear();
1352             s.add(*pStr);
1353             t=s2=s;
1354             s.closeOver(USET_CASE);
1355             t.closeOver(0x100);
1356             if(s!=t) {
1357                 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1358                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1359             }
1360         }
1361     }
1362 #endif
1363
1364     // Test the pattern API
1365     s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1366     if (U_FAILURE(ec)) {
1367         errln("FAIL: applyPattern failed");
1368     } else {
1369         expectContainment(s, "abcABC", "defDEF");
1370     }
1371     UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1372     if (U_FAILURE(ec)) {
1373         errln("FAIL: constructor failed");
1374     } else {
1375         expectContainment(v, "defDEF", "abcABC");
1376     }
1377     UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1378     if (U_FAILURE(ec)) {
1379         errln("FAIL: construct w/case mappings failed");
1380     } else {
1381         expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1382     }
1383 }
1384
1385 void UnicodeSetTest::TestEscapePattern() {
1386     const char pattern[] =
1387         "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1388     const char exp[] =
1389         "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1390     // We test this with two passes; in the second pass we
1391     // pre-unescape the pattern.  Since U+200E is Pattern_White_Space,
1392     // this fails -- which is what we expect.
1393     for (int32_t pass=1; pass<=2; ++pass) {
1394         UErrorCode ec = U_ZERO_ERROR;
1395         UnicodeString pat(pattern, -1, US_INV);
1396         if (pass==2) {
1397             pat = pat.unescape();
1398         }
1399         // Pattern is only good for pass 1
1400         UBool isPatternValid = (pass==1);
1401
1402         UnicodeSet set(pat, ec);
1403         if (U_SUCCESS(ec) != isPatternValid){
1404             errln((UnicodeString)"FAIL: applyPattern(" +
1405                   escape(pat) + ") => " +
1406                   u_errorName(ec));
1407             continue;
1408         }
1409         if (U_FAILURE(ec)) {
1410             continue;
1411         }
1412         if (set.contains((UChar)0x0644)){
1413             errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1414         }
1415
1416         UnicodeString newpat;
1417         set.toPattern(newpat, TRUE);
1418         if (newpat == UnicodeString(exp, -1, US_INV)) {
1419             logln(escape(pat) + " => " + newpat);
1420         } else {
1421             errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1422         }
1423
1424         for (int32_t i=0; i<set.getRangeCount(); ++i) {
1425             UnicodeString str("Range ");
1426             str.append((UChar)(0x30 + i))
1427                 .append(": ")
1428                 .append((UChar32)set.getRangeStart(i))
1429                 .append(" - ")
1430                 .append((UChar32)set.getRangeEnd(i));
1431             str = str + " (" + set.getRangeStart(i) + " - " +
1432                 set.getRangeEnd(i) + ")";
1433             if (set.getRangeStart(i) < 0) {
1434                 errln((UnicodeString)"FAIL: " + escape(str));
1435             } else {
1436                 logln(escape(str));
1437             }
1438         }
1439     }
1440 }
1441
1442 void UnicodeSetTest::expectRange(const UnicodeString& label,
1443                                  const UnicodeSet& set,
1444                                  UChar32 start, UChar32 end) {
1445     UnicodeSet exp(start, end);
1446     UnicodeString pat;
1447     if (set == exp) {
1448         logln(label + " => " + set.toPattern(pat, TRUE));
1449     } else {
1450         UnicodeString xpat;
1451         errln((UnicodeString)"FAIL: " + label + " => " +
1452               set.toPattern(pat, TRUE) +
1453               ", expected " + exp.toPattern(xpat, TRUE));
1454     }
1455 }
1456
1457 void UnicodeSetTest::TestInvalidCodePoint() {
1458
1459     const UChar32 DATA[] = {
1460         // Test range             Expected range
1461         0, 0x10FFFF,              0, 0x10FFFF,
1462         (UChar32)-1, 8,           0, 8,
1463         8, 0x110000,              8, 0x10FFFF
1464     };
1465     const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]);
1466
1467     UnicodeString pat;
1468     int32_t i;
1469
1470     for (i=0; i<DATA_LENGTH; i+=4) {
1471         UChar32 start  = DATA[i];
1472         UChar32 end    = DATA[i+1];
1473         UChar32 xstart = DATA[i+2];
1474         UChar32 xend   = DATA[i+3];
1475
1476         // Try various API using the test code points
1477
1478         UnicodeSet set(start, end);
1479         expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1480                     set, xstart, xend);
1481
1482         set.clear();
1483         set.set(start, end);
1484         expectRange((UnicodeString)"set(" + start + "," + end + ")",
1485                     set, xstart, xend);
1486
1487         UBool b = set.contains(start);
1488         b = set.contains(start, end);
1489         b = set.containsNone(start, end);
1490         b = set.containsSome(start, end);
1491
1492         /*int32_t index = set.indexOf(start);*/
1493
1494         set.clear();
1495         set.add(start);
1496         set.add(start, end);
1497         expectRange((UnicodeString)"add(" + start + "," + end + ")",
1498                     set, xstart, xend);
1499
1500         set.set(0, 0x10FFFF);
1501         set.retain(start, end);
1502         expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1503                     set, xstart, xend);
1504         set.retain(start);
1505
1506         set.set(0, 0x10FFFF);
1507         set.remove(start);
1508         set.remove(start, end);
1509         set.complement();
1510         expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1511                     set, xstart, xend);
1512
1513         set.set(0, 0x10FFFF);
1514         set.complement(start, end);
1515         set.complement();
1516         expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1517                     set, xstart, xend);
1518         set.complement(start);
1519     }
1520
1521     const UChar32 DATA2[] = {
1522         0,
1523         0x10FFFF,
1524         (UChar32)-1,
1525         0x110000
1526     };
1527     const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]);
1528
1529     for (i=0; i<DATA2_LENGTH; ++i) {
1530         UChar32 c = DATA2[i], end = 0x10FFFF;
1531         UBool valid = (c >= 0 && c <= 0x10FFFF);
1532
1533         UnicodeSet set(0, 0x10FFFF);
1534
1535         // For single-codepoint contains, invalid codepoints are NOT contained
1536         UBool b = set.contains(c);
1537         if (b == valid) {
1538             logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1539                   ") = " + b);
1540         } else {
1541             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1542                   ") = " + b);
1543         }
1544
1545         // For codepoint range contains, containsNone, and containsSome,
1546         // invalid or empty (start > end) ranges have UNDEFINED behavior.
1547         b = set.contains(c, end);
1548         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1549               "," + end + ") = " + b);
1550
1551         b = set.containsNone(c, end);
1552         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1553               "," + end + ") = " + b);
1554
1555         b = set.containsSome(c, end);
1556         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1557               "," + end + ") = " + b);
1558
1559         int32_t index = set.indexOf(c);
1560         if ((index >= 0) == valid) {
1561             logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1562                   ") = " + index);
1563         } else {
1564             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1565                   ") = " + index);
1566         }
1567     }
1568 }
1569
1570 // Used by TestSymbolTable
1571 class TokenSymbolTable : public SymbolTable {
1572 public:
1573     Hashtable contents;
1574
1575     TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1576         contents.setValueDeleter(uprv_deleteUObject);
1577     }
1578
1579     ~TokenSymbolTable() {}
1580
1581     /**
1582      * (Non-SymbolTable API) Add the given variable and value to
1583      * the table.  Variable should NOT contain leading '$'.
1584      */
1585     void add(const UnicodeString& var, const UnicodeString& value,
1586              UErrorCode& ec) {
1587         if (U_SUCCESS(ec)) {
1588             contents.put(var, new UnicodeString(value), ec);
1589         }
1590     }
1591
1592     /**
1593      * SymbolTable API
1594      */
1595     virtual const UnicodeString* lookup(const UnicodeString& s) const {
1596         return (const UnicodeString*) contents.get(s);
1597     }
1598
1599     /**
1600      * SymbolTable API
1601      */
1602     virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1603         return NULL;
1604     }
1605
1606     /**
1607      * SymbolTable API
1608      */
1609     virtual UnicodeString parseReference(const UnicodeString& text,
1610                                          ParsePosition& pos, int32_t limit) const {
1611         int32_t start = pos.getIndex();
1612         int32_t i = start;
1613         UnicodeString result;
1614         while (i < limit) {
1615             UChar c = text.charAt(i);
1616             if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1617                 break;
1618             }
1619             ++i;
1620         }
1621         if (i == start) { // No valid name chars
1622             return result; // Indicate failure with empty string
1623         }
1624         pos.setIndex(i);
1625         text.extractBetween(start, i, result);
1626         return result;
1627     }
1628 };
1629
1630 void UnicodeSetTest::TestSymbolTable() {
1631     // Multiple test cases can be set up here.  Each test case
1632     // is terminated by null:
1633     // var, value, var, value,..., input pat., exp. output pat., null
1634     const char* DATA[] = {
1635         "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1636         "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1637         "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1638         NULL
1639     };
1640
1641     for (int32_t i=0; DATA[i]!=NULL; ++i) {
1642         UErrorCode ec = U_ZERO_ERROR;
1643         TokenSymbolTable sym(ec);
1644         if (U_FAILURE(ec)) {
1645             errln("FAIL: couldn't construct TokenSymbolTable");
1646             continue;
1647         }
1648
1649         // Set up variables
1650         while (DATA[i+2] != NULL) {
1651             sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
1652             if (U_FAILURE(ec)) {
1653                 errln("FAIL: couldn't add to TokenSymbolTable");
1654                 continue;
1655             }
1656             i += 2;
1657         }
1658
1659         // Input pattern and expected output pattern
1660         UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
1661         i += 2;
1662
1663         ParsePosition pos(0);
1664         UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1665         if (U_FAILURE(ec)) {
1666             errln("FAIL: couldn't construct UnicodeSet");
1667             continue;
1668         }
1669
1670         // results
1671         if (pos.getIndex() != inpat.length()) {
1672             errln((UnicodeString)"Failed to read to end of string \""
1673                   + inpat + "\": read to "
1674                   + pos.getIndex() + ", length is "
1675                   + inpat.length());
1676         }
1677
1678         UnicodeSet us2(exppat, ec);
1679         if (U_FAILURE(ec)) {
1680             errln("FAIL: couldn't construct expected UnicodeSet");
1681             continue;
1682         }
1683
1684         UnicodeString a, b;
1685         if (us != us2) {
1686             errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1687                   ", expected " + us2.toPattern(b, TRUE));
1688         } else {
1689             logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1690         }
1691     }
1692 }
1693
1694 void UnicodeSetTest::TestSurrogate() {
1695     const char* DATA[] = {
1696         // These should all behave identically
1697         "[abc\\uD800\\uDC00]",
1698         // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1699         "[abc\\U00010000]",
1700         0
1701     };
1702     for (int i=0; DATA[i] != 0; ++i) {
1703         UErrorCode ec = U_ZERO_ERROR;
1704         logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
1705         UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
1706         UnicodeSet set(str, ec);
1707         if (U_FAILURE(ec)) {
1708             errln("FAIL: UnicodeSet constructor");
1709             continue;
1710         }
1711         expectContainment(set,
1712                           CharsToUnicodeString("abc\\U00010000"),
1713                           CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1714         if (set.size() != 4) {
1715             errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
1716                   set.size() + ", expected 4");
1717         }
1718     }
1719 }
1720
1721 void UnicodeSetTest::TestExhaustive() {
1722     // exhaustive tests. Simulate UnicodeSets with integers.
1723     // That gives us very solid tests (except for large memory tests).
1724
1725     int32_t limit = 128;
1726
1727     UnicodeSet x, y, z, aa;
1728
1729     for (int32_t i = 0; i < limit; ++i) {
1730         bitsToSet(i, x);
1731         logln((UnicodeString)"Testing " + i + ", " + x);
1732         _testComplement(i, x, y);
1733
1734         // AS LONG AS WE ARE HERE, check roundtrip
1735         checkRoundTrip(bitsToSet(i, aa));
1736
1737         for (int32_t j = 0; j < limit; ++j) {
1738             _testAdd(i,j,  x,y,z);
1739             _testXor(i,j,  x,y,z);
1740             _testRetain(i,j,  x,y,z);
1741             _testRemove(i,j,  x,y,z);
1742         }
1743     }
1744 }
1745
1746 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1747     bitsToSet(a, x);
1748     z = x;
1749     z.complement();
1750     int32_t c = setToBits(z);
1751     if (c != (~a)) {
1752         errln((UnicodeString)"FAILED: add: ~" + x +  " != " + z);
1753         errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1754     }
1755     checkCanonicalRep(z, (UnicodeString)"complement " + a);
1756 }
1757
1758 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1759     bitsToSet(a, x);
1760     bitsToSet(b, y);
1761     z = x;
1762     z.addAll(y);
1763     int32_t c = setToBits(z);
1764     if (c != (a | b)) {
1765         errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1766         errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1767     }
1768     checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1769 }
1770
1771 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1772     bitsToSet(a, x);
1773     bitsToSet(b, y);
1774     z = x;
1775     z.retainAll(y);
1776     int32_t c = setToBits(z);
1777     if (c != (a & b)) {
1778         errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1779         errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1780     }
1781     checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1782 }
1783
1784 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1785     bitsToSet(a, x);
1786     bitsToSet(b, y);
1787     z = x;
1788     z.removeAll(y);
1789     int32_t c = setToBits(z);
1790     if (c != (a &~ b)) {
1791         errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1792         errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1793     }
1794     checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1795 }
1796
1797 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1798     bitsToSet(a, x);
1799     bitsToSet(b, y);
1800     z = x;
1801     z.complementAll(y);
1802     int32_t c = setToBits(z);
1803     if (c != (a ^ b)) {
1804         errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1805         errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1806     }
1807     checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1808 }
1809
1810 /**
1811  * Check that ranges are monotonically increasing and non-
1812  * overlapping.
1813  */
1814 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1815     int32_t n = set.getRangeCount();
1816     if (n < 0) {
1817         errln((UnicodeString)"FAIL result of " + msg +
1818               ": range count should be >= 0 but is " +
1819               n /*+ " for " + set.toPattern())*/);
1820         return;
1821     }
1822     UChar32 last = 0;
1823     for (int32_t i=0; i<n; ++i) {
1824         UChar32 start = set.getRangeStart(i);
1825         UChar32 end = set.getRangeEnd(i);
1826         if (start > end) {
1827             errln((UnicodeString)"FAIL result of " + msg +
1828                   ": range " + (i+1) +
1829                   " start > end: " + (int)start + ", " + (int)end +
1830                   " for " + set);
1831         }
1832         if (i > 0 && start <= last) {
1833             errln((UnicodeString)"FAIL result of " + msg +
1834                   ": range " + (i+1) +
1835                   " overlaps previous range: " + (int)start + ", " + (int)end +
1836                   " for " + set);
1837         }
1838         last = end;
1839     }
1840 }
1841
1842 /**
1843  * Convert a bitmask to a UnicodeSet.
1844  */
1845 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1846     result.clear();
1847     for (UChar32 i = 0; i < 32; ++i) {
1848         if ((a & (1<<i)) != 0) {
1849             result.add(i);
1850         }
1851     }
1852     return result;
1853 }
1854
1855 /**
1856  * Convert a UnicodeSet to a bitmask.  Only the characters
1857  * U+0000 to U+0020 are represented in the bitmask.
1858  */
1859 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1860     int32_t result = 0;
1861     for (int32_t i = 0; i < 32; ++i) {
1862         if (x.contains((UChar32)i)) {
1863             result |= (1<<i);
1864         }
1865     }
1866     return result;
1867 }
1868
1869 /**
1870  * Return the representation of an inversion list based UnicodeSet
1871  * as a pairs list.  Ranges are listed in ascending Unicode order.
1872  * For example, the set [a-zA-M3] is represented as "33AMaz".
1873  */
1874 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1875     UnicodeString pairs;
1876     for (int32_t i=0; i<set.getRangeCount(); ++i) {
1877         UChar32 start = set.getRangeStart(i);
1878         UChar32 end = set.getRangeEnd(i);
1879         if (end > 0xFFFF) {
1880             end = 0xFFFF;
1881             i = set.getRangeCount(); // Should be unnecessary
1882         }
1883         pairs.append((UChar)start).append((UChar)end);
1884     }
1885     return pairs;
1886 }
1887
1888 /**
1889  * Basic consistency check for a few items.
1890  * That the iterator works, and that we can create a pattern and
1891  * get the same thing back
1892  */
1893 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1894     UErrorCode ec = U_ZERO_ERROR;
1895
1896     UnicodeSet t(s);
1897     checkEqual(s, t, "copy ct");
1898
1899     t = s;
1900     checkEqual(s, t, "operator=");
1901
1902     copyWithIterator(t, s, FALSE);
1903     checkEqual(s, t, "iterator roundtrip");
1904
1905     copyWithIterator(t, s, TRUE); // try range
1906     checkEqual(s, t, "iterator roundtrip");
1907
1908     UnicodeString pat; s.toPattern(pat, FALSE);
1909     t.applyPattern(pat, ec);
1910     if (U_FAILURE(ec)) {
1911         errln("FAIL: applyPattern");
1912         return;
1913     } else {
1914         checkEqual(s, t, "toPattern(false)");
1915     }
1916
1917     s.toPattern(pat, TRUE);
1918     t.applyPattern(pat, ec);
1919     if (U_FAILURE(ec)) {
1920         errln("FAIL: applyPattern");
1921         return;
1922     } else {
1923         checkEqual(s, t, "toPattern(true)");
1924     }
1925 }
1926
1927 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
1928     t.clear();
1929     UnicodeSetIterator it(s);
1930     if (withRange) {
1931         while (it.nextRange()) {
1932             if (it.isString()) {
1933                 t.add(it.getString());
1934             } else {
1935                 t.add(it.getCodepoint(), it.getCodepointEnd());
1936             }
1937         }
1938     } else {
1939         while (it.next()) {
1940             if (it.isString()) {
1941                 t.add(it.getString());
1942             } else {
1943                 t.add(it.getCodepoint());
1944             }
1945         }
1946     }
1947 }
1948
1949 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
1950     UnicodeString source; s.toPattern(source, TRUE);
1951     UnicodeString result; t.toPattern(result, TRUE);
1952     if (s != t) {
1953         errln((UnicodeString)"FAIL: " + message
1954               + "; source = " + source
1955               + "; result = " + result
1956               );
1957         return FALSE;
1958     } else {
1959         logln((UnicodeString)"Ok: " + message
1960               + "; source = " + source
1961               + "; result = " + result
1962               );
1963     }
1964     return TRUE;
1965 }
1966
1967 void
1968 UnicodeSetTest::expectContainment(const UnicodeString& pat,
1969                                   const UnicodeString& charsIn,
1970                                   const UnicodeString& charsOut) {
1971     UErrorCode ec = U_ZERO_ERROR;
1972     UnicodeSet set(pat, ec);
1973     if (U_FAILURE(ec)) {
1974         dataerrln((UnicodeString)"FAIL: pattern \"" +
1975               pat + "\" => " + u_errorName(ec));
1976         return;
1977     }
1978     expectContainment(set, pat, charsIn, charsOut);
1979 }
1980
1981 void
1982 UnicodeSetTest::expectContainment(const UnicodeSet& set,
1983                                   const UnicodeString& charsIn,
1984                                   const UnicodeString& charsOut) {
1985     UnicodeString pat;
1986     set.toPattern(pat);
1987     expectContainment(set, pat, charsIn, charsOut);
1988 }
1989
1990 void
1991 UnicodeSetTest::expectContainment(const UnicodeSet& set,
1992                                   const UnicodeString& setName,
1993                                   const UnicodeString& charsIn,
1994                                   const UnicodeString& charsOut) {
1995     UnicodeString bad;
1996     UChar32 c;
1997     int32_t i;
1998
1999     for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
2000         c = charsIn.char32At(i);
2001         if (!set.contains(c)) {
2002             bad.append(c);
2003         }
2004     }
2005     if (bad.length() > 0) {
2006         errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
2007               ", expected containment of " + prettify(charsIn));
2008     } else {
2009         logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
2010     }
2011
2012     bad.truncate(0);
2013     for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
2014         c = charsOut.char32At(i);
2015         if (set.contains(c)) {
2016             bad.append(c);
2017         }
2018     }
2019     if (bad.length() > 0) {
2020         errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
2021               ", expected non-containment of " + prettify(charsOut));
2022     } else {
2023         logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
2024     }
2025 }
2026
2027 void
2028 UnicodeSetTest::expectPattern(UnicodeSet& set,
2029                               const UnicodeString& pattern,
2030                               const UnicodeString& expectedPairs){
2031     UErrorCode status = U_ZERO_ERROR;
2032     set.applyPattern(pattern, status);
2033     if (U_FAILURE(status)) {
2034         errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2035               "\") failed");
2036         return;
2037     } else {
2038         if (getPairs(set) != expectedPairs ) {
2039             errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2040                   "\") => pairs \"" +
2041                   escape(getPairs(set)) + "\", expected \"" +
2042                   escape(expectedPairs) + "\"");
2043         } else {
2044             logln(UnicodeString("Ok:   applyPattern(\"") + pattern +
2045                   "\") => pairs \"" +
2046                   escape(getPairs(set)) + "\"");
2047         }
2048     }
2049     // the result of calling set.toPattern(), which is the string representation of
2050     // this set(set), is passed to a  UnicodeSet constructor, and tested that it
2051     // will produce another set that is equal to this one.
2052     UnicodeString temppattern;
2053     set.toPattern(temppattern);
2054     UnicodeSet *tempset=new UnicodeSet(temppattern, status);
2055     if (U_FAILURE(status)) {
2056         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
2057         return;
2058     }
2059     if(*tempset != set || getPairs(*tempset) != getPairs(set)){
2060         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
2061             escape(getPairs(set)) + "\""));
2062     } else{
2063         logln(UnicodeString("Ok:   applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2064     }
2065
2066     delete tempset;
2067
2068 }
2069
2070 void
2071 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2072     if (getPairs(set) != expectedPairs) {
2073         errln(UnicodeString("FAIL: Expected pair list \"") +
2074               escape(expectedPairs) + "\", got \"" +
2075               escape(getPairs(set)) + "\"");
2076     }
2077 }
2078
2079 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2080                                      const UnicodeString& expPat,
2081                                      const char** expStrings) {
2082     UnicodeString pat;
2083     set.toPattern(pat, TRUE);
2084     if (pat == expPat) {
2085         logln((UnicodeString)"Ok:   toPattern() => \"" + pat + "\"");
2086     } else {
2087         errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2088         return;
2089     }
2090     if (expStrings == NULL) {
2091         return;
2092     }
2093     UBool in = TRUE;
2094     for (int32_t i=0; expStrings[i] != NULL; ++i) {
2095         if (expStrings[i] == NOT) { // sic; pointer comparison
2096             in = FALSE;
2097             continue;
2098         }
2099         UnicodeString s = CharsToUnicodeString(expStrings[i]);
2100         UBool contained = set.contains(s);
2101         if (contained == in) {
2102             logln((UnicodeString)"Ok: " + expPat +
2103                   (contained ? " contains {" : " does not contain {") +
2104                   escape(expStrings[i]) + "}");
2105         } else {
2106             errln((UnicodeString)"FAIL: " + expPat +
2107                   (contained ? " contains {" : " does not contain {") +
2108                   escape(expStrings[i]) + "}");
2109         }
2110     }
2111 }
2112
2113 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
2114
2115 void
2116 UnicodeSetTest::doAssert(UBool condition, const char *message)
2117 {
2118     if (!condition) {
2119         errln(UnicodeString("ERROR : ") + message);
2120     }
2121 }
2122
2123 UnicodeString
2124 UnicodeSetTest::escape(const UnicodeString& s) {
2125     UnicodeString buf;
2126     for (int32_t i=0; i<s.length(); )
2127     {
2128         UChar32 c = s.char32At(i);
2129         if (0x0020 <= c && c <= 0x007F) {
2130             buf += c;
2131         } else {
2132             if (c <= 0xFFFF) {
2133                 buf += (UChar)0x5c; buf += (UChar)0x75;
2134             } else {
2135                 buf += (UChar)0x5c; buf += (UChar)0x55;
2136                 buf += toHexString((c & 0xF0000000) >> 28);
2137                 buf += toHexString((c & 0x0F000000) >> 24);
2138                 buf += toHexString((c & 0x00F00000) >> 20);
2139                 buf += toHexString((c & 0x000F0000) >> 16);
2140             }
2141             buf += toHexString((c & 0xF000) >> 12);
2142             buf += toHexString((c & 0x0F00) >> 8);
2143             buf += toHexString((c & 0x00F0) >> 4);
2144             buf += toHexString(c & 0x000F);
2145         }
2146         i += U16_LENGTH(c);
2147     }
2148     return buf;
2149 }
2150
2151 void UnicodeSetTest::TestFreezable() {
2152     UErrorCode errorCode=U_ZERO_ERROR;
2153     UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
2154     UnicodeSet idSet(idPattern, errorCode);
2155     if(U_FAILURE(errorCode)) {
2156         dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
2157         return;
2158     }
2159
2160     UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
2161     UnicodeSet wsSet(wsPattern, errorCode);
2162     if(U_FAILURE(errorCode)) {
2163         dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
2164         return;
2165     }
2166
2167     idSet.add(idPattern);
2168     UnicodeSet frozen(idSet);
2169     frozen.freeze();
2170
2171     if(idSet.isFrozen() || !frozen.isFrozen()) {
2172         errln("FAIL: isFrozen() is wrong");
2173     }
2174     if(frozen!=idSet || !(frozen==idSet)) {
2175         errln("FAIL: a copy-constructed frozen set differs from its original");
2176     }
2177
2178     frozen=wsSet;
2179     if(frozen!=idSet || !(frozen==idSet)) {
2180         errln("FAIL: a frozen set was modified by operator=");
2181     }
2182
2183     UnicodeSet frozen2(frozen);
2184     if(frozen2!=frozen || frozen2!=idSet) {
2185         errln("FAIL: a copied frozen set differs from its frozen original");
2186     }
2187     if(!frozen2.isFrozen()) {
2188         errln("FAIL: copy-constructing a frozen set results in a thawed one");
2189     }
2190     UnicodeSet frozen3(5, 55);  // Set to some values to really test assignment below, not copy construction.
2191     if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
2192         errln("FAIL: UnicodeSet(5, 55) failed");
2193     }
2194     frozen3=frozen;
2195     if(!frozen3.isFrozen()) {
2196         errln("FAIL: copying a frozen set results in a thawed one");
2197     }
2198
2199     UnicodeSet *cloned=(UnicodeSet *)frozen.clone();
2200     if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
2201         errln("FAIL: clone() failed");
2202     }
2203     cloned->add(0xd802, 0xd805);
2204     if(cloned->containsSome(0xd802, 0xd805)) {
2205         errln("FAIL: unable to modify clone");
2206     }
2207     delete cloned;
2208
2209     UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed();
2210     if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
2211         errln("FAIL: cloneAsThawed() failed");
2212     }
2213     thawed->add(0xd802, 0xd805);
2214     if(!thawed->contains(0xd802, 0xd805)) {
2215         errln("FAIL: unable to modify thawed clone");
2216     }
2217     delete thawed;
2218
2219     frozen.set(5, 55);
2220     if(frozen!=idSet || !(frozen==idSet)) {
2221         errln("FAIL: UnicodeSet::set() modified a frozen set");
2222     }
2223
2224     frozen.clear();
2225     if(frozen!=idSet || !(frozen==idSet)) {
2226         errln("FAIL: UnicodeSet::clear() modified a frozen set");
2227     }
2228
2229     frozen.closeOver(USET_CASE_INSENSITIVE);
2230     if(frozen!=idSet || !(frozen==idSet)) {
2231         errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
2232     }
2233
2234     frozen.compact();
2235     if(frozen!=idSet || !(frozen==idSet)) {
2236         errln("FAIL: UnicodeSet::compact() modified a frozen set");
2237     }
2238
2239     ParsePosition pos;
2240     frozen.
2241         applyPattern(wsPattern, errorCode).
2242         applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
2243         applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
2244         applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
2245         applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
2246     if(frozen!=idSet || !(frozen==idSet)) {
2247         errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
2248     }
2249
2250     frozen.
2251         add(0xd800).
2252         add(0xd802, 0xd805).
2253         add(wsPattern).
2254         addAll(idPattern).
2255         addAll(wsSet);
2256     if(frozen!=idSet || !(frozen==idSet)) {
2257         errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
2258     }
2259
2260     frozen.
2261         retain(0x62).
2262         retain(0x64, 0x69).
2263         retainAll(wsPattern).
2264         retainAll(wsSet);
2265     if(frozen!=idSet || !(frozen==idSet)) {
2266         errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
2267     }
2268
2269     frozen.
2270         remove(0x62).
2271         remove(0x64, 0x69).
2272         remove(idPattern).
2273         removeAll(idPattern).
2274         removeAll(idSet);
2275     if(frozen!=idSet || !(frozen==idSet)) {
2276         errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
2277     }
2278
2279     frozen.
2280         complement().
2281         complement(0x62).
2282         complement(0x64, 0x69).
2283         complement(idPattern).
2284         complementAll(idPattern).
2285         complementAll(idSet);
2286     if(frozen!=idSet || !(frozen==idSet)) {
2287         errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
2288     }
2289 }
2290
2291 // Test span() etc. -------------------------------------------------------- ***
2292
2293 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2294 static int32_t
2295 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
2296     UErrorCode errorCode=U_ZERO_ERROR;
2297     int32_t length8=0;
2298     u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2299     if(U_SUCCESS(errorCode)) {
2300         return length8;
2301     } else {
2302         // The string contains an unpaired surrogate.
2303         // Ignore this string.
2304         return 0;
2305     }
2306 }
2307
2308 class UnicodeSetWithStringsIterator;
2309
2310 // Make the strings in a UnicodeSet easily accessible.
2311 class UnicodeSetWithStrings {
2312 public:
2313     UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2314             set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
2315         int32_t size=set.size();
2316         if(size>0 && set.charAt(size-1)<0) {
2317             // If a set's last element is not a code point, then it must contain strings.
2318             // Iterate over the set, skip all code point ranges, and cache the strings.
2319             // Convert them to UTF-8 for spanUTF8().
2320             UnicodeSetIterator iter(set);
2321             const UnicodeString *s;
2322             char *s8=utf8;
2323             int32_t length8, utf8Count=0;
2324             while(iter.nextRange() && stringsLength<LENGTHOF(strings)) {
2325                 if(iter.isString()) {
2326                     // Store the pointer to the set's string element
2327                     // which we happen to know is a stable pointer.
2328                     strings[stringsLength]=s=&iter.getString();
2329                     utf8Count+=
2330                         utf8Lengths[stringsLength]=length8=
2331                         appendUTF8(s->getBuffer(), s->length(),
2332                                    s8, (int32_t)(sizeof(utf8)-utf8Count));
2333                     if(length8==0) {
2334                         hasSurrogates=TRUE;  // Contains unpaired surrogates.
2335                     }
2336                     s8+=length8;
2337                     ++stringsLength;
2338                 }
2339             }
2340         }
2341     }
2342
2343     const UnicodeSet &getSet() const {
2344         return set;
2345     }
2346
2347     UBool hasStrings() const {
2348         return (UBool)(stringsLength>0);
2349     }
2350
2351     UBool hasStringsWithSurrogates() const {
2352         return hasSurrogates;
2353     }
2354
2355 private:
2356     friend class UnicodeSetWithStringsIterator;
2357
2358     const UnicodeSet &set;
2359
2360     const UnicodeString *strings[20];
2361     int32_t stringsLength;
2362     UBool hasSurrogates;
2363
2364     char utf8[1024];
2365     int32_t utf8Lengths[20];
2366
2367     int32_t nextStringIndex;
2368     int32_t nextUTF8Start;
2369 };
2370
2371 class UnicodeSetWithStringsIterator {
2372 public:
2373     UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2374             fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2375     }
2376
2377     void reset() {
2378         nextStringIndex=nextUTF8Start=0;
2379     }
2380
2381     const UnicodeString *nextString() {
2382         if(nextStringIndex<fSet.stringsLength) {
2383             return fSet.strings[nextStringIndex++];
2384         } else {
2385             return NULL;
2386         }
2387     }
2388
2389     // Do not mix with calls to nextString().
2390     const char *nextUTF8(int32_t &length) {
2391         if(nextStringIndex<fSet.stringsLength) {
2392             const char *s8=fSet.utf8+nextUTF8Start;
2393             nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2394             return s8;
2395         } else {
2396             length=0;
2397             return NULL;
2398         }
2399     }
2400
2401 private:
2402     const UnicodeSetWithStrings &fSet;
2403     int32_t nextStringIndex;
2404     int32_t nextUTF8Start;
2405 };
2406
2407 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
2408 // at code point boundaries.
2409 // That is, each edge of a match must not be in the middle of a surrogate pair.
2410 static inline UBool
2411 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
2412     s+=start;
2413     limit-=start;
2414     int32_t length=t.length();
2415     return 0==t.compare(s, length) &&
2416            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2417            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2418 }
2419
2420 // Implement span() with contains() for comparison.
2421 static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2422                                  USetSpanCondition spanCondition) {
2423     const UnicodeSet &realSet(set.getSet());
2424     if(!set.hasStrings()) {
2425         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2426             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2427         }
2428
2429         UChar32 c;
2430         int32_t start=0, prev;
2431         while((prev=start)<length) {
2432             U16_NEXT(s, start, length, c);
2433             if(realSet.contains(c)!=spanCondition) {
2434                 break;
2435             }
2436         }
2437         return prev;
2438     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2439         UnicodeSetWithStringsIterator iter(set);
2440         UChar32 c;
2441         int32_t start, next;
2442         for(start=next=0; start<length;) {
2443             U16_NEXT(s, next, length, c);
2444             if(realSet.contains(c)) {
2445                 break;
2446             }
2447             const UnicodeString *str;
2448             iter.reset();
2449             while((str=iter.nextString())!=NULL) {
2450                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2451                     // spanNeedsStrings=TRUE;
2452                     return start;
2453                 }
2454             }
2455             start=next;
2456         }
2457         return start;
2458     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2459         UnicodeSetWithStringsIterator iter(set);
2460         UChar32 c;
2461         int32_t start, next, maxSpanLimit=0;
2462         for(start=next=0; start<length;) {
2463             U16_NEXT(s, next, length, c);
2464             if(!realSet.contains(c)) {
2465                 next=start;  // Do not span this single, not-contained code point.
2466             }
2467             const UnicodeString *str;
2468             iter.reset();
2469             while((str=iter.nextString())!=NULL) {
2470                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2471                     // spanNeedsStrings=TRUE;
2472                     int32_t matchLimit=start+str->length();
2473                     if(matchLimit==length) {
2474                         return length;
2475                     }
2476                     if(spanCondition==USET_SPAN_CONTAINED) {
2477                         // Iterate for the shortest match at each position.
2478                         // Recurse for each but the shortest match.
2479                         if(next==start) {
2480                             next=matchLimit;  // First match from start.
2481                         } else {
2482                             if(matchLimit<next) {
2483                                 // Remember shortest match from start for iteration.
2484                                 int32_t temp=next;
2485                                 next=matchLimit;
2486                                 matchLimit=temp;
2487                             }
2488                             // Recurse for non-shortest match from start.
2489                             int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
2490                                                                  USET_SPAN_CONTAINED);
2491                             if((matchLimit+spanLength)>maxSpanLimit) {
2492                                 maxSpanLimit=matchLimit+spanLength;
2493                                 if(maxSpanLimit==length) {
2494                                     return length;
2495                                 }
2496                             }
2497                         }
2498                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2499                         if(matchLimit>next) {
2500                             // Remember longest match from start.
2501                             next=matchLimit;
2502                         }
2503                     }
2504                 }
2505             }
2506             if(next==start) {
2507                 break;  // No match from start.
2508             }
2509             start=next;
2510         }
2511         if(start>maxSpanLimit) {
2512             return start;
2513         } else {
2514             return maxSpanLimit;
2515         }
2516     }
2517 }
2518
2519 static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2520                                      USetSpanCondition spanCondition) {
2521     if(length==0) {
2522         return 0;
2523     }
2524     const UnicodeSet &realSet(set.getSet());
2525     if(!set.hasStrings()) {
2526         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2527             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2528         }
2529
2530         UChar32 c;
2531         int32_t prev=length;
2532         do {
2533             U16_PREV(s, 0, length, c);
2534             if(realSet.contains(c)!=spanCondition) {
2535                 break;
2536             }
2537         } while((prev=length)>0);
2538         return prev;
2539     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2540         UnicodeSetWithStringsIterator iter(set);
2541         UChar32 c;
2542         int32_t prev=length, length0=length;
2543         do {
2544             U16_PREV(s, 0, length, c);
2545             if(realSet.contains(c)) {
2546                 break;
2547             }
2548             const UnicodeString *str;
2549             iter.reset();
2550             while((str=iter.nextString())!=NULL) {
2551                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2552                     // spanNeedsStrings=TRUE;
2553                     return prev;
2554                 }
2555             }
2556         } while((prev=length)>0);
2557         return prev;
2558     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2559         UnicodeSetWithStringsIterator iter(set);
2560         UChar32 c;
2561         int32_t prev=length, minSpanStart=length, length0=length;
2562         do {
2563             U16_PREV(s, 0, length, c);
2564             if(!realSet.contains(c)) {
2565                 length=prev;  // Do not span this single, not-contained code point.
2566             }
2567             const UnicodeString *str;
2568             iter.reset();
2569             while((str=iter.nextString())!=NULL) {
2570                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2571                     // spanNeedsStrings=TRUE;
2572                     int32_t matchStart=prev-str->length();
2573                     if(matchStart==0) {
2574                         return 0;
2575                     }
2576                     if(spanCondition==USET_SPAN_CONTAINED) {
2577                         // Iterate for the shortest match at each position.
2578                         // Recurse for each but the shortest match.
2579                         if(length==prev) {
2580                             length=matchStart;  // First match from prev.
2581                         } else {
2582                             if(matchStart>length) {
2583                                 // Remember shortest match from prev for iteration.
2584                                 int32_t temp=length;
2585                                 length=matchStart;
2586                                 matchStart=temp;
2587                             }
2588                             // Recurse for non-shortest match from prev.
2589                             int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
2590                                                                     USET_SPAN_CONTAINED);
2591                             if(spanStart<minSpanStart) {
2592                                 minSpanStart=spanStart;
2593                                 if(minSpanStart==0) {
2594                                     return 0;
2595                                 }
2596                             }
2597                         }
2598                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2599                         if(matchStart<length) {
2600                             // Remember longest match from prev.
2601                             length=matchStart;
2602                         }
2603                     }
2604                 }
2605             }
2606             if(length==prev) {
2607                 break;  // No match from prev.
2608             }
2609         } while((prev=length)>0);
2610         if(prev<minSpanStart) {
2611             return prev;
2612         } else {
2613             return minSpanStart;
2614         }
2615     }
2616 }
2617
2618 static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2619                                 USetSpanCondition spanCondition) {
2620     const UnicodeSet &realSet(set.getSet());
2621     if(!set.hasStrings()) {
2622         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2623             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2624         }
2625
2626         UChar32 c;
2627         int32_t start=0, prev;
2628         while((prev=start)<length) {
2629             U8_NEXT_OR_FFFD(s, start, length, c);
2630             if(realSet.contains(c)!=spanCondition) {
2631                 break;
2632             }
2633         }
2634         return prev;
2635     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2636         UnicodeSetWithStringsIterator iter(set);
2637         UChar32 c;
2638         int32_t start, next;
2639         for(start=next=0; start<length;) {
2640             U8_NEXT_OR_FFFD(s, next, length, c);
2641             if(realSet.contains(c)) {
2642                 break;
2643             }
2644             const char *s8;
2645             int32_t length8;
2646             iter.reset();
2647             while((s8=iter.nextUTF8(length8))!=NULL) {
2648                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2649                     // spanNeedsStrings=TRUE;
2650                     return start;
2651                 }
2652             }
2653             start=next;
2654         }
2655         return start;
2656     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2657         UnicodeSetWithStringsIterator iter(set);
2658         UChar32 c;
2659         int32_t start, next, maxSpanLimit=0;
2660         for(start=next=0; start<length;) {
2661             U8_NEXT_OR_FFFD(s, next, length, c);
2662             if(!realSet.contains(c)) {
2663                 next=start;  // Do not span this single, not-contained code point.
2664             }
2665             const char *s8;
2666             int32_t length8;
2667             iter.reset();
2668             while((s8=iter.nextUTF8(length8))!=NULL) {
2669                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2670                     // spanNeedsStrings=TRUE;
2671                     int32_t matchLimit=start+length8;
2672                     if(matchLimit==length) {
2673                         return length;
2674                     }
2675                     if(spanCondition==USET_SPAN_CONTAINED) {
2676                         // Iterate for the shortest match at each position.
2677                         // Recurse for each but the shortest match.
2678                         if(next==start) {
2679                             next=matchLimit;  // First match from start.
2680                         } else {
2681                             if(matchLimit<next) {
2682                                 // Remember shortest match from start for iteration.
2683                                 int32_t temp=next;
2684                                 next=matchLimit;
2685                                 matchLimit=temp;
2686                             }
2687                             // Recurse for non-shortest match from start.
2688                             int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
2689                                                                 USET_SPAN_CONTAINED);
2690                             if((matchLimit+spanLength)>maxSpanLimit) {
2691                                 maxSpanLimit=matchLimit+spanLength;
2692                                 if(maxSpanLimit==length) {
2693                                     return length;
2694                                 }
2695                             }
2696                         }
2697                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2698                         if(matchLimit>next) {
2699                             // Remember longest match from start.
2700                             next=matchLimit;
2701                         }
2702                     }
2703                 }
2704             }
2705             if(next==start) {
2706                 break;  // No match from start.
2707             }
2708             start=next;
2709         }
2710         if(start>maxSpanLimit) {
2711             return start;
2712         } else {
2713             return maxSpanLimit;
2714         }
2715     }
2716 }
2717
2718 static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2719                                     USetSpanCondition spanCondition) {
2720     if(length==0) {
2721         return 0;
2722     }
2723     const UnicodeSet &realSet(set.getSet());
2724     if(!set.hasStrings()) {
2725         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2726             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2727         }
2728
2729         UChar32 c;
2730         int32_t prev=length;
2731         do {
2732             U8_PREV_OR_FFFD(s, 0, length, c);
2733             if(realSet.contains(c)!=spanCondition) {
2734                 break;
2735             }
2736         } while((prev=length)>0);
2737         return prev;
2738     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2739         UnicodeSetWithStringsIterator iter(set);
2740         UChar32 c;
2741         int32_t prev=length;
2742         do {
2743             U8_PREV_OR_FFFD(s, 0, length, c);
2744             if(realSet.contains(c)) {
2745                 break;
2746             }
2747             const char *s8;
2748             int32_t length8;
2749             iter.reset();
2750             while((s8=iter.nextUTF8(length8))!=NULL) {
2751                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2752                     // spanNeedsStrings=TRUE;
2753                     return prev;
2754                 }
2755             }
2756         } while((prev=length)>0);
2757         return prev;
2758     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2759         UnicodeSetWithStringsIterator iter(set);
2760         UChar32 c;
2761         int32_t prev=length, minSpanStart=length;
2762         do {
2763             U8_PREV_OR_FFFD(s, 0, length, c);
2764             if(!realSet.contains(c)) {
2765                 length=prev;  // Do not span this single, not-contained code point.
2766             }
2767             const char *s8;
2768             int32_t length8;
2769             iter.reset();
2770             while((s8=iter.nextUTF8(length8))!=NULL) {
2771                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2772                     // spanNeedsStrings=TRUE;
2773                     int32_t matchStart=prev-length8;
2774                     if(matchStart==0) {
2775                         return 0;
2776                     }
2777                     if(spanCondition==USET_SPAN_CONTAINED) {
2778                         // Iterate for the shortest match at each position.
2779                         // Recurse for each but the shortest match.
2780                         if(length==prev) {
2781                             length=matchStart;  // First match from prev.
2782                         } else {
2783                             if(matchStart>length) {
2784                                 // Remember shortest match from prev for iteration.
2785                                 int32_t temp=length;
2786                                 length=matchStart;
2787                                 matchStart=temp;
2788                             }
2789                             // Recurse for non-shortest match from prev.
2790                             int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
2791                                                                    USET_SPAN_CONTAINED);
2792                             if(spanStart<minSpanStart) {
2793                                 minSpanStart=spanStart;
2794                                 if(minSpanStart==0) {
2795                                     return 0;
2796                                 }
2797                             }
2798                         }
2799                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2800                         if(matchStart<length) {
2801                             // Remember longest match from prev.
2802                             length=matchStart;
2803                         }
2804                     }
2805                 }
2806             }
2807             if(length==prev) {
2808                 break;  // No match from prev.
2809             }
2810         } while((prev=length)>0);
2811         if(prev<minSpanStart) {
2812             return prev;
2813         } else {
2814             return minSpanStart;
2815         }
2816     }
2817 }
2818
// Bit flags selecting which spans are performed and compared.
// Each group has one bit per option plus a mask that combines the group's options.
enum {
    // Text encoding.
    SPAN_UTF16          =1,
    SPAN_UTF8           =2,
    SPAN_UTFS           =SPAN_UTF16|SPAN_UTF8,

    // Original set and/or its complement.
    SPAN_SET            =4,
    SPAN_COMPLEMENT     =8,
    SPAN_POLARITY       =SPAN_SET|SPAN_COMPLEMENT,

    // Direction.
    SPAN_FWD            =0x10,
    SPAN_BACK           =0x20,
    SPAN_DIRS           =SPAN_FWD|SPAN_BACK,

    // Span condition used for the "contained" spans.
    SPAN_CONTAINED      =0x100,
    SPAN_SIMPLE         =0x200,
    SPAN_CONDITION      =SPAN_CONTAINED|SPAN_SIMPLE,

    // Everything.
    SPAN_ALL            =SPAN_UTFS|SPAN_POLARITY|SPAN_DIRS|SPAN_CONDITION
};
2839
2840 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
2841     return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
2842 }
2843
2844 static inline int32_t slen(const void *s, UBool isUTF16) {
2845     return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
2846 }
2847
2848 /*
2849  * Count spans on a string with the method according to type and set the span limits.
2850  * The set may be the complement of the original.
2851  * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
2852  * according to the expected number of spans.
2853  * Sets typeName to an empty string if there is no such type.
2854  * Returns -1 if the span option is filtered out.
2855  */
2856 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
2857                         const void *s, int32_t length, UBool isUTF16,
2858                         uint32_t whichSpans,
2859                         int type, const char *&typeName,
2860                         int32_t limits[], int32_t limitsCapacity,
2861                         int32_t expectCount) {
2862     const UnicodeSet &realSet(set.getSet());
2863     int32_t start, count;
2864     USetSpanCondition spanCondition, firstSpanCondition, contained;
2865     UBool isForward;
2866
2867     if(type<0 || 7<type) {
2868         typeName="";
2869         return 0;
2870     }
2871
2872     static const char *const typeNames16[]={
2873         "contains", "contains(LM)",
2874         "span", "span(LM)",
2875         "containsBack", "containsBack(LM)",
2876         "spanBack", "spanBack(LM)"
2877     };
2878
2879     static const char *const typeNames8[]={
2880         "containsUTF8", "containsUTF8(LM)",
2881         "spanUTF8", "spanUTF8(LM)",
2882         "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
2883         "spanBackUTF8", "spanBackUTF8(LM)"
2884     };
2885
2886     typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
2887
2888     // filter span options
2889     if(type<=3) {
2890         // span forward
2891         if((whichSpans&SPAN_FWD)==0) {
2892             return -1;
2893         }
2894         isForward=TRUE;
2895     } else {
2896         // span backward
2897         if((whichSpans&SPAN_BACK)==0) {
2898             return -1;
2899         }
2900         isForward=FALSE;
2901     }
2902     if((type&1)==0) {
2903         // use USET_SPAN_CONTAINED
2904         if((whichSpans&SPAN_CONTAINED)==0) {
2905             return -1;
2906         }
2907         contained=USET_SPAN_CONTAINED;
2908     } else {
2909         // use USET_SPAN_SIMPLE
2910         if((whichSpans&SPAN_SIMPLE)==0) {
2911             return -1;
2912         }
2913         contained=USET_SPAN_SIMPLE;
2914     }
2915
2916     // Default first span condition for going forward with an uncomplemented set.
2917     spanCondition=USET_SPAN_NOT_CONTAINED;
2918     if(isComplement) {
2919         spanCondition=invertSpanCondition(spanCondition, contained);
2920     }
2921
2922     // First span condition for span(), used to terminate the spanBack() iteration.
2923     firstSpanCondition=spanCondition;
2924
2925     // spanBack(): Its initial span condition is span()'s last span condition,
2926     // which is the opposite of span()'s first span condition
2927     // if we expect an even number of spans.
2928     // (The loop inverts spanCondition (expectCount-1) times
2929     // before the expectCount'th span() call.)
2930     // If we do not compare forward and backward directions, then we do not have an
2931     // expectCount and just start with firstSpanCondition.
2932     if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
2933         spanCondition=invertSpanCondition(spanCondition, contained);
2934     }
2935
2936     count=0;
2937     switch(type) {
2938     case 0:
2939     case 1:
2940         start=0;
2941         if(length<0) {
2942             length=slen(s, isUTF16);
2943         }
2944         for(;;) {
2945             start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
2946                               containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
2947             if(count<limitsCapacity) {
2948                 limits[count]=start;
2949             }
2950             ++count;
2951             if(start>=length) {
2952                 break;
2953             }
2954             spanCondition=invertSpanCondition(spanCondition, contained);
2955         }
2956         break;
2957     case 2:
2958     case 3:
2959         start=0;
2960         for(;;) {
2961             start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
2962                               realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
2963             if(count<limitsCapacity) {
2964                 limits[count]=start;
2965             }
2966             ++count;
2967             if(length>=0 ? start>=length :
2968                            isUTF16 ? ((const UChar *)s)[start]==0 :
2969                                      ((const char *)s)[start]==0
2970             ) {
2971                 break;
2972             }
2973             spanCondition=invertSpanCondition(spanCondition, contained);
2974         }
2975         break;
2976     case 4:
2977     case 5:
2978         if(length<0) {
2979             length=slen(s, isUTF16);
2980         }
2981         for(;;) {
2982             ++count;
2983             if(count<=limitsCapacity) {
2984                 limits[limitsCapacity-count]=length;
2985             }
2986             length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
2987                               containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
2988             if(length==0 && spanCondition==firstSpanCondition) {
2989                 break;
2990             }
2991             spanCondition=invertSpanCondition(spanCondition, contained);
2992         }
2993         if(count<limitsCapacity) {
2994             memmove(limits, limits+(limitsCapacity-count), count*4);
2995         }
2996         break;
2997     case 6:
2998     case 7:
2999         for(;;) {
3000             ++count;
3001             if(count<=limitsCapacity) {
3002                 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
3003             }
3004             // Note: Length<0 is tested only for the first spanBack().
3005             // If we wanted to keep length<0 for all spanBack()s, we would have to
3006             // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
3007             length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
3008                               realSet.spanBackUTF8((const char *)s, length, spanCondition);
3009             if(length==0 && spanCondition==firstSpanCondition) {
3010                 break;
3011             }
3012             spanCondition=invertSpanCondition(spanCondition, contained);
3013         }
3014         if(count<limitsCapacity) {
3015             memmove(limits, limits+(limitsCapacity-count), count*4);
3016         }
3017         break;
3018     default:
3019         typeName="";
3020         return -1;
3021     }
3022
3023     return count;
3024 }
3025
// Indexes of the sets to be tested; an odd index means that the set is a complement.
enum {
    SLOW        =0,
    SLOW_NOT    =1,
    FAST        =2,
    FAST_NOT    =3,
    SET_COUNT   =4
};

// Names for failure messages, indexed by the set indexes above.
static const char *const setNames[SET_COUNT]={
    /* SLOW */      "slow",
    /* SLOW_NOT */  "slow.not",
    /* FAST */      "fast",
    /* FAST_NOT */  "fast.not"
};
3041
3042 /*
3043  * Verify that we get the same results whether we look at text with contains(),
3044  * span() or spanBack(), using unfrozen or frozen versions of the set,
3045  * and using the set or its complement (switching the spanConditions accordingly).
3046  * The latter verifies that
3047  *   set.span(spanCondition) == set.complement().span(!spanCondition).
3048  *
3049  * The expectLimits[] are either provided by the caller (with expectCount>=0)
3050  * or returned to the caller (with an input expectCount<0).
3051  */
3052 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3053                               const void *s, int32_t length, UBool isUTF16,
3054                               uint32_t whichSpans,
3055                               int32_t expectLimits[], int32_t &expectCount,
3056                               const char *testName, int32_t index) {
3057     int32_t limits[500];
3058     int32_t limitsCount;
3059     int i, j;
3060
3061     const char *typeName;
3062     int type;
3063
3064     for(i=0; i<SET_COUNT; ++i) {
3065         if((i&1)==0) {
3066             // Even-numbered sets are original, uncomplemented sets.
3067             if((whichSpans&SPAN_SET)==0) {
3068                 continue;
3069             }
3070         } else {
3071             // Odd-numbered sets are complemented.
3072             if((whichSpans&SPAN_COMPLEMENT)==0) {
3073                 continue;
3074             }
3075         }
3076         for(type=0;; ++type) {
3077             limitsCount=getSpans(*sets[i], (UBool)(i&1),
3078                                  s, length, isUTF16,
3079                                  whichSpans,
3080                                  type, typeName,
3081                                  limits, LENGTHOF(limits), expectCount);
3082             if(typeName[0]==0) {
3083                 break; // All types tried.
3084             }
3085             if(limitsCount<0) {
3086                 continue; // Span option filtered out.
3087             }
3088             if(expectCount<0) {
3089                 expectCount=limitsCount;
3090                 if(limitsCount>LENGTHOF(limits)) {
3091                     errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3092                           testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)LENGTHOF(limits));
3093                     return;
3094                 }
3095                 memcpy(expectLimits, limits, limitsCount*4);
3096             } else if(limitsCount!=expectCount) {
3097                 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3098                       testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
3099             } else {
3100                 for(j=0; j<limitsCount; ++j) {
3101                     if(limits[j]!=expectLimits[j]) {
3102                         errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3103                               testName, (long)index, setNames[i], typeName, (long)limitsCount,
3104                               j, (long)limits[j], (long)expectLimits[j]);
3105                         break;
3106                     }
3107                 }
3108             }
3109         }
3110     }
3111
3112     // Compare span() with containsAll()/containsNone(),
3113     // but only if we have expectLimits[] from the uncomplemented set.
3114     if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
3115         const UChar *s16=(const UChar *)s;
3116         UnicodeString string;
3117         int32_t prev=0, limit, length;
3118         for(i=0; i<expectCount; ++i) {
3119             limit=expectLimits[i];
3120             length=limit-prev;
3121             if(length>0) {
3122                 string.setTo(FALSE, s16+prev, length);  // read-only alias
3123                 if(i&1) {
3124                     if(!sets[SLOW]->getSet().containsAll(string)) {
3125                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3126                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3127                         return;
3128                     }
3129                     if(!sets[FAST]->getSet().containsAll(string)) {
3130                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3131                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3132                         return;
3133                     }
3134                 } else {
3135                     if(!sets[SLOW]->getSet().containsNone(string)) {
3136                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3137                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3138                         return;
3139                     }
3140                     if(!sets[FAST]->getSet().containsNone(string)) {
3141                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3142                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3143                         return;
3144                     }
3145                 }
3146             }
3147             prev=limit;
3148         }
3149     }
3150 }
3151
3152 // Specifically test either UTF-16 or UTF-8.
3153 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3154                               const void *s, int32_t length, UBool isUTF16,
3155                               uint32_t whichSpans,
3156                               const char *testName, int32_t index) {
3157     int32_t expectLimits[500];
3158     int32_t expectCount=-1;
3159     testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3160 }
3161
3162 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
3163     UChar c, c2;
3164
3165     if(length>=0) {
3166         while(length>0) {
3167             c=*s++;
3168             --length;
3169             if(0xd800<=c && c<0xe000) {
3170                 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3171                     return TRUE;
3172                 }
3173                 --length;
3174             }
3175         }
3176     } else {
3177         while((c=*s++)!=0) {
3178             if(0xd800<=c && c<0xe000) {
3179                 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3180                     return TRUE;
3181                 }
3182             }
3183         }
3184     }
3185     return FALSE;
3186 }
3187
3188 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3189 // unless either UTF is turned off in whichSpans.
3190 // Testing UTF-16 and UTF-8 together requires that surrogate code points
3191 // have the same contains(c) value as U+FFFD.
3192 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3193                                       const UChar *s16, int32_t length16,
3194                                       uint32_t whichSpans,
3195                                       const char *testName, int32_t index) {
3196     int32_t expectLimits[500];
3197     int32_t expectCount;
3198
3199     expectCount=-1;  // Get expectLimits[] from testSpan().
3200
3201     if((whichSpans&SPAN_UTF16)!=0) {
3202         testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
3203     }
3204     if((whichSpans&SPAN_UTF8)==0) {
3205         return;
3206     }
3207
3208     // Convert s16[] and expectLimits[] to UTF-8.
3209     uint8_t s8[3000];
3210     int32_t offsets[3000];
3211
3212     const UChar *s16Limit=s16+length16;
3213     char *t=(char *)s8;
3214     char *tLimit=t+sizeof(s8);
3215     int32_t *o=offsets;
3216     UErrorCode errorCode=U_ZERO_ERROR;
3217
3218     // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3219     ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
3220     if(U_FAILURE(errorCode)) {
3221         errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3222               testName, (long)index, u_errorName(errorCode));
3223         ucnv_resetFromUnicode(utf8Cnv);
3224         return;
3225     }
3226     int32_t length8=(int32_t)(t-(char *)s8);
3227
3228     // Convert expectLimits[].
3229     int32_t i, j, expect;
3230     for(i=j=0; i<expectCount; ++i) {
3231         expect=expectLimits[i];
3232         if(expect==length16) {
3233             expectLimits[i]=length8;
3234         } else {
3235             while(offsets[j]<expect) {
3236                 ++j;
3237             }
3238             expectLimits[i]=j;
3239         }
3240     }
3241
3242     testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
3243 }
3244
3245 static UChar32 nextCodePoint(UChar32 c) {
3246     // Skip some large and boring ranges.
3247     switch(c) {
3248     case 0x3441:
3249         return 0x4d7f;
3250     case 0x5100:
3251         return 0x9f00;
3252     case 0xb040:
3253         return 0xd780;
3254     case 0xe041:
3255         return 0xf8fe;
3256     case 0x10100:
3257         return 0x20000;
3258     case 0x20041:
3259         return 0xe0000;
3260     case 0xe0101:
3261         return 0x10fffd;
3262     default:
3263         return c+1;
3264     }
3265 }
3266
3267 // Verify that all implementations represent the same set.
3268 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3269     // contains(U+FFFD) is inconsistent with contains(some surrogates),
3270     // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3271     // Skip the UTF-8 part of the test - if the string contains surrogates -
3272     // because it is likely to produce a different result.
3273     UBool inconsistentSurrogates=
3274             (!(sets[0]->getSet().contains(0xfffd) ?
3275                sets[0]->getSet().contains(0xd800, 0xdfff) :
3276                sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3277              sets[0]->hasStringsWithSurrogates());
3278
3279     UChar s[1000];
3280     int32_t length=0;
3281     uint32_t localWhichSpans;
3282
3283     UChar32 c, first;
3284     for(first=c=0;; c=nextCodePoint(c)) {
3285         if(c>0x10ffff || length>(LENGTHOF(s)-U16_MAX_LENGTH)) {
3286             localWhichSpans=whichSpans;
3287             if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3288                 localWhichSpans&=~SPAN_UTF8;
3289             }
3290             testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3291             if(c>0x10ffff) {
3292                 break;
3293             }
3294             length=0;
3295             first=c;
3296         }
3297         U16_APPEND_UNSAFE(s, length, c);
3298     }
3299 }
3300
3301 // Test with a particular, interesting string.
3302 // Specify length and try NUL-termination.
3303 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3304     static const UChar s[]={
3305         0x61, 0x62, 0x20,                       // Latin, space
3306         0x3b1, 0x3b2, 0x3b3,                    // Greek
3307         0xd900,                                 // lead surrogate
3308         0x3000, 0x30ab, 0x30ad,                 // wide space, Katakana
3309         0xdc05,                                 // trail surrogate
3310         0xa0, 0xac00, 0xd7a3,                   // nbsp, Hangul
3311         0xd900, 0xdc05,                         // unassigned supplementary
3312         0xd840, 0xdfff, 0xd860, 0xdffe,         // Han supplementary
3313         0xd7a4, 0xdc05, 0xd900, 0x2028,         // unassigned, surrogates in wrong order, LS
3314         0                                       // NUL
3315     };
3316
3317     if((whichSpans&SPAN_UTF16)==0) {
3318         return;
3319     }
3320     testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
3321     testSpan(sets, s, LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
3322 }
3323
3324 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3325     static const char s[]={
3326         "abc"                                   // Latin
3327
3328         /* trail byte in lead position */
3329         "\x80"
3330
3331         " "                                     // space
3332
3333         /* truncated multi-byte sequences */
3334         "\xd0"
3335         "\xe0"
3336         "\xe1"
3337         "\xed"
3338         "\xee"
3339         "\xf0"
3340         "\xf1"
3341         "\xf4"
3342         "\xf8"
3343         "\xfc"
3344
3345         "\xCE\xB1\xCE\xB2\xCE\xB3"              // Greek
3346
3347         /* trail byte in lead position */
3348         "\x80"
3349
3350         "\xe0\x80"
3351         "\xe0\xa0"
3352         "\xe1\x80"
3353         "\xed\x80"
3354         "\xed\xa0"
3355         "\xee\x80"
3356         "\xf0\x80"
3357         "\xf0\x90"
3358         "\xf1\x80"
3359         "\xf4\x80"
3360         "\xf4\x90"
3361         "\xf8\x80"
3362         "\xfc\x80"
3363
3364         "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD"  // wide space, Katakana
3365
3366         /* trail byte in lead position */
3367         "\x80"
3368
3369         "\xf0\x80\x80"
3370         "\xf0\x90\x80"
3371         "\xf1\x80\x80"
3372         "\xf4\x80\x80"
3373         "\xf4\x90\x80"
3374         "\xf8\x80\x80"
3375         "\xfc\x80\x80"
3376
3377         "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3"      // nbsp, Hangul
3378
3379         /* trail byte in lead position */
3380         "\x80"
3381
3382         "\xf8\x80\x80\x80"
3383         "\xfc\x80\x80\x80"
3384
3385         "\xF1\x90\x80\x85"                      // unassigned supplementary
3386
3387         /* trail byte in lead position */
3388         "\x80"
3389
3390         "\xfc\x80\x80\x80\x80"
3391
3392         "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE"      // Han supplementary
3393
3394         /* trail byte in lead position */
3395         "\x80"
3396
3397         /* complete sequences but non-shortest forms or out of range etc. */
3398         "\xc0\x80"
3399         "\xe0\x80\x80"
3400         "\xed\xa0\x80"
3401         "\xf0\x80\x80\x80"
3402         "\xf4\x90\x80\x80"
3403         "\xf8\x80\x80\x80\x80"
3404         "\xfc\x80\x80\x80\x80\x80"
3405         "\xfe"
3406         "\xff"
3407
3408         /* trail byte in lead position */
3409         "\x80"
3410
3411         "\xED\x9E\xA4\xE2\x80\xA8"              // unassigned, LS, NUL-terminated
3412     };
3413
3414     if((whichSpans&SPAN_UTF8)==0) {
3415         return;
3416     }
3417     testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
3418     testSpan(sets, s, LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
3419 }
3420
// Multiply a list of span option words so that each resulting entry
// carries exactly one of the alternatives a, b and c.
// Every existing entry is first reduced to (entry & mask).
// - The entry itself becomes (masked | a).
// - If b!=0, a copy (masked | b) is stored whichSpansCount slots further on.
// - If b!=0 and c!=0, another copy (masked | c) is stored
//   2*whichSpansCount slots further on.
// c is ignored when b==0.
// The caller must provide an array large enough for the multiplied list.
// Returns the new number of entries: 1, 2 or 3 times whichSpansCount.
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    const bool withB=(b!=0);
    const bool withC=withB && (c!=0);

    for(int32_t idx=0; idx<whichSpansCount; ++idx) {
        const uint32_t kept=whichSpans[idx]&mask;
        whichSpans[idx]=kept|a;
        if(!withB) {
            continue;
        }
        whichSpans[whichSpansCount+idx]=kept|b;
        if(withC) {
            whichSpans[2*whichSpansCount+idx]=kept|c;
        }
    }

    if(!withB) {
        return whichSpansCount;
    }
    return withC ? 3*whichSpansCount : 2*whichSpansCount;
}
3443
3444 #define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
3445 #define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
3446 #define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3447 #define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3448
3449 void UnicodeSetTest::TestSpan() {
3450     // "[...]" is a UnicodeSet pattern.
3451     // "*" performs tests on all Unicode code points and on a selection of
3452     //   malformed UTF-8/16 strings.
3453     // "-options" limits the scope of testing for the current set.
3454     //   By default, the test verifies that equivalent boundaries are found
3455     //   for UTF-16 and UTF-8, going forward and backward,
3456     //   alternating USET_SPAN_NOT_CONTAINED with
3457     //   either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3458     //   Single-character options:
3459     //     8 -- UTF-16 and UTF-8 boundaries may differ.
3460     //          Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3461     //          or the set contains strings with unpaired surrogates
3462     //          which do not translate to valid UTF-8.
3463     //     c -- set.span() and set.complement().span() boundaries may differ.
3464     //          Cause: Set strings are not complemented.
3465     //     b -- span() and spanBack() boundaries may differ.
3466     //          Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3467     //          and spanBack(USET_SPAN_SIMPLE) are defined to
3468     //          match with non-overlapping substrings.
3469     //          For example, with a set containing "ab" and "ba",
3470     //          span() of "aba" yields boundaries { 0, 2, 3 }
3471     //          because the initial "ab" matches from 0 to 2,
3472     //          while spanBack() yields boundaries { 0, 1, 3 }
3473     //          because the final "ba" matches from 1 to 3.
3474     //     l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3475     //          Cause: Strings in the set overlap, and a longer match may
3476     //          require a sequence including non-longest substrings.
3477     //          For example, with a set containing "ab", "abc" and "cd",
3478     //          span(contained) of "abcd" spans the entire string
3479     //          but span(longest match) only spans the first 3 characters.
3480     //   Each "-options" first resets all options and then applies the specified options.
3481     //   A "-" without options resets the options.
3482     //   The options are also reset for each new set.
3483     // Other strings will be spanned.
3484     static const char *const testdata[]={
3485         "[:ID_Continue:]",
3486         "*",
3487         "[:White_Space:]",
3488         "*",
3489         "[]",
3490         "*",
3491         "[\\u0000-\\U0010FFFF]",
3492         "*",
3493         "[\\u0000\\u0080\\u0800\\U00010000]",
3494         "*",
3495         "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3496         "*",
3497         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3498         "-c",
3499         "*",
3500         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3501         "-c",
3502         "*",
3503
3504         // Overlapping strings cause overlapping attempts to match.
3505         "[x{xy}{xya}{axy}{ax}]",
3506         "-cl",
3507
3508         // More repetitions of "xya" would take too long with the recursive
3509         // reference implementation.
3510         // containsAll()=FALSE
3511         // test_string 0x14
3512         "xx"
3513         "xyaxyaxyaxya"  // set.complement().span(longest match) will stop here.
3514         "xx"            // set.complement().span(contained) will stop between the two 'x'es.
3515         "xyaxyaxyaxya"
3516         "xx"
3517         "xyaxyaxyaxya"  // span() ends here.
3518         "aaa",
3519
3520         // containsAll()=TRUE
3521         // test_string 0x15
3522         "xx"
3523         "xyaxyaxyaxya"
3524         "xx"
3525         "xyaxyaxyaxya"
3526         "xx"
3527         "xyaxyaxyaxy",
3528
3529         "-bc",
3530         // test_string 0x17
3531         "byayaxya",  // span() -> { 4, 7, 8 }  spanBack() -> { 5, 8 }
3532         "-c",
3533         "byayaxy",   // span() -> { 4, 7 }     complement.span() -> { 7 }
3534         "byayax",    // span() -> { 4, 6 }     complement.span() -> { 6 }
3535         "-",
3536         "byaya",     // span() -> { 5 }
3537         "byay",      // span() -> { 4 }
3538         "bya",       // span() -> { 3 }
3539
3540         // span(longest match) will not span the whole string.
3541         "[a{ab}{bc}]",
3542         "-cl",
3543         // test_string 0x21
3544         "abc",
3545
3546         "[a{ab}{abc}{cd}]",
3547         "-cl",
3548         "acdabcdabccd",
3549
3550         // spanBack(longest match) will not span the whole string.
3551         "[c{ab}{bc}]",
3552         "-cl",
3553         "abc",
3554
3555         "[d{cd}{bcd}{ab}]",
3556         "-cl",
3557         "abbcdabcdabd",
3558
3559         // Test with non-ASCII set strings - test proper handling of surrogate pairs
3560         // and UTF-8 trail bytes.
3561         // Copies of above test sets and strings, but transliterated to have
3562         // different code points with similar trail units.
3563         // Previous: a      b         c            d
3564         // Unicode:  042B   30AB      200AB        204AB
3565         // UTF-16:   042B   30AB      D840 DCAB    D841 DCAB
3566         // UTF-8:    D0 AB  E3 82 AB  F0 A0 82 AB  F0 A0 92 AB
3567         "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3568         "-cl",
3569         "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3570
3571         "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3572         "-cl",
3573         "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3574
3575         // Stress bookkeeping and recursion.
3576         // The following strings are barely doable with the recursive
3577         // reference implementation.
3578         // The not-contained character at the end prevents an early exit from the span().
3579         "[b{bb}]",
3580         "-c",
3581         // test_string 0x33
3582         "bbbbbbbbbbbbbbbbbbbbbbbb-",
3583         // On complement sets, span() and spanBack() get different results
3584         // because b is not in the complement set and there is an odd number of b's
3585         // in the test string.
3586         "-bc",
3587         "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3588
3589         // Test with set strings with an initial or final code point span
3590         // longer than 254.
3591         "[a{" _64_a _64_a _64_a _64_a "b}"
3592           "{a" _64_b _64_b _64_b _64_b "}]",
3593         "-c",
3594         _64_a _64_a _64_a _63_a "b",
3595         _64_a _64_a _64_a _64_a "b",
3596         _64_a _64_a _64_a _64_a "aaaabbbb",
3597         "a" _64_b _64_b _64_b _63_b,
3598         "a" _64_b _64_b _64_b _64_b,
3599         "aaaabbbb" _64_b _64_b _64_b _64_b,
3600
3601         // Test with strings containing unpaired surrogates.
3602         // They are not representable in UTF-8, and a leading trail surrogate
3603         // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3604         // U+20001 == \\uD840\\uDC01
3605         // U+20400 == \\uD841\\uDC00
3606         "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3607         "-8cl",
3608         "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3609     };
3610     uint32_t whichSpans[96]={ SPAN_ALL };
3611     int32_t whichSpansCount=1;
3612
3613     UnicodeSet *sets[SET_COUNT]={ NULL };
3614     const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
3615
3616     char testName[1024];
3617     char *testNameLimit=testName;
3618
3619     int32_t i, j;
3620     for(i=0; i<LENGTHOF(testdata); ++i) {
3621         const char *s=testdata[i];
3622         if(s[0]=='[') {
3623             // Create new test sets from this pattern.
3624             for(j=0; j<SET_COUNT; ++j) {
3625                 delete sets_with_str[j];
3626                 delete sets[j];
3627             }
3628             UErrorCode errorCode=U_ZERO_ERROR;
3629             sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3630             if(U_FAILURE(errorCode)) {
3631                 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
3632                 break;
3633             }
3634             sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3635             sets[SLOW_NOT]->complement();
3636             // Intermediate set: Test cloning of a frozen set.
3637             UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3638             fast->freeze();
3639             sets[FAST]=(UnicodeSet *)fast->clone();
3640             delete fast;
3641             UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3642             fastNot->freeze();
3643             sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
3644             delete fastNot;
3645
3646             for(j=0; j<SET_COUNT; ++j) {
3647                 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3648             }
3649
3650             strcpy(testName, s);
3651             testNameLimit=strchr(testName, 0);
3652             *testNameLimit++=':';
3653             *testNameLimit=0;
3654
3655             whichSpans[0]=SPAN_ALL;
3656             whichSpansCount=1;
3657         } else if(s[0]=='-') {
3658             whichSpans[0]=SPAN_ALL;
3659             whichSpansCount=1;
3660
3661             while(*++s!=0) {
3662                 switch(*s) {
3663                 case 'c':
3664                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3665                                                    ~SPAN_POLARITY,
3666                                                    SPAN_SET,
3667                                                    SPAN_COMPLEMENT,
3668                                                    0);
3669                     break;
3670                 case 'b':
3671                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3672                                                    ~SPAN_DIRS,
3673                                                    SPAN_FWD,
3674                                                    SPAN_BACK,
3675                                                    0);
3676                     break;
3677                 case 'l':
3678                     // test USET_SPAN_CONTAINED FWD & BACK, and separately
3679                     // USET_SPAN_SIMPLE only FWD, and separately
3680                     // USET_SPAN_SIMPLE only BACK
3681                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3682                                                    ~(SPAN_DIRS|SPAN_CONDITION),
3683                                                    SPAN_DIRS|SPAN_CONTAINED,
3684                                                    SPAN_FWD|SPAN_SIMPLE,
3685                                                    SPAN_BACK|SPAN_SIMPLE);
3686                     break;
3687                 case '8':
3688                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3689                                                    ~SPAN_UTFS,
3690                                                    SPAN_UTF16,
3691                                                    SPAN_UTF8,
3692                                                    0);
3693                     break;
3694                 default:
3695                     errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3696                     break;
3697                 }
3698             }
3699         } else if(0==strcmp(s, "*")) {
3700             strcpy(testNameLimit, "bad_string");
3701             for(j=0; j<whichSpansCount; ++j) {
3702                 if(whichSpansCount>1) {
3703                     sprintf(testNameLimit+10 /* strlen("bad_string") */,
3704                             "%%0x%3x",
3705                             whichSpans[j]);
3706                 }
3707                 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3708                 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3709             }
3710
3711             strcpy(testNameLimit, "contents");
3712             for(j=0; j<whichSpansCount; ++j) {
3713                 if(whichSpansCount>1) {
3714                     sprintf(testNameLimit+8 /* strlen("contents") */,
3715                             "%%0x%3x",
3716                             whichSpans[j]);
3717                 }
3718                 testSpanContents(sets_with_str, whichSpans[j], testName);
3719             }
3720         } else {
3721             UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3722             strcpy(testNameLimit, "test_string");
3723             for(j=0; j<whichSpansCount; ++j) {
3724                 if(whichSpansCount>1) {
3725                     sprintf(testNameLimit+11 /* strlen("test_string") */,
3726                             "%%0x%3x",
3727                             whichSpans[j]);
3728                 }
3729                 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3730             }
3731         }
3732     }
3733     for(j=0; j<SET_COUNT; ++j) {
3734         delete sets_with_str[j];
3735         delete sets[j];
3736     }
3737 }
3738
3739 // Test select patterns and strings, and test USET_SPAN_SIMPLE.
3740 void UnicodeSetTest::TestStringSpan() {
3741     static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3742     static const char *const string=
3743         "xx"
3744         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3745         "xx"
3746         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3747         "xx"
3748         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3749         "aaaa";
3750
3751     UErrorCode errorCode=U_ZERO_ERROR;
3752     UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3753     UnicodeSet set(pattern16, errorCode);
3754     if(U_FAILURE(errorCode)) {
3755         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3756         return;
3757     }
3758
3759     UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3760
3761     if(set.containsAll(string16)) {
3762         errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
3763     }
3764
3765     // Remove trailing "aaaa".
3766     string16.truncate(string16.length()-4);
3767     if(!set.containsAll(string16)) {
3768         errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
3769     }
3770
3771     string16=UNICODE_STRING_SIMPLE("byayaxya");
3772     const UChar *s16=string16.getBuffer();
3773     int32_t length16=string16.length();
3774     if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3775         set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3776         set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3777         set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3778         set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3779         set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3780     ) {
3781         errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3782     }
3783
3784     pattern="[a{ab}{abc}{cd}]";
3785     pattern16=UnicodeString(pattern, -1, US_INV);
3786     set.applyPattern(pattern16, errorCode);
3787     if(U_FAILURE(errorCode)) {
3788         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3789         return;
3790     }
3791     string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
3792     s16=string16.getBuffer();
3793     length16=string16.length();
3794     if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3795         set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3796         set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3797     ) {
3798         errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3799     }
3800
3801     pattern="[d{cd}{bcd}{ab}]";
3802     pattern16=UnicodeString(pattern, -1, US_INV);
3803     set.applyPattern(pattern16, errorCode).freeze();
3804     if(U_FAILURE(errorCode)) {
3805         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3806         return;
3807     }
3808     string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
3809     s16=string16.getBuffer();
3810     length16=string16.length();
3811     if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3812         set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3813         set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3814     ) {
3815         errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3816     }
3817 }