icuSources/test/intltest/usettest.cpp

   1 /*
   2 ********************************************************************************
   3 *   Copyright (C) 1999-2016 International Business Machines Corporation and
   4 *   others. All Rights Reserved.
   5 ********************************************************************************
   6 *   Date        Name        Description
   7 *   10/20/99    alan        Creation.
   8 *   03/22/2000  Madhu       Added additional tests
   9 ********************************************************************************
  10 */
  11
  12 #include <stdio.h>
  13
  14 #include <string.h>
  15 #include "unicode/utypes.h"
  16 #include "usettest.h"
  17 #include "unicode/ucnv.h"
  18 #include "unicode/uniset.h"
  19 #include "unicode/uchar.h"
  20 #include "unicode/usetiter.h"
  21 #include "unicode/ustring.h"
  22 #include "unicode/parsepos.h"
  23 #include "unicode/symtable.h"
  24 #include "unicode/uversion.h"
  25 #include "cmemory.h"
  26 #include "hash.h"
  27
  28 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
  29     dataerrln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \
  30     u_errorName(status));}}
  31
  32 #define TEST_ASSERT(expr) {if (!(expr)) { \
  33     dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); }}
  34
  35 UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
  36     UnicodeString pat;
  37     set.toPattern(pat);
  38     return left + UnicodeSetTest::escape(pat);
  39 }
  40
  41 #define CASE(id,test) case id:                          \
  42                           name = #test;                 \
  43                           if (exec) {                   \
  44                               logln(#test "---");       \
  45                               logln();                  \
  46                               test();                   \
  47                           }                             \
  48                           break
  49
  50 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
  51 }
  52
  53 UConverter *UnicodeSetTest::openUTF8Converter() {
  54     if(utf8Cnv==NULL) {
  55         UErrorCode errorCode=U_ZERO_ERROR;
  56         utf8Cnv=ucnv_open("UTF-8", &errorCode);
  57     }
  58     return utf8Cnv;
  59 }
  60
  61 UnicodeSetTest::~UnicodeSetTest() {
  62     ucnv_close(utf8Cnv);
  63 }
  64
  65 void
  66 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
  67                                const char* &name, char* /*par*/) {
  68     // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
  69     switch (index) {
  70         CASE(0,TestPatterns);
  71         CASE(1,TestAddRemove);
  72         CASE(2,TestCategories);
  73         CASE(3,TestCloneEqualHash);
  74         CASE(4,TestMinimalRep);
  75         CASE(5,TestAPI);
  76         CASE(6,TestScriptSet);
  77         CASE(7,TestPropertySet);
  78         CASE(8,TestClone);
  79         CASE(9,TestExhaustive);
  80         CASE(10,TestToPattern);
  81         CASE(11,TestIndexOf);
  82         CASE(12,TestStrings);
  83         CASE(13,Testj2268);
  84         CASE(14,TestCloseOver);
  85         CASE(15,TestEscapePattern);
  86         CASE(16,TestInvalidCodePoint);
  87         CASE(17,TestSymbolTable);
  88         CASE(18,TestSurrogate);
  89         CASE(19,TestPosixClasses);
  90         CASE(20,TestIteration);
  91         CASE(21,TestFreezable);
  92         CASE(22,TestSpan);
  93         CASE(23,TestStringSpan);
  94         CASE(24,TestUCAUnsafeBackwards);
  95         default: name = ""; break;
  96     }
  97 }
  98
// Marker placed inside the NULL-terminated expectation arrays handed to
// expectToPattern(). In TestToPattern the entries listed before NOT are
// strings the set holds and the entries after it are strings it does not
// (presumably expectToPattern() switches from "must contain" to "must not
// contain" at this marker -- confirm against its definition).
static const char NOT[] = "%%%%";
 100
 101 /**
 102  * UVector was improperly copying contents
 103  * This code will crash this is still true
 104  */
 105 void UnicodeSetTest::Testj2268() {
 106   UnicodeSet t;
 107   t.add(UnicodeString("abc"));
 108   UnicodeSet test(t);
 109   UnicodeString ustrPat;
 110   test.toPattern(ustrPat, TRUE);
 111 }
 112
 113 /**
 114  * Test toPattern().
 115  */
 116 void UnicodeSetTest::TestToPattern() {
 117     UErrorCode ec = U_ZERO_ERROR;
 118
 119     // Test that toPattern() round trips with syntax characters and
 120     // whitespace.
 121     {
 122         static const char* OTHER_TOPATTERN_TESTS[] = {
 123             "[[:latin:]&[:greek:]]",
 124             "[[:latin:]-[:greek:]]",
 125             "[:nonspacing mark:]",
 126             NULL
 127         };
 128
 129         for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
 130             ec = U_ZERO_ERROR;
 131             UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
 132             if (U_FAILURE(ec)) {
 133                 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
 134                 continue;
 135             }
 136             checkPat(OTHER_TOPATTERN_TESTS[j], s);
 137         }
 138
 139         for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
 140             if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
 141
 142                 // check various combinations to make sure they all work.
 143                 if (i != 0 && !toPatternAux(i, i)){
 144                     continue;
 145                 }
 146                 if (!toPatternAux(0, i)){
 147                     continue;
 148                 }
 149                 if (!toPatternAux(i, 0xFFFF)){
 150                     continue;
 151                 }
 152             }
 153         }
 154     }
 155
 156     // Test pattern behavior of multicharacter strings.
 157     {
 158         ec = U_ZERO_ERROR;
 159         UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
 160
 161         // This loop isn't a loop.  It's here to make the compiler happy.
 162         // If you're curious, try removing it and changing the 'break'
 163         // statements (except for the last) to goto's.
 164         for (;;) {
 165             if (U_FAILURE(ec)) break;
 166             const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
 167             expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
 168
 169             s->add("ac");
 170             const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
 171             expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
 172
 173             s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
 174             if (U_FAILURE(ec)) break;
 175             const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
 176             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
 177
 178             s->add("[]");
 179             const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
 180             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
 181
 182             s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
 183             if (U_FAILURE(ec)) break;
 184             const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
 185             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
 186
 187             // j2189
 188             s->clear();
 189             s->add(UnicodeString("abc", ""));
 190             s->add(UnicodeString("abc", ""));
 191             const char* exp6[] = {"abc", NOT, "ab", NULL};
 192             expectToPattern(*s, "[{abc}]", exp6);
 193
 194             break;
 195         }
 196
 197         if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
 198         delete s;
 199     }
 200
 201     // JB#3400: For 2 character ranges prefer [ab] to [a-b]
 202     UnicodeSet s;
 203     s.add((UChar)97, (UChar)98); // 'a', 'b'
 204     expectToPattern(s, "[ab]", NULL);
 205 }
 206
 207 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
 208
 209     // use Integer.toString because Utility.hex doesn't handle ints
 210     UnicodeString pat = "";
 211     // TODO do these in hex
 212     //String source = "0x" + Integer.toString(start,16).toUpperCase();
 213     //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
 214     UnicodeString source;
 215     source = source + (uint32_t)start;
 216     if (start != end)
 217         source = source + ".." + (uint32_t)end;
 218     UnicodeSet testSet;
 219     testSet.add(start, end);
 220     return checkPat(source, testSet);
 221 }
 222
 223 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
 224                                const UnicodeSet& testSet) {
 225     // What we want to make sure of is that a pattern generated
 226     // by toPattern(), with or without escaped unprintables, can
 227     // be passed back into the UnicodeSet constructor.
 228     UnicodeString pat0;
 229
 230     testSet.toPattern(pat0, TRUE);
 231
 232     if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
 233
 234     //String pat1 = unescapeLeniently(pat0);
 235     //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
 236
 237     UnicodeString pat2;
 238     testSet.toPattern(pat2, FALSE);
 239     if (!checkPat(source, testSet, pat2)) return FALSE;
 240
 241     //String pat3 = unescapeLeniently(pat2);
 242     // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
 243
 244     //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
 245     logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
 246     return TRUE;
 247 }
 248
 249 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
 250                                const UnicodeSet& testSet,
 251                                const UnicodeString& pat) {
 252     UErrorCode ec = U_ZERO_ERROR;
 253     UnicodeSet testSet2(pat, ec);
 254     if (testSet2 != testSet) {
 255         errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
 256         return FALSE;
 257     }
 258     return TRUE;
 259 }
 260
 261 void
 262 UnicodeSetTest::TestPatterns(void) {
 263     UnicodeSet set;
 264     expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""),  "km");
 265     expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""),  "aczz");
 266     expectPattern(set, UnicodeString("[a\\-z]", ""),  "--aazz");
 267     expectPattern(set, UnicodeString("[-az]", ""),  "--aazz");
 268     expectPattern(set, UnicodeString("[az-]", ""),  "--aazz");
 269     expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
 270
 271     // Throw in a test of complement
 272     set.complement();
 273     UnicodeString exp;
 274     exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
 275     expectPairs(set, exp);
 276 }
 277
 278 void
 279 UnicodeSetTest::TestCategories(void) {
 280     UErrorCode status = U_ZERO_ERROR;
 281     const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
 282     UnicodeSet set(pat, status);
 283     if (U_FAILURE(status)) {
 284         dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
 285         return;
 286     } else {
 287         expectContainment(set, pat, "ABC", "abc");
 288     }
 289
 290     UChar32 i;
 291     int32_t failures = 0;
 292     // Make sure generation of L doesn't pollute cached Lu set
 293     // First generate L, then Lu
 294     set.applyPattern("[:L:]", status);
 295     if (U_FAILURE(status)) { errln("FAIL"); return; }
 296     for (i=0; i<0x200; ++i) {
 297         UBool l = u_isalpha((UChar)i);
 298         if (l != set.contains(i)) {
 299             errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
 300                   set.contains(i));
 301             if (++failures == 10) break;
 302         }
 303     }
 304
 305     set.applyPattern("[:Lu:]", status);
 306     if (U_FAILURE(status)) { errln("FAIL"); return; }
 307     for (i=0; i<0x200; ++i) {
 308         UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
 309         if (lu != set.contains(i)) {
 310             errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
 311                   set.contains(i));
 312             if (++failures == 20) break;
 313         }
 314     }
 315 }
 316 void
 317 UnicodeSetTest::TestCloneEqualHash(void) {
 318     UErrorCode status = U_ZERO_ERROR;
 319     // set1 and set2 used to be built with the obsolete constructor taking
 320     // UCharCategory values; replaced with pattern constructors
 321     // markus 20030502
 322     UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); //  :Ll: Letter, lowercase
 323     UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); //  Letter, lowercase
 324     if (U_FAILURE(status)){
 325         dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
 326         return;
 327     }
 328     UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status);   //Number, Decimal digit
 329     UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status);   //Number, Decimal digit
 330     if (U_FAILURE(status)){
 331         errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
 332         return;
 333     }
 334
 335     if (*set1 != *set1a) {
 336         errln("FAIL: category constructor for Ll broken");
 337     }
 338     if (*set2 != *set2a) {
 339         errln("FAIL: category constructor for Nd broken");
 340     }
 341     delete set1a;
 342     delete set2a;
 343
 344     logln("Testing copy construction");
 345     UnicodeSet *set1copy=new UnicodeSet(*set1);
 346     if(*set1 != *set1copy || *set1 == *set2 ||
 347         getPairs(*set1) != getPairs(*set1copy) ||
 348         set1->hashCode() != set1copy->hashCode()){
 349         errln("FAIL : Error in copy construction");
 350         return;
 351     }
 352
 353     logln("Testing =operator");
 354     UnicodeSet set1equal=*set1;
 355     UnicodeSet set2equal=*set2;
 356     if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
 357         set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
 358         errln("FAIL: Error in =operator");
 359     }
 360
 361     logln("Testing clone()");
 362     UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
 363     UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
 364     if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
 365         *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
 366         *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
 367         errln("FAIL: Error in clone");
 368     }
 369
 370     logln("Testing hashcode");
 371     if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
 372         set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
 373         set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
 374         set1->hashCode() == set2->hashCode()  || set1copy->hashCode() == set2->hashCode() ||
 375         set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
 376         errln("FAIL: Error in hashCode()");
 377     }
 378
 379     delete set1;
 380     delete set1copy;
 381     delete set2;
 382     delete set1clone;
 383     delete set2clone;
 384
 385
 386 }
 387 void
 388 UnicodeSetTest::TestAddRemove(void) {
 389     UnicodeSet set; // Construct empty set
 390     doAssert(set.isEmpty() == TRUE, "set should be empty");
 391     doAssert(set.size() == 0, "size should be 0");
 392     set.complement();
 393     doAssert(set.size() == 0x110000, "size should be 0x110000");
 394     set.clear();
 395     set.add(0x0061, 0x007a);
 396     expectPairs(set, "az");
 397     doAssert(set.isEmpty() == FALSE, "set should not be empty");
 398     doAssert(set.size() != 0, "size should not be equal to 0");
 399     doAssert(set.size() == 26, "size should be equal to 26");
 400     set.remove(0x006d, 0x0070);
 401     expectPairs(set, "alqz");
 402     doAssert(set.size() == 22, "size should be equal to 22");
 403     set.remove(0x0065, 0x0067);
 404     expectPairs(set, "adhlqz");
 405     doAssert(set.size() == 19, "size should be equal to 19");
 406     set.remove(0x0064, 0x0069);
 407     expectPairs(set, "acjlqz");
 408     doAssert(set.size() == 16, "size should be equal to 16");
 409     set.remove(0x0063, 0x0072);
 410     expectPairs(set, "absz");
 411     doAssert(set.size() == 10, "size should be equal to 10");
 412     set.add(0x0066, 0x0071);
 413     expectPairs(set, "abfqsz");
 414     doAssert(set.size() == 22, "size should be equal to 22");
 415     set.remove(0x0061, 0x0067);
 416     expectPairs(set, "hqsz");
 417     set.remove(0x0061, 0x007a);
 418     expectPairs(set, "");
 419     doAssert(set.isEmpty() == TRUE, "set should be empty");
 420     doAssert(set.size() == 0, "size should be 0");
 421     set.add(0x0061);
 422     doAssert(set.isEmpty() == FALSE, "set should not be empty");
 423     doAssert(set.size() == 1, "size should not be equal to 1");
 424     set.add(0x0062);
 425     set.add(0x0063);
 426     expectPairs(set, "ac");
 427     doAssert(set.size() == 3, "size should not be equal to 3");
 428     set.add(0x0070);
 429     set.add(0x0071);
 430     expectPairs(set, "acpq");
 431     doAssert(set.size() == 5, "size should not be equal to 5");
 432     set.clear();
 433     expectPairs(set, "");
 434     doAssert(set.isEmpty() == TRUE, "set should be empty");
 435     doAssert(set.size() == 0, "size should be 0");
 436
 437     // Try removing an entire set from another set
 438     expectPattern(set, "[c-x]", "cx");
 439     UnicodeSet set2;
 440     expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
 441     set.removeAll(set2);
 442     expectPairs(set, "deluxx");
 443
 444     // Try adding an entire set to another set
 445     expectPattern(set, "[jackiemclean]", "aacceein");
 446     expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
 447     set.addAll(set2);
 448     expectPairs(set, "aacehort");
 449     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
 450
 451     // Try retaining an set of elements contained in another set (intersection)
 452     UnicodeSet set3;
 453     expectPattern(set3, "[a-c]", "ac");
 454     doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
 455     set3.remove(0x0062);
 456     expectPairs(set3, "aacc");
 457     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
 458     set.retainAll(set3);
 459     expectPairs(set, "aacc");
 460     doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
 461     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
 462     set.clear();
 463     doAssert(set.size() != set3.size(), "set.size() != set3.size()");
 464
 465     // Test commutativity
 466     expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
 467     expectPattern(set2, "[jackiemclean]", "aacceein");
 468     set.addAll(set2);
 469     expectPairs(set, "aacehort");
 470     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
 471
 472
 473
 474
 475 }
 476
 477 /**
 478  * Make sure minimal representation is maintained.
 479  */
 480 void UnicodeSetTest::TestMinimalRep() {
 481     UErrorCode status = U_ZERO_ERROR;
 482     // This is pretty thoroughly tested by checkCanonicalRep()
 483     // run against the exhaustive operation results.  Use the code
 484     // here for debugging specific spot problems.
 485
 486     // 1 overlap against 2
 487     UnicodeSet set("[h-km-q]", status);
 488     if (U_FAILURE(status)) { errln("FAIL"); return; }
 489     UnicodeSet set2("[i-o]", status);
 490     if (U_FAILURE(status)) { errln("FAIL"); return; }
 491     set.addAll(set2);
 492     expectPairs(set, "hq");
 493     // right
 494     set.applyPattern("[a-m]", status);
 495     if (U_FAILURE(status)) { errln("FAIL"); return; }
 496     set2.applyPattern("[e-o]", status);
 497     if (U_FAILURE(status)) { errln("FAIL"); return; }
 498     set.addAll(set2);
 499     expectPairs(set, "ao");
 500     // left
 501     set.applyPattern("[e-o]", status);
 502     if (U_FAILURE(status)) { errln("FAIL"); return; }
 503     set2.applyPattern("[a-m]", status);
 504     if (U_FAILURE(status)) { errln("FAIL"); return; }
 505     set.addAll(set2);
 506     expectPairs(set, "ao");
 507     // 1 overlap against 3
 508     set.applyPattern("[a-eg-mo-w]", status);
 509     if (U_FAILURE(status)) { errln("FAIL"); return; }
 510     set2.applyPattern("[d-q]", status);
 511     if (U_FAILURE(status)) { errln("FAIL"); return; }
 512     set.addAll(set2);
 513     expectPairs(set, "aw");
 514 }
 515
 516 void UnicodeSetTest::TestAPI() {
 517     UErrorCode status = U_ZERO_ERROR;
 518     // default ct
 519     UnicodeSet set;
 520     if (!set.isEmpty() || set.getRangeCount() != 0) {
 521         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
 522               set);
 523     }
 524
 525     // clear(), isEmpty()
 526     set.add(0x0061);
 527     if (set.isEmpty()) {
 528         errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
 529               set);
 530     }
 531     set.clear();
 532     if (!set.isEmpty()) {
 533         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
 534               set);
 535     }
 536
 537     // size()
 538     set.clear();
 539     if (set.size() != 0) {
 540         errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
 541               ": " + set);
 542     }
 543     set.add(0x0061);
 544     if (set.size() != 1) {
 545         errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
 546               ": " + set);
 547     }
 548     set.add(0x0031, 0x0039);
 549     if (set.size() != 10) {
 550         errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
 551               ": " + set);
 552     }
 553
 554     // contains(first, last)
 555     set.clear();
 556     set.applyPattern("[A-Y 1-8 b-d l-y]", status);
 557     if (U_FAILURE(status)) { errln("FAIL"); return; }
 558     for (int32_t i = 0; i<set.getRangeCount(); ++i) {
 559         UChar32 a = set.getRangeStart(i);
 560         UChar32 b = set.getRangeEnd(i);
 561         if (!set.contains(a, b)) {
 562             errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
 563                   " but doesn't: " + set);
 564         }
 565         if (set.contains((UChar32)(a-1), b)) {
 566             errln((UnicodeString)"FAIL, shouldn't contain " +
 567                   (unsigned short)(a-1) + '-' + (unsigned short)b +
 568                   " but does: " + set);
 569         }
 570         if (set.contains(a, (UChar32)(b+1))) {
 571             errln((UnicodeString)"FAIL, shouldn't contain " +
 572                   (unsigned short)a + '-' + (unsigned short)(b+1) +
 573                   " but does: " + set);
 574         }
 575     }
 576
 577     // Ported InversionList test.
 578     UnicodeSet a((UChar32)3,(UChar32)10);
 579     UnicodeSet b((UChar32)7,(UChar32)15);
 580     UnicodeSet c;
 581
 582     logln((UnicodeString)"a [3-10]: " + a);
 583     logln((UnicodeString)"b [7-15]: " + b);
 584     c = a;
 585     c.addAll(b);
 586     UnicodeSet exp((UChar32)3,(UChar32)15);
 587     if (c == exp) {
 588         logln((UnicodeString)"c.set(a).add(b): " + c);
 589     } else {
 590         errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
 591     }
 592     c.complement();
 593     exp.set((UChar32)0, (UChar32)2);
 594     exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
 595     if (c == exp) {
 596         logln((UnicodeString)"c.complement(): " + c);
 597     } else {
 598         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
 599     }
 600     c.complement();
 601     exp.set((UChar32)3, (UChar32)15);
 602     if (c == exp) {
 603         logln((UnicodeString)"c.complement(): " + c);
 604     } else {
 605         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
 606     }
 607     c = a;
 608     c.complementAll(b);
 609     exp.set((UChar32)3,(UChar32)6);
 610     exp.add((UChar32)11,(UChar32) 15);
 611     if (c == exp) {
 612         logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
 613     } else {
 614         errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
 615     }
 616
 617     exp = c;
 618     bitsToSet(setToBits(c), c);
 619     if (c == exp) {
 620         logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
 621     } else {
 622         errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
 623     }
 624
 625     // Additional tests for coverage JB#2118
 626     //UnicodeSet::complement(class UnicodeString const &)
 627     //UnicodeSet::complementAll(class UnicodeString const &)
 628     //UnicodeSet::containsNone(class UnicodeSet const &)
 629     //UnicodeSet::containsNone(long,long)
 630     //UnicodeSet::containsSome(class UnicodeSet const &)
 631     //UnicodeSet::containsSome(long,long)
 632     //UnicodeSet::removeAll(class UnicodeString const &)
 633     //UnicodeSet::retain(long)
 634     //UnicodeSet::retainAll(class UnicodeString const &)
 635     //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
 636     //UnicodeSetIterator::getString(void)
 637     set.clear();
 638     set.complement("ab");
 639     exp.applyPattern("[{ab}]", status);
 640     if (U_FAILURE(status)) { errln("FAIL"); return; }
 641     if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
 642
 643     UnicodeSetIterator iset(set);
 644     if (!iset.next() || !iset.isString()) {
 645         errln("FAIL: UnicodeSetIterator::next/isString");
 646     } else if (iset.getString() != "ab") {
 647         errln("FAIL: UnicodeSetIterator::getString");
 648     }
 649
 650     set.add((UChar32)0x61, (UChar32)0x7A);
 651     set.complementAll("alan");
 652     exp.applyPattern("[{ab}b-kmo-z]", status);
 653     if (U_FAILURE(status)) { errln("FAIL"); return; }
 654     if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
 655
 656     exp.applyPattern("[a-z]", status);
 657     if (U_FAILURE(status)) { errln("FAIL"); return; }
 658     if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
 659     if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
 660     exp.applyPattern("[aln]", status);
 661     if (U_FAILURE(status)) { errln("FAIL"); return; }
 662     if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
 663     if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
 664
 665     if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
 666         errln("FAIL: containsNone(UChar32, UChar32)");
 667     }
 668     if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
 669         errln("FAIL: containsSome(UChar32, UChar32)");
 670     }
 671     if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
 672         errln("FAIL: containsNone(UChar32, UChar32)");
 673     }
 674     if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
 675         errln("FAIL: containsSome(UChar32, UChar32)");
 676     }
 677
 678     set.removeAll("liu");
 679     exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
 680     if (U_FAILURE(status)) { errln("FAIL"); return; }
 681     if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
 682
 683     set.retainAll("star");
 684     exp.applyPattern("[rst]", status);
 685     if (U_FAILURE(status)) { errln("FAIL"); return; }
 686     if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
 687
 688     set.retain((UChar32)0x73);
 689     exp.applyPattern("[s]", status);
 690     if (U_FAILURE(status)) { errln("FAIL"); return; }
 691     if (set != exp) { errln("FAIL: retain('s')"); return; }
 692
 693     uint16_t buf[32];
 694     int32_t slen = set.serialize(buf, UPRV_LENGTHOF(buf), status);
 695     if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
 696     if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
 697         errln("FAIL: serialize");
 698         return;
 699     }
 700
 701     // Conversions to and from USet
 702     UnicodeSet *uniset = &set;
 703     USet *uset = uniset->toUSet();
 704     TEST_ASSERT((void *)uset == (void *)uniset);
 705     UnicodeSet *setx = UnicodeSet::fromUSet(uset);
 706     TEST_ASSERT((void *)setx == (void *)uset);
 707     const UnicodeSet *constSet = uniset;
 708     const USet *constUSet = constSet->toUSet();
 709     TEST_ASSERT((void *)constUSet == (void *)constSet);
 710     const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
 711     TEST_ASSERT((void *)constSetx == (void *)constUSet);
 712
 713     // span(UnicodeString) and spanBack(UnicodeString) convenience methods
 714     UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
 715     UnicodeSet ac(0x61, 0x63);
 716     ac.remove(0x62).freeze();
 717     if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
 718         ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
 719         ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
 720         ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
 721         ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
 722         ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
 723         ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
 724         ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
 725         ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
 726         ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
 727     ) {
 728         errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
 729     }
 730     if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
 731         ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
 732         ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
 733         ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
 734         ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
 735         ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
 736         ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
 737         ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
 738         ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
 739         ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
 740     ) {
 741         errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
 742     }
 743 }
 744
 745 void UnicodeSetTest::TestIteration() {
 746     UErrorCode ec = U_ZERO_ERROR;
 747     int i = 0;
 748     int outerLoop;
 749
 750     // 6 code points, 3 ranges, 2 strings, 8 total elements
 751     //   Iteration will access them in sorted order -  a, b, c, y, z, U0001abcd, "str1", "str2"
 752     UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
 753     TEST_ASSERT_SUCCESS(ec);
 754     UnicodeSetIterator it(set);
 755
 756     for (outerLoop=0; outerLoop<3; outerLoop++) {
 757         // Run the test multiple times, to check that iterator.reset() is working.
 758         for (i=0; i<10; i++) {
 759             UBool         nextv        = it.next();
 760             UBool         isString     = it.isString();
 761             int32_t       codePoint    = it.getCodepoint();
 762             //int32_t       codePointEnd = it.getCodepointEnd();
 763             UnicodeString s   = it.getString();
 764             switch (i) {
 765             case 0:
 766                 TEST_ASSERT(nextv == TRUE);
 767                 TEST_ASSERT(isString == FALSE);
 768                 TEST_ASSERT(codePoint==0x61);
 769                 TEST_ASSERT(s == "a");
 770                 break;
 771             case 1:
 772                 TEST_ASSERT(nextv == TRUE);
 773                 TEST_ASSERT(isString == FALSE);
 774                 TEST_ASSERT(codePoint==0x62);
 775                 TEST_ASSERT(s == "b");
 776                 break;
 777             case 2:
 778                 TEST_ASSERT(nextv == TRUE);
 779                 TEST_ASSERT(isString == FALSE);
 780                 TEST_ASSERT(codePoint==0x63);
 781                 TEST_ASSERT(s == "c");
 782                 break;
 783             case 3:
 784                 TEST_ASSERT(nextv == TRUE);
 785                 TEST_ASSERT(isString == FALSE);
 786                 TEST_ASSERT(codePoint==0x79);
 787                 TEST_ASSERT(s == "y");
 788                 break;
 789             case 4:
 790                 TEST_ASSERT(nextv == TRUE);
 791                 TEST_ASSERT(isString == FALSE);
 792                 TEST_ASSERT(codePoint==0x7a);
 793                 TEST_ASSERT(s == "z");
 794                 break;
 795             case 5:
 796                 TEST_ASSERT(nextv == TRUE);
 797                 TEST_ASSERT(isString == FALSE);
 798                 TEST_ASSERT(codePoint==0x1abcd);
 799                 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
 800                 break;
 801             case 6:
 802                 TEST_ASSERT(nextv == TRUE);
 803                 TEST_ASSERT(isString == TRUE);
 804                 TEST_ASSERT(s == "str1");
 805                 break;
 806             case 7:
 807                 TEST_ASSERT(nextv == TRUE);
 808                 TEST_ASSERT(isString == TRUE);
 809                 TEST_ASSERT(s == "str2");
 810                 break;
 811             case 8:
 812                 TEST_ASSERT(nextv == FALSE);
 813                 break;
 814             case 9:
 815                 TEST_ASSERT(nextv == FALSE);
 816                 break;
 817             }
 818         }
 819         it.reset();  // prepare to run the iteration again.
 820     }
 821 }
 822
 823
 824
 825
 826 void UnicodeSetTest::TestStrings() {
 827     UErrorCode ec = U_ZERO_ERROR;
 828
 829     UnicodeSet* testList[] = {
 830         UnicodeSet::createFromAll("abc"),
 831         new UnicodeSet("[a-c]", ec),
 832
 833         &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
 834         new UnicodeSet("[{ll}{ch}a-z]", ec),
 835
 836         UnicodeSet::createFrom("ab}c"),
 837         new UnicodeSet("[{ab\\}c}]", ec),
 838
 839         &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
 840         new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
 841
 842         NULL
 843     };
 844
 845     if (U_FAILURE(ec)) {
 846         errln("FAIL: couldn't construct test sets");
 847     }
 848
 849     for (int32_t i = 0; testList[i] != NULL; i+=2) {
 850         if (U_SUCCESS(ec)) {
 851             UnicodeString pat0, pat1;
 852             testList[i]->toPattern(pat0, TRUE);
 853             testList[i+1]->toPattern(pat1, TRUE);
 854             if (*testList[i] == *testList[i+1]) {
 855                 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
 856             } else {
 857                 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
 858             }
 859         }
 860         delete testList[i];
 861         delete testList[i+1];
 862     }
 863 }
 864
 865 /**
 866  * Test the [:Latin:] syntax.
 867  */
 868 void UnicodeSetTest::TestScriptSet() {
 869     expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
 870
 871     expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
 872
 873     /* Jitterbug 1423 */
 874     expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
 875
 876 }
 877
 878 /**
 879  * Test the [:Latin:] syntax.
 880  */
 881 void UnicodeSetTest::TestPropertySet() {
 882     static const char* const DATA[] = {
 883         // Pattern, Chars IN, Chars NOT in
 884
 885         "[:Latin:]",
 886         "aA",
 887         "\\u0391\\u03B1",
 888
 889         "[\\p{Greek}]",
 890         "\\u0391\\u03B1",
 891         "aA",
 892
 893         "\\P{ GENERAL Category = upper case letter }",
 894         "abc",
 895         "ABC",
 896
 897 #if !UCONFIG_NO_NORMALIZATION
 898         // Combining class: @since ICU 2.2
 899         // Check both symbolic and numeric
 900         "\\p{ccc=Nukta}",
 901         "\\u0ABC",
 902         "abc",
 903
 904         "\\p{Canonical Combining Class = 11}",
 905         "\\u05B1",
 906         "\\u05B2",
 907
 908         "[:c c c = iota subscript :]",
 909         "\\u0345",
 910         "xyz",
 911 #endif
 912
 913         // Bidi class: @since ICU 2.2
 914         "\\p{bidiclass=lefttoright}",
 915         "abc",
 916         "\\u0671\\u0672",
 917
 918         // Binary properties: @since ICU 2.2
 919         "\\p{ideographic}",
 920         "\\u4E0A",
 921         "x",
 922
 923         "[:math=false:]",
 924         "q)*(",
 925         // weiv: )(and * were removed from math in Unicode 4.0.1
 926         //"(*+)",
 927         "+<>^",
 928
 929         // JB#1767 \N{}, \p{ASCII}
 930         "[:Ascii:]",
 931         "abc\\u0000\\u007F",
 932         "\\u0080\\u4E00",
 933
 934         "[\\N{ latin small letter  a  }[:name= latin small letter z:]]",
 935         "az",
 936         "qrs",
 937
 938         // JB#2015
 939         "[:any:]",
 940         "a\\U0010FFFF",
 941         "",
 942
 943         "[:nv=0.5:]",
 944         "\\u00BD\\u0F2A",
 945         "\\u00BC",
 946
 947         // JB#2653: Age
 948         "[:Age=1.1:]",
 949         "\\u03D6", // 1.1
 950         "\\u03D8\\u03D9", // 3.2
 951
 952         "[:Age=3.1:]",
 953         "\\u1800\\u3400\\U0002f800",
 954         "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
 955
 956         // JB#2350: Case_Sensitive
 957         "[:Case Sensitive:]",
 958         "A\\u1FFC\\U00010410",
 959         ";\\u00B4\\U00010500",
 960
 961         // JB#2832: C99-compatibility props
 962         "[:blank:]",
 963         " \\u0009",
 964         "1-9A-Z",
 965
 966         "[:graph:]",
 967         "19AZ",
 968         " \\u0003\\u0007\\u0009\\u000A\\u000D",
 969
 970         "[:punct:]",
 971         "!@#%&*()[]{}-_\\/;:,.?'\"",
 972         "09azAZ",
 973
 974         "[:xdigit:]",
 975         "09afAF",
 976         "gG!",
 977
 978         // Regex compatibility test
 979         "[-b]", // leading '-' is literal
 980         "-b",
 981         "ac",
 982
 983         "[^-b]", // leading '-' is literal
 984         "ac",
 985         "-b",
 986
 987         "[b-]", // trailing '-' is literal
 988         "-b",
 989         "ac",
 990
 991         "[^b-]", // trailing '-' is literal
 992         "ac",
 993         "-b",
 994
 995         "[a-b-]", // trailing '-' is literal
 996         "ab-",
 997         "c=",
 998
 999         "[[a-q]&[p-z]-]", // trailing '-' is literal
1000         "pq-",
1001         "or=",
1002
1003         "[\\s|\\)|:|$|\\>]", // from regex tests
1004         "s|):$>",
1005         "abc",
1006
1007         "[\\uDC00cd]", // JB#2906: isolated trail at start
1008         "cd\\uDC00",
1009         "ab\\uD800\\U00010000",
1010
1011         "[ab\\uD800]", // JB#2906: isolated trail at start
1012         "ab\\uD800",
1013         "cd\\uDC00\\U00010000",
1014
1015         "[ab\\uD800cd]", // JB#2906: isolated lead in middle
1016         "abcd\\uD800",
1017         "ef\\uDC00\\U00010000",
1018
1019         "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
1020         "abcd\\uDC00",
1021         "ef\\uD800\\U00010000",
1022
1023 #if !UCONFIG_NO_NORMALIZATION
1024         "[:^lccc=0:]", // Lead canonical class
1025         "\\u0300\\u0301",
1026         "abcd\\u00c0\\u00c5",
1027
1028         "[:^tccc=0:]", // Trail canonical class
1029         "\\u0300\\u0301\\u00c0\\u00c5",
1030         "abcd",
1031
1032         "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
1033         "\\u0300\\u0301\\u00c0\\u00c5",
1034         "abcd",
1035
1036         "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
1037         "",
1038         "abcd\\u0300\\u0301\\u00c0\\u00c5",
1039
1040         "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
1041         "\\u0F73\\u0F75\\u0F81",
1042         "abcd\\u0300\\u0301\\u00c0\\u00c5",
1043 #endif /* !UCONFIG_NO_NORMALIZATION */
1044
1045         "[:Assigned:]",
1046         "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
1047         "\\u0888\\uFDD3\\uFFFE\\U00050005",
1048
1049         // Script_Extensions, new in Unicode 6.0
1050         "[:scx=Arab:]",
1051         "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
1052         "\\u061D\\uFDEF\\uFDFE",
1053
1054         // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
1055         // so scx-sc is missing U+FDF2.
1056         "[[:Script_Extensions=Arabic:]-[:Arab:]]",
1057         "\\u0640\\u064B\\u0650\\u0655",
1058         "\\uFDF2"
1059     };
1060
1061     static const int32_t DATA_LEN = UPRV_LENGTHOF(DATA);
1062
1063     for (int32_t i=0; i<DATA_LEN; i+=3) {
1064         expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
1065                           CharsToUnicodeString(DATA[i+2]));
1066     }
1067 }
1068
1069 /**
1070   * Test that Posix style character classes [:digit:], etc.
1071   *   have the Unicode definitions from TR 18.
1072   */
1073 void UnicodeSetTest::TestPosixClasses() {
1074     {
1075         UErrorCode status = U_ZERO_ERROR;
1076         UnicodeSet s1("[:alpha:]", status);
1077         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
1078         TEST_ASSERT_SUCCESS(status);
1079         TEST_ASSERT(s1==s2);
1080     }
1081     {
1082         UErrorCode status = U_ZERO_ERROR;
1083         UnicodeSet s1("[:lower:]", status);
1084         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
1085         TEST_ASSERT_SUCCESS(status);
1086         TEST_ASSERT(s1==s2);
1087     }
1088     {
1089         UErrorCode status = U_ZERO_ERROR;
1090         UnicodeSet s1("[:upper:]", status);
1091         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
1092         TEST_ASSERT_SUCCESS(status);
1093         TEST_ASSERT(s1==s2);
1094     }
1095     {
1096         UErrorCode status = U_ZERO_ERROR;
1097         UnicodeSet s1("[:punct:]", status);
1098         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
1099         TEST_ASSERT_SUCCESS(status);
1100         TEST_ASSERT(s1==s2);
1101     }
1102     {
1103         UErrorCode status = U_ZERO_ERROR;
1104         UnicodeSet s1("[:digit:]", status);
1105         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
1106         TEST_ASSERT_SUCCESS(status);
1107         TEST_ASSERT(s1==s2);
1108     }
1109     {
1110         UErrorCode status = U_ZERO_ERROR;
1111         UnicodeSet s1("[:xdigit:]", status);
1112         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
1113         TEST_ASSERT_SUCCESS(status);
1114         TEST_ASSERT(s1==s2);
1115     }
1116     {
1117         UErrorCode status = U_ZERO_ERROR;
1118         UnicodeSet s1("[:alnum:]", status);
1119         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
1120         TEST_ASSERT_SUCCESS(status);
1121         TEST_ASSERT(s1==s2);
1122     }
1123     {
1124         UErrorCode status = U_ZERO_ERROR;
1125         UnicodeSet s1("[:space:]", status);
1126         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
1127         TEST_ASSERT_SUCCESS(status);
1128         TEST_ASSERT(s1==s2);
1129     }
1130     {
1131         UErrorCode status = U_ZERO_ERROR;
1132         UnicodeSet s1("[:blank:]", status);
1133         TEST_ASSERT_SUCCESS(status);
1134         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
1135             status);
1136         TEST_ASSERT_SUCCESS(status);
1137         TEST_ASSERT(s1==s2);
1138     }
1139     {
1140         UErrorCode status = U_ZERO_ERROR;
1141         UnicodeSet s1("[:cntrl:]", status);
1142         TEST_ASSERT_SUCCESS(status);
1143         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
1144         TEST_ASSERT_SUCCESS(status);
1145         TEST_ASSERT(s1==s2);
1146     }
1147     {
1148         UErrorCode status = U_ZERO_ERROR;
1149         UnicodeSet s1("[:graph:]", status);
1150         TEST_ASSERT_SUCCESS(status);
1151         UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
1152         TEST_ASSERT_SUCCESS(status);
1153         TEST_ASSERT(s1==s2);
1154     }
1155     {
1156         UErrorCode status = U_ZERO_ERROR;
1157         UnicodeSet s1("[:print:]", status);
1158         TEST_ASSERT_SUCCESS(status);
1159         UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
1160         TEST_ASSERT_SUCCESS(status);
1161         TEST_ASSERT(s1==s2);
1162     }
1163 }
1164 /**
1165  * Test cloning of UnicodeSet.  For C++, we test the copy constructor.
1166  */
1167 void UnicodeSetTest::TestClone() {
1168     UErrorCode ec = U_ZERO_ERROR;
1169     UnicodeSet s("[abcxyz]", ec);
1170     UnicodeSet t(s);
1171     expectContainment(t, "abc", "def");
1172 }
1173
1174 /**
1175  * Test the indexOf() and charAt() methods.
1176  */
1177 void UnicodeSetTest::TestIndexOf() {
1178     UErrorCode ec = U_ZERO_ERROR;
1179     UnicodeSet set("[a-cx-y3578]", ec);
1180     if (U_FAILURE(ec)) {
1181         errln("FAIL: UnicodeSet constructor");
1182         return;
1183     }
1184     for (int32_t i=0; i<set.size(); ++i) {
1185         UChar32 c = set.charAt(i);
1186         if (set.indexOf(c) != i) {
1187             errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1188                 i, c, set.indexOf(c));
1189         }
1190     }
1191     UChar32 c = set.charAt(set.size());
1192     if (c != -1) {
1193         errln("FAIL: charAt(<out of range>) = %X", c);
1194     }
1195     int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
1196     if (j != -1) {
1197         errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1198     }
1199 }
1200
1201 /**
1202  * Test closure API.
1203  */
1204 void UnicodeSetTest::TestCloseOver() {
1205     UErrorCode ec = U_ZERO_ERROR;
1206
1207     char CASE[] = {(char)USET_CASE_INSENSITIVE};
1208     char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1209     const char* DATA[] = {
1210         // selector, input, output
1211         CASE,
1212         "[aq\\u00DF{Bc}{bC}{Fi}]",
1213         "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]",  // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
1214
1215         CASE,
1216         "[\\u01F1]", // 'DZ'
1217         "[\\u01F1\\u01F2\\u01F3]",
1218
1219         CASE,
1220         "[\\u1FB4]",
1221         "[\\u1FB4{\\u03AC\\u03B9}]",
1222
1223         CASE,
1224         "[{F\\uFB01}]",
1225         "[\\uFB03{ffi}]",
1226
1227         CASE, // make sure binary search finds limits
1228         "[a\\uFF3A]",
1229         "[aA\\uFF3A\\uFF5A]",
1230
1231         CASE,
1232         "[a-z]","[A-Za-z\\u017F\\u212A]",
1233         CASE,
1234         "[abc]","[A-Ca-c]",
1235         CASE,
1236         "[ABC]","[A-Ca-c]",
1237
1238         CASE, "[i]", "[iI]",
1239
1240         CASE, "[\\u0130]",          "[\\u0130{i\\u0307}]", // dotted I
1241         CASE, "[{i\\u0307}]",       "[\\u0130{i\\u0307}]", // i with dot
1242
1243         CASE, "[\\u0131]",          "[\\u0131]", // dotless i
1244
1245         CASE, "[\\u0390]",          "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1246
1247         CASE, "[\\u03c2]",          "[\\u03a3\\u03c2\\u03c3]", // sigmas
1248
1249         CASE, "[\\u03f2]",          "[\\u03f2\\u03f9]", // lunate sigmas
1250
1251         CASE, "[\\u03f7]",          "[\\u03f7\\u03f8]",
1252
1253         CASE, "[\\u1fe3]",          "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1254
1255         CASE, "[\\ufb05]",          "[\\ufb05\\ufb06{st}]",
1256         CASE, "[{st}]",             "[\\ufb05\\ufb06{st}]",
1257
1258         CASE, "[\\U0001044F]",      "[\\U00010427\\U0001044F]",
1259
1260         CASE, "[{a\\u02BE}]",       "[\\u1E9A{a\\u02BE}]", // first in sorted table
1261
1262         CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1263
1264 #if !UCONFIG_NO_FILE_IO
1265         CASE_MAPPINGS,
1266         "[aq\\u00DF{Bc}{bC}{Fi}]",
1267         "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1268 #endif
1269
1270         CASE_MAPPINGS,
1271         "[\\u01F1]", // 'DZ'
1272         "[\\u01F1\\u01F2\\u01F3]",
1273
1274         CASE_MAPPINGS,
1275         "[a-z]",
1276         "[A-Za-z]",
1277
1278         NULL
1279     };
1280
1281     UnicodeSet s;
1282     UnicodeSet t;
1283     UnicodeString buf;
1284     for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1285         int32_t selector = DATA[i][0];
1286         UnicodeString pat(DATA[i+1], -1, US_INV);
1287         UnicodeString exp(DATA[i+2], -1, US_INV);
1288         s.applyPattern(pat, ec);
1289         s.closeOver(selector);
1290         t.applyPattern(exp, ec);
1291         if (U_FAILURE(ec)) {
1292             errln("FAIL: applyPattern failed");
1293             continue;
1294         }
1295         if (s == t) {
1296             logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1297         } else {
1298             dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1299                   s.toPattern(buf, TRUE) + ", expected " + exp);
1300         }
1301     }
1302
1303 #if 0
1304     /*
1305      * Unused test code.
1306      * This was used to compare the old implementation (using USET_CASE)
1307      * with the new one (using 0x100 temporarily)
1308      * while transitioning from hardcoded case closure tables in uniset.cpp
1309      * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1310      * and using ucase.c functions for closure.
1311      * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1312      *
1313      * Note: The old and new implementation never fully matched because
1314      * the old implementation turned out to not map U+0130 and U+0131 correctly
1315      * (dotted I and dotless i) and because the old implementation's data tables
1316      * were outdated compared to Unicode 4.0.1 at the time of the change to the
1317      * new implementation. (So sigmas and some other characters were not handled
1318      * according to the newer Unicode version.)
1319      */
1320     UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1321     UnicodeSetIterator si(sens);
1322     UnicodeString str, buf2;
1323     const UnicodeString *pStr;
1324     UChar32 c;
1325     while(si.next()) {
1326         if(!si.isString()) {
1327             c=si.getCodepoint();
1328             s.clear();
1329             s.add(c);
1330
1331             str.setTo(c);
1332             str.foldCase();
1333             sens2.add(str);
1334
1335             t=s;
1336             s.closeOver(USET_CASE);
1337             t.closeOver(0x100);
1338             if(s!=t) {
1339                 errln("FAIL: closeOver(U+%04x) differs: ", c);
1340                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1341             }
1342         }
1343     }
1344     // remove all code points
1345     // should contain all full case folding mapping strings
1346     sens2.remove(0, 0x10ffff);
1347     si.reset(sens2);
1348     while(si.next()) {
1349         if(si.isString()) {
1350             pStr=&si.getString();
1351             s.clear();
1352             s.add(*pStr);
1353             t=s2=s;
1354             s.closeOver(USET_CASE);
1355             t.closeOver(0x100);
1356             if(s!=t) {
1357                 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1358                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1359             }
1360         }
1361     }
1362 #endif
1363
1364     // Test the pattern API
1365     s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1366     if (U_FAILURE(ec)) {
1367         errln("FAIL: applyPattern failed");
1368     } else {
1369         expectContainment(s, "abcABC", "defDEF");
1370     }
1371     UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1372     if (U_FAILURE(ec)) {
1373         errln("FAIL: constructor failed");
1374     } else {
1375         expectContainment(v, "defDEF", "abcABC");
1376     }
1377     UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1378     if (U_FAILURE(ec)) {
1379         errln("FAIL: construct w/case mappings failed");
1380     } else {
1381         expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1382     }
1383 }
1384
1385 void UnicodeSetTest::TestEscapePattern() {
1386     const char pattern[] =
1387         "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1388     const char exp[] =
1389         "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1390     // We test this with two passes; in the second pass we
1391     // pre-unescape the pattern.  Since U+200E is Pattern_White_Space,
1392     // this fails -- which is what we expect.
1393     for (int32_t pass=1; pass<=2; ++pass) {
1394         UErrorCode ec = U_ZERO_ERROR;
1395         UnicodeString pat(pattern, -1, US_INV);
1396         if (pass==2) {
1397             pat = pat.unescape();
1398         }
1399         // Pattern is only good for pass 1
1400         UBool isPatternValid = (pass==1);
1401
1402         UnicodeSet set(pat, ec);
1403         if (U_SUCCESS(ec) != isPatternValid){
1404             errln((UnicodeString)"FAIL: applyPattern(" +
1405                   escape(pat) + ") => " +
1406                   u_errorName(ec));
1407             continue;
1408         }
1409         if (U_FAILURE(ec)) {
1410             continue;
1411         }
1412         if (set.contains((UChar)0x0644)){
1413             errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1414         }
1415
1416         UnicodeString newpat;
1417         set.toPattern(newpat, TRUE);
1418         if (newpat == UnicodeString(exp, -1, US_INV)) {
1419             logln(escape(pat) + " => " + newpat);
1420         } else {
1421             errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1422         }
1423
1424         for (int32_t i=0; i<set.getRangeCount(); ++i) {
1425             UnicodeString str("Range ");
1426             str.append((UChar)(0x30 + i))
1427                 .append(": ")
1428                 .append((UChar32)set.getRangeStart(i))
1429                 .append(" - ")
1430                 .append((UChar32)set.getRangeEnd(i));
1431             str = str + " (" + set.getRangeStart(i) + " - " +
1432                 set.getRangeEnd(i) + ")";
1433             if (set.getRangeStart(i) < 0) {
1434                 errln((UnicodeString)"FAIL: " + escape(str));
1435             } else {
1436                 logln(escape(str));
1437             }
1438         }
1439     }
1440 }
1441
1442 void UnicodeSetTest::expectRange(const UnicodeString& label,
1443                                  const UnicodeSet& set,
1444                                  UChar32 start, UChar32 end) {
1445     UnicodeSet exp(start, end);
1446     UnicodeString pat;
1447     if (set == exp) {
1448         logln(label + " => " + set.toPattern(pat, TRUE));
1449     } else {
1450         UnicodeString xpat;
1451         errln((UnicodeString)"FAIL: " + label + " => " +
1452               set.toPattern(pat, TRUE) +
1453               ", expected " + exp.toPattern(xpat, TRUE));
1454     }
1455 }
1456
1457 void UnicodeSetTest::TestInvalidCodePoint() {
1458
1459     const UChar32 DATA[] = {
1460         // Test range             Expected range
1461         0, 0x10FFFF,              0, 0x10FFFF,
1462         (UChar32)-1, 8,           0, 8,
1463         8, 0x110000,              8, 0x10FFFF
1464     };
1465     const int32_t DATA_LENGTH = UPRV_LENGTHOF(DATA);
1466
1467     UnicodeString pat;
1468     int32_t i;
1469
1470     for (i=0; i<DATA_LENGTH; i+=4) {
1471         UChar32 start  = DATA[i];
1472         UChar32 end    = DATA[i+1];
1473         UChar32 xstart = DATA[i+2];
1474         UChar32 xend   = DATA[i+3];
1475
1476         // Try various API using the test code points
1477
1478         UnicodeSet set(start, end);
1479         expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1480                     set, xstart, xend);
1481
1482         set.clear();
1483         set.set(start, end);
1484         expectRange((UnicodeString)"set(" + start + "," + end + ")",
1485                     set, xstart, xend);
1486
1487         UBool b = set.contains(start);
1488         b = set.contains(start, end);
1489         b = set.containsNone(start, end);
1490         b = set.containsSome(start, end);
1491         (void)b;   // Suppress set but not used warning.
1492
1493         /*int32_t index = set.indexOf(start);*/
1494
1495         set.clear();
1496         set.add(start);
1497         set.add(start, end);
1498         expectRange((UnicodeString)"add(" + start + "," + end + ")",
1499                     set, xstart, xend);
1500
1501         set.set(0, 0x10FFFF);
1502         set.retain(start, end);
1503         expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1504                     set, xstart, xend);
1505         set.retain(start);
1506
1507         set.set(0, 0x10FFFF);
1508         set.remove(start);
1509         set.remove(start, end);
1510         set.complement();
1511         expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1512                     set, xstart, xend);
1513
1514         set.set(0, 0x10FFFF);
1515         set.complement(start, end);
1516         set.complement();
1517         expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1518                     set, xstart, xend);
1519         set.complement(start);
1520     }
1521
1522     const UChar32 DATA2[] = {
1523         0,
1524         0x10FFFF,
1525         (UChar32)-1,
1526         0x110000
1527     };
1528     const int32_t DATA2_LENGTH = UPRV_LENGTHOF(DATA2);
1529
1530     for (i=0; i<DATA2_LENGTH; ++i) {
1531         UChar32 c = DATA2[i], end = 0x10FFFF;
1532         UBool valid = (c >= 0 && c <= 0x10FFFF);
1533
1534         UnicodeSet set(0, 0x10FFFF);
1535
1536         // For single-codepoint contains, invalid codepoints are NOT contained
1537         UBool b = set.contains(c);
1538         if (b == valid) {
1539             logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1540                   ") = " + b);
1541         } else {
1542             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1543                   ") = " + b);
1544         }
1545
1546         // For codepoint range contains, containsNone, and containsSome,
1547         // invalid or empty (start > end) ranges have UNDEFINED behavior.
1548         b = set.contains(c, end);
1549         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1550               "," + end + ") = " + b);
1551
1552         b = set.containsNone(c, end);
1553         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1554               "," + end + ") = " + b);
1555
1556         b = set.containsSome(c, end);
1557         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1558               "," + end + ") = " + b);
1559
1560         int32_t index = set.indexOf(c);
1561         if ((index >= 0) == valid) {
1562             logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1563                   ") = " + index);
1564         } else {
1565             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1566                   ") = " + index);
1567         }
1568     }
1569 }
1570
1571 // Used by TestSymbolTable
1572 class TokenSymbolTable : public SymbolTable {
1573 public:
1574     Hashtable contents;
1575
1576     TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1577         contents.setValueDeleter(uprv_deleteUObject);
1578     }
1579
1580     ~TokenSymbolTable() {}
1581
1582     /**
1583      * (Non-SymbolTable API) Add the given variable and value to
1584      * the table.  Variable should NOT contain leading '$'.
1585      */
1586     void add(const UnicodeString& var, const UnicodeString& value,
1587              UErrorCode& ec) {
1588         if (U_SUCCESS(ec)) {
1589             contents.put(var, new UnicodeString(value), ec);
1590         }
1591     }
1592
1593     /**
1594      * SymbolTable API
1595      */
1596     virtual const UnicodeString* lookup(const UnicodeString& s) const {
1597         return (const UnicodeString*) contents.get(s);
1598     }
1599
1600     /**
1601      * SymbolTable API
1602      */
1603     virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1604         return NULL;
1605     }
1606
1607     /**
1608      * SymbolTable API
1609      */
1610     virtual UnicodeString parseReference(const UnicodeString& text,
1611                                          ParsePosition& pos, int32_t limit) const {
1612         int32_t start = pos.getIndex();
1613         int32_t i = start;
1614         UnicodeString result;
1615         while (i < limit) {
1616             UChar c = text.charAt(i);
1617             if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1618                 break;
1619             }
1620             ++i;
1621         }
1622         if (i == start) { // No valid name chars
1623             return result; // Indicate failure with empty string
1624         }
1625         pos.setIndex(i);
1626         text.extractBetween(start, i, result);
1627         return result;
1628     }
1629 };
1630
1631 void UnicodeSetTest::TestSymbolTable() {
1632     // Multiple test cases can be set up here.  Each test case
1633     // is terminated by null:
1634     // var, value, var, value,..., input pat., exp. output pat., null
1635     const char* DATA[] = {
1636         "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1637         "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1638         "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1639         NULL
1640     };
1641
1642     for (int32_t i=0; DATA[i]!=NULL; ++i) {
1643         UErrorCode ec = U_ZERO_ERROR;
1644         TokenSymbolTable sym(ec);
1645         if (U_FAILURE(ec)) {
1646             errln("FAIL: couldn't construct TokenSymbolTable");
1647             continue;
1648         }
1649
1650         // Set up variables
1651         while (DATA[i+2] != NULL) {
1652             sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
1653             if (U_FAILURE(ec)) {
1654                 errln("FAIL: couldn't add to TokenSymbolTable");
1655                 continue;
1656             }
1657             i += 2;
1658         }
1659
1660         // Input pattern and expected output pattern
1661         UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
1662         i += 2;
1663
1664         ParsePosition pos(0);
1665         UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1666         if (U_FAILURE(ec)) {
1667             errln("FAIL: couldn't construct UnicodeSet");
1668             continue;
1669         }
1670
1671         // results
1672         if (pos.getIndex() != inpat.length()) {
1673             errln((UnicodeString)"Failed to read to end of string \""
1674                   + inpat + "\": read to "
1675                   + pos.getIndex() + ", length is "
1676                   + inpat.length());
1677         }
1678
1679         UnicodeSet us2(exppat, ec);
1680         if (U_FAILURE(ec)) {
1681             errln("FAIL: couldn't construct expected UnicodeSet");
1682             continue;
1683         }
1684
1685         UnicodeString a, b;
1686         if (us != us2) {
1687             errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1688                   ", expected " + us2.toPattern(b, TRUE));
1689         } else {
1690             logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1691         }
1692     }
1693 }
1694
1695 void UnicodeSetTest::TestSurrogate() {
1696     const char* DATA[] = {
1697         // These should all behave identically
1698         "[abc\\uD800\\uDC00]",
1699         // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1700         "[abc\\U00010000]",
1701         0
1702     };
1703     for (int i=0; DATA[i] != 0; ++i) {
1704         UErrorCode ec = U_ZERO_ERROR;
1705         logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
1706         UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
1707         UnicodeSet set(str, ec);
1708         if (U_FAILURE(ec)) {
1709             errln("FAIL: UnicodeSet constructor");
1710             continue;
1711         }
1712         expectContainment(set,
1713                           CharsToUnicodeString("abc\\U00010000"),
1714                           CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1715         if (set.size() != 4) {
1716             errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
1717                   set.size() + ", expected 4");
1718         }
1719
1720         {
1721           UErrorCode subErr = U_ZERO_ERROR;
1722           checkRoundTrip(set);
1723           checkSerializeRoundTrip(set, subErr);
1724         }
1725     }
1726 }
1727
1728 void UnicodeSetTest::TestExhaustive() {
1729     // exhaustive tests. Simulate UnicodeSets with integers.
1730     // That gives us very solid tests (except for large memory tests).
1731
1732     int32_t limit = 128;
1733
1734     UnicodeSet x, y, z, aa;
1735
1736     for (int32_t i = 0; i < limit; ++i) {
1737         bitsToSet(i, x);
1738         logln((UnicodeString)"Testing " + i + ", " + x);
1739         _testComplement(i, x, y);
1740
1741         UnicodeSet &toTest = bitsToSet(i, aa);
1742
1743         // AS LONG AS WE ARE HERE, check roundtrip
1744         checkRoundTrip(toTest);
1745         UErrorCode ec = U_ZERO_ERROR;
1746         checkSerializeRoundTrip(toTest, ec);
1747
1748         for (int32_t j = 0; j < limit; ++j) {
1749             _testAdd(i,j,  x,y,z);
1750             _testXor(i,j,  x,y,z);
1751             _testRetain(i,j,  x,y,z);
1752             _testRemove(i,j,  x,y,z);
1753         }
1754     }
1755 }
1756
1757 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1758     bitsToSet(a, x);
1759     z = x;
1760     z.complement();
1761     int32_t c = setToBits(z);
1762     if (c != (~a)) {
1763         errln((UnicodeString)"FAILED: add: ~" + x +  " != " + z);
1764         errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1765     }
1766     checkCanonicalRep(z, (UnicodeString)"complement " + a);
1767 }
1768
1769 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1770     bitsToSet(a, x);
1771     bitsToSet(b, y);
1772     z = x;
1773     z.addAll(y);
1774     int32_t c = setToBits(z);
1775     if (c != (a | b)) {
1776         errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1777         errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1778     }
1779     checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1780 }
1781
1782 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1783     bitsToSet(a, x);
1784     bitsToSet(b, y);
1785     z = x;
1786     z.retainAll(y);
1787     int32_t c = setToBits(z);
1788     if (c != (a & b)) {
1789         errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1790         errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1791     }
1792     checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1793 }
1794
1795 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1796     bitsToSet(a, x);
1797     bitsToSet(b, y);
1798     z = x;
1799     z.removeAll(y);
1800     int32_t c = setToBits(z);
1801     if (c != (a &~ b)) {
1802         errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1803         errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1804     }
1805     checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1806 }
1807
1808 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1809     bitsToSet(a, x);
1810     bitsToSet(b, y);
1811     z = x;
1812     z.complementAll(y);
1813     int32_t c = setToBits(z);
1814     if (c != (a ^ b)) {
1815         errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1816         errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1817     }
1818     checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1819 }
1820
1821 /**
1822  * Check that ranges are monotonically increasing and non-
1823  * overlapping.
1824  */
1825 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1826     int32_t n = set.getRangeCount();
1827     if (n < 0) {
1828         errln((UnicodeString)"FAIL result of " + msg +
1829               ": range count should be >= 0 but is " +
1830               n /*+ " for " + set.toPattern())*/);
1831         return;
1832     }
1833     UChar32 last = 0;
1834     for (int32_t i=0; i<n; ++i) {
1835         UChar32 start = set.getRangeStart(i);
1836         UChar32 end = set.getRangeEnd(i);
1837         if (start > end) {
1838             errln((UnicodeString)"FAIL result of " + msg +
1839                   ": range " + (i+1) +
1840                   " start > end: " + (int)start + ", " + (int)end +
1841                   " for " + set);
1842         }
1843         if (i > 0 && start <= last) {
1844             errln((UnicodeString)"FAIL result of " + msg +
1845                   ": range " + (i+1) +
1846                   " overlaps previous range: " + (int)start + ", " + (int)end +
1847                   " for " + set);
1848         }
1849         last = end;
1850     }
1851 }
1852
1853 /**
1854  * Convert a bitmask to a UnicodeSet.
1855  */
1856 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1857     result.clear();
1858     for (UChar32 i = 0; i < 32; ++i) {
1859         if ((a & (1<<i)) != 0) {
1860             result.add(i);
1861         }
1862     }
1863     return result;
1864 }
1865
1866 /**
1867  * Convert a UnicodeSet to a bitmask.  Only the characters
1868  * U+0000 to U+0020 are represented in the bitmask.
1869  */
1870 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1871     int32_t result = 0;
1872     for (int32_t i = 0; i < 32; ++i) {
1873         if (x.contains((UChar32)i)) {
1874             result |= (1<<i);
1875         }
1876     }
1877     return result;
1878 }
1879
1880 /**
1881  * Return the representation of an inversion list based UnicodeSet
1882  * as a pairs list.  Ranges are listed in ascending Unicode order.
1883  * For example, the set [a-zA-M3] is represented as "33AMaz".
1884  */
1885 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1886     UnicodeString pairs;
1887     for (int32_t i=0; i<set.getRangeCount(); ++i) {
1888         UChar32 start = set.getRangeStart(i);
1889         UChar32 end = set.getRangeEnd(i);
1890         if (end > 0xFFFF) {
1891             end = 0xFFFF;
1892             i = set.getRangeCount(); // Should be unnecessary
1893         }
1894         pairs.append((UChar)start).append((UChar)end);
1895     }
1896     return pairs;
1897 }
1898
1899 /**
1900  * Basic consistency check for a few items.
1901  * That the iterator works, and that we can create a pattern and
1902  * get the same thing back
1903  */
1904 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1905     {
1906         UnicodeSet t(s);
1907         checkEqual(s, t, "copy ct");
1908     }
1909
1910     {
1911         UnicodeSet t(0xabcd, 0xdef0);  // dummy contents should be overwritten
1912         t = s;
1913         checkEqual(s, t, "operator=");
1914     }
1915
1916     {
1917         UnicodeSet t;
1918         copyWithIterator(t, s, FALSE);
1919         checkEqual(s, t, "iterator roundtrip");
1920     }
1921
1922     {
1923         UnicodeSet t;
1924         copyWithIterator(t, s, TRUE); // try range
1925         checkEqual(s, t, "iterator roundtrip");
1926     }
1927
1928     {
1929         UnicodeSet t;
1930         UnicodeString pat;
1931         UErrorCode ec = U_ZERO_ERROR;
1932         s.toPattern(pat, FALSE);
1933         t.applyPattern(pat, ec);
1934         if (U_FAILURE(ec)) {
1935             errln("FAIL: toPattern(escapeUnprintable=FALSE), applyPattern - %s", u_errorName(ec));
1936             return;
1937         } else {
1938             checkEqual(s, t, "toPattern(false)");
1939         }
1940     }
1941
1942     {
1943         UnicodeSet t;
1944         UnicodeString pat;
1945         UErrorCode ec = U_ZERO_ERROR;
1946         s.toPattern(pat, TRUE);
1947         t.applyPattern(pat, ec);
1948         if (U_FAILURE(ec)) {
1949             errln("FAIL: toPattern(escapeUnprintable=TRUE), applyPattern - %s", u_errorName(ec));
1950             return;
1951         } else {
1952             checkEqual(s, t, "toPattern(true)");
1953         }
1954     }
1955 }
1956
1957 void UnicodeSetTest::checkSerializeRoundTrip(const UnicodeSet& t, UErrorCode &status) {
1958   if(U_FAILURE(status)) return;
1959   int32_t len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
1960   if(status == U_BUFFER_OVERFLOW_ERROR) {
1961     status = U_ZERO_ERROR;
1962     serializeBuffer.resize(len);
1963     len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
1964     // let 2nd error stand
1965   }
1966   if(U_FAILURE(status)) {
1967     errln("checkSerializeRoundTrip: error %s serializing buffer\n", u_errorName(status));
1968     return;
1969   }
1970   UnicodeSet deserialized(serializeBuffer.getAlias(), len, UnicodeSet::kSerialized, status);
1971   if(U_FAILURE(status)) {
1972     errln("checkSerializeRoundTrip: error %s deserializing buffer: buf %p len %d, original %d\n", u_errorName(status), serializeBuffer.getAlias(), len, t.getRangeCount());
1973     return;
1974   }
1975
1976   checkEqual(t, deserialized, "Set was unequal when deserialized");
1977 }
1978
1979 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
1980     t.clear();
1981     UnicodeSetIterator it(s);
1982     if (withRange) {
1983         while (it.nextRange()) {
1984             if (it.isString()) {
1985                 t.add(it.getString());
1986             } else {
1987                 t.add(it.getCodepoint(), it.getCodepointEnd());
1988             }
1989         }
1990     } else {
1991         while (it.next()) {
1992             if (it.isString()) {
1993                 t.add(it.getString());
1994             } else {
1995                 t.add(it.getCodepoint());
1996             }
1997         }
1998     }
1999 }
2000
2001 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
2002   assertEquals(UnicodeString("RangeCount: ","") + message, s.getRangeCount(), t.getRangeCount());
2003   assertEquals(UnicodeString("size: ","") + message, s.size(), t.size());
2004     UnicodeString source; s.toPattern(source, TRUE);
2005     UnicodeString result; t.toPattern(result, TRUE);
2006     if (s != t) {
2007         errln((UnicodeString)"FAIL: " + message
2008               + "; source = " + source
2009               + "; result = " + result
2010               );
2011         return FALSE;
2012     } else {
2013         logln((UnicodeString)"Ok: " + message
2014               + "; source = " + source
2015               + "; result = " + result
2016               );
2017     }
2018     return TRUE;
2019 }
2020
2021 void
2022 UnicodeSetTest::expectContainment(const UnicodeString& pat,
2023                                   const UnicodeString& charsIn,
2024                                   const UnicodeString& charsOut) {
2025     UErrorCode ec = U_ZERO_ERROR;
2026     UnicodeSet set(pat, ec);
2027     if (U_FAILURE(ec)) {
2028         dataerrln((UnicodeString)"FAIL: pattern \"" +
2029               pat + "\" => " + u_errorName(ec));
2030         return;
2031     }
2032     expectContainment(set, pat, charsIn, charsOut);
2033 }
2034
2035 void
2036 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2037                                   const UnicodeString& charsIn,
2038                                   const UnicodeString& charsOut) {
2039     UnicodeString pat;
2040     set.toPattern(pat);
2041     expectContainment(set, pat, charsIn, charsOut);
2042 }
2043
2044 void
2045 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2046                                   const UnicodeString& setName,
2047                                   const UnicodeString& charsIn,
2048                                   const UnicodeString& charsOut) {
2049     UnicodeString bad;
2050     UChar32 c;
2051     int32_t i;
2052
2053     for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
2054         c = charsIn.char32At(i);
2055         if (!set.contains(c)) {
2056             bad.append(c);
2057         }
2058     }
2059     if (bad.length() > 0) {
2060         errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
2061               ", expected containment of " + prettify(charsIn));
2062     } else {
2063         logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
2064     }
2065
2066     bad.truncate(0);
2067     for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
2068         c = charsOut.char32At(i);
2069         if (set.contains(c)) {
2070             bad.append(c);
2071         }
2072     }
2073     if (bad.length() > 0) {
2074         errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
2075               ", expected non-containment of " + prettify(charsOut));
2076     } else {
2077         logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
2078     }
2079 }
2080
2081 void
2082 UnicodeSetTest::expectPattern(UnicodeSet& set,
2083                               const UnicodeString& pattern,
2084                               const UnicodeString& expectedPairs){
2085     UErrorCode status = U_ZERO_ERROR;
2086     set.applyPattern(pattern, status);
2087     if (U_FAILURE(status)) {
2088         errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2089               "\") failed");
2090         return;
2091     } else {
2092         if (getPairs(set) != expectedPairs ) {
2093             errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2094                   "\") => pairs \"" +
2095                   escape(getPairs(set)) + "\", expected \"" +
2096                   escape(expectedPairs) + "\"");
2097         } else {
2098             logln(UnicodeString("Ok:   applyPattern(\"") + pattern +
2099                   "\") => pairs \"" +
2100                   escape(getPairs(set)) + "\"");
2101         }
2102     }
2103     // the result of calling set.toPattern(), which is the string representation of
2104     // this set(set), is passed to a  UnicodeSet constructor, and tested that it
2105     // will produce another set that is equal to this one.
2106     UnicodeString temppattern;
2107     set.toPattern(temppattern);
2108     UnicodeSet *tempset=new UnicodeSet(temppattern, status);
2109     if (U_FAILURE(status)) {
2110         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
2111         return;
2112     }
2113     if(*tempset != set || getPairs(*tempset) != getPairs(set)){
2114         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
2115             escape(getPairs(set)) + "\""));
2116     } else{
2117         logln(UnicodeString("Ok:   applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2118     }
2119
2120     delete tempset;
2121
2122 }
2123
2124 void
2125 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2126     if (getPairs(set) != expectedPairs) {
2127         errln(UnicodeString("FAIL: Expected pair list \"") +
2128               escape(expectedPairs) + "\", got \"" +
2129               escape(getPairs(set)) + "\"");
2130     }
2131 }
2132
2133 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2134                                      const UnicodeString& expPat,
2135                                      const char** expStrings) {
2136     UnicodeString pat;
2137     set.toPattern(pat, TRUE);
2138     if (pat == expPat) {
2139         logln((UnicodeString)"Ok:   toPattern() => \"" + pat + "\"");
2140     } else {
2141         errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2142         return;
2143     }
2144     if (expStrings == NULL) {
2145         return;
2146     }
2147     UBool in = TRUE;
2148     for (int32_t i=0; expStrings[i] != NULL; ++i) {
2149         if (expStrings[i] == NOT) { // sic; pointer comparison
2150             in = FALSE;
2151             continue;
2152         }
2153         UnicodeString s = CharsToUnicodeString(expStrings[i]);
2154         UBool contained = set.contains(s);
2155         if (contained == in) {
2156             logln((UnicodeString)"Ok: " + expPat +
2157                   (contained ? " contains {" : " does not contain {") +
2158                   escape(expStrings[i]) + "}");
2159         } else {
2160             errln((UnicodeString)"FAIL: " + expPat +
2161                   (contained ? " contains {" : " does not contain {") +
2162                   escape(expStrings[i]) + "}");
2163         }
2164     }
2165 }
2166
2167 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
2168
2169 void
2170 UnicodeSetTest::doAssert(UBool condition, const char *message)
2171 {
2172     if (!condition) {
2173         errln(UnicodeString("ERROR : ") + message);
2174     }
2175 }
2176
2177 UnicodeString
2178 UnicodeSetTest::escape(const UnicodeString& s) {
2179     UnicodeString buf;
2180     for (int32_t i=0; i<s.length(); )
2181     {
2182         UChar32 c = s.char32At(i);
2183         if (0x0020 <= c && c <= 0x007F) {
2184             buf += c;
2185         } else {
2186             if (c <= 0xFFFF) {
2187                 buf += (UChar)0x5c; buf += (UChar)0x75;
2188             } else {
2189                 buf += (UChar)0x5c; buf += (UChar)0x55;
2190                 buf += toHexString((c & 0xF0000000) >> 28);
2191                 buf += toHexString((c & 0x0F000000) >> 24);
2192                 buf += toHexString((c & 0x00F00000) >> 20);
2193                 buf += toHexString((c & 0x000F0000) >> 16);
2194             }
2195             buf += toHexString((c & 0xF000) >> 12);
2196             buf += toHexString((c & 0x0F00) >> 8);
2197             buf += toHexString((c & 0x00F0) >> 4);
2198             buf += toHexString(c & 0x000F);
2199         }
2200         i += U16_LENGTH(c);
2201     }
2202     return buf;
2203 }
2204
2205 void UnicodeSetTest::TestFreezable() {
2206     UErrorCode errorCode=U_ZERO_ERROR;
2207     UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
2208     UnicodeSet idSet(idPattern, errorCode);
2209     if(U_FAILURE(errorCode)) {
2210         dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
2211         return;
2212     }
2213
2214     UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
2215     UnicodeSet wsSet(wsPattern, errorCode);
2216     if(U_FAILURE(errorCode)) {
2217         dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
2218         return;
2219     }
2220
2221     idSet.add(idPattern);
2222     UnicodeSet frozen(idSet);
2223     frozen.freeze();
2224
2225     if(idSet.isFrozen() || !frozen.isFrozen()) {
2226         errln("FAIL: isFrozen() is wrong");
2227     }
2228     if(frozen!=idSet || !(frozen==idSet)) {
2229         errln("FAIL: a copy-constructed frozen set differs from its original");
2230     }
2231
2232     frozen=wsSet;
2233     if(frozen!=idSet || !(frozen==idSet)) {
2234         errln("FAIL: a frozen set was modified by operator=");
2235     }
2236
2237     UnicodeSet frozen2(frozen);
2238     if(frozen2!=frozen || frozen2!=idSet) {
2239         errln("FAIL: a copied frozen set differs from its frozen original");
2240     }
2241     if(!frozen2.isFrozen()) {
2242         errln("FAIL: copy-constructing a frozen set results in a thawed one");
2243     }
2244     UnicodeSet frozen3(5, 55);  // Set to some values to really test assignment below, not copy construction.
2245     if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
2246         errln("FAIL: UnicodeSet(5, 55) failed");
2247     }
2248     frozen3=frozen;
2249     if(!frozen3.isFrozen()) {
2250         errln("FAIL: copying a frozen set results in a thawed one");
2251     }
2252
2253     UnicodeSet *cloned=(UnicodeSet *)frozen.clone();
2254     if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
2255         errln("FAIL: clone() failed");
2256     }
2257     cloned->add(0xd802, 0xd805);
2258     if(cloned->containsSome(0xd802, 0xd805)) {
2259         errln("FAIL: unable to modify clone");
2260     }
2261     delete cloned;
2262
2263     UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed();
2264     if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
2265         errln("FAIL: cloneAsThawed() failed");
2266     }
2267     thawed->add(0xd802, 0xd805);
2268     if(!thawed->contains(0xd802, 0xd805)) {
2269         errln("FAIL: unable to modify thawed clone");
2270     }
2271     delete thawed;
2272
2273     frozen.set(5, 55);
2274     if(frozen!=idSet || !(frozen==idSet)) {
2275         errln("FAIL: UnicodeSet::set() modified a frozen set");
2276     }
2277
2278     frozen.clear();
2279     if(frozen!=idSet || !(frozen==idSet)) {
2280         errln("FAIL: UnicodeSet::clear() modified a frozen set");
2281     }
2282
2283     frozen.closeOver(USET_CASE_INSENSITIVE);
2284     if(frozen!=idSet || !(frozen==idSet)) {
2285         errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
2286     }
2287
2288     frozen.compact();
2289     if(frozen!=idSet || !(frozen==idSet)) {
2290         errln("FAIL: UnicodeSet::compact() modified a frozen set");
2291     }
2292
2293     ParsePosition pos;
2294     frozen.
2295         applyPattern(wsPattern, errorCode).
2296         applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
2297         applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
2298         applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
2299         applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
2300     if(frozen!=idSet || !(frozen==idSet)) {
2301         errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
2302     }
2303
2304     frozen.
2305         add(0xd800).
2306         add(0xd802, 0xd805).
2307         add(wsPattern).
2308         addAll(idPattern).
2309         addAll(wsSet);
2310     if(frozen!=idSet || !(frozen==idSet)) {
2311         errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
2312     }
2313
2314     frozen.
2315         retain(0x62).
2316         retain(0x64, 0x69).
2317         retainAll(wsPattern).
2318         retainAll(wsSet);
2319     if(frozen!=idSet || !(frozen==idSet)) {
2320         errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
2321     }
2322
2323     frozen.
2324         remove(0x62).
2325         remove(0x64, 0x69).
2326         remove(idPattern).
2327         removeAll(idPattern).
2328         removeAll(idSet);
2329     if(frozen!=idSet || !(frozen==idSet)) {
2330         errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
2331     }
2332
2333     frozen.
2334         complement().
2335         complement(0x62).
2336         complement(0x64, 0x69).
2337         complement(idPattern).
2338         complementAll(idPattern).
2339         complementAll(idSet);
2340     if(frozen!=idSet || !(frozen==idSet)) {
2341         errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
2342     }
2343 }
2344
2345 // Test span() etc. -------------------------------------------------------- ***
2346
2347 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2348 static int32_t
2349 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
2350     UErrorCode errorCode=U_ZERO_ERROR;
2351     int32_t length8=0;
2352     u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2353     if(U_SUCCESS(errorCode)) {
2354         return length8;
2355     } else {
2356         // The string contains an unpaired surrogate.
2357         // Ignore this string.
2358         return 0;
2359     }
2360 }
2361
2362 class UnicodeSetWithStringsIterator;
2363
2364 // Make the strings in a UnicodeSet easily accessible.
2365 class UnicodeSetWithStrings {
2366 public:
2367     UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2368             set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
2369         int32_t size=set.size();
2370         if(size>0 && set.charAt(size-1)<0) {
2371             // If a set's last element is not a code point, then it must contain strings.
2372             // Iterate over the set, skip all code point ranges, and cache the strings.
2373             // Convert them to UTF-8 for spanUTF8().
2374             UnicodeSetIterator iter(set);
2375             const UnicodeString *s;
2376             char *s8=utf8;
2377             int32_t length8, utf8Count=0;
2378             while(iter.nextRange() && stringsLength<UPRV_LENGTHOF(strings)) {
2379                 if(iter.isString()) {
2380                     // Store the pointer to the set's string element
2381                     // which we happen to know is a stable pointer.
2382                     strings[stringsLength]=s=&iter.getString();
2383                     utf8Count+=
2384                         utf8Lengths[stringsLength]=length8=
2385                         appendUTF8(s->getBuffer(), s->length(),
2386                                    s8, (int32_t)(sizeof(utf8)-utf8Count));
2387                     if(length8==0) {
2388                         hasSurrogates=TRUE;  // Contains unpaired surrogates.
2389                     }
2390                     s8+=length8;
2391                     ++stringsLength;
2392                 }
2393             }
2394         }
2395     }
2396
2397     const UnicodeSet &getSet() const {
2398         return set;
2399     }
2400
2401     UBool hasStrings() const {
2402         return (UBool)(stringsLength>0);
2403     }
2404
2405     UBool hasStringsWithSurrogates() const {
2406         return hasSurrogates;
2407     }
2408
2409 private:
2410     friend class UnicodeSetWithStringsIterator;
2411
2412     const UnicodeSet &set;
2413
2414     const UnicodeString *strings[20];
2415     int32_t stringsLength;
2416     UBool hasSurrogates;
2417
2418     char utf8[1024];
2419     int32_t utf8Lengths[20];
2420 };
2421
2422 class UnicodeSetWithStringsIterator {
2423 public:
2424     UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2425             fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2426     }
2427
2428     void reset() {
2429         nextStringIndex=nextUTF8Start=0;
2430     }
2431
2432     const UnicodeString *nextString() {
2433         if(nextStringIndex<fSet.stringsLength) {
2434             return fSet.strings[nextStringIndex++];
2435         } else {
2436             return NULL;
2437         }
2438     }
2439
2440     // Do not mix with calls to nextString().
2441     const char *nextUTF8(int32_t &length) {
2442         if(nextStringIndex<fSet.stringsLength) {
2443             const char *s8=fSet.utf8+nextUTF8Start;
2444             nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2445             return s8;
2446         } else {
2447             length=0;
2448             return NULL;
2449         }
2450     }
2451
2452 private:
2453     const UnicodeSetWithStrings &fSet;
2454     int32_t nextStringIndex;
2455     int32_t nextUTF8Start;
2456 };
2457
2458 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
2459 // at code point boundaries.
2460 // That is, each edge of a match must not be in the middle of a surrogate pair.
2461 static inline UBool
2462 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
2463     s+=start;
2464     limit-=start;
2465     int32_t length=t.length();
2466     return 0==t.compare(s, length) &&
2467            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2468            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2469 }
2470
2471 // Implement span() with contains() for comparison.
2472 static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2473                                  USetSpanCondition spanCondition) {
2474     const UnicodeSet &realSet(set.getSet());
2475     if(!set.hasStrings()) {
2476         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2477             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2478         }
2479
2480         UChar32 c;
2481         int32_t start=0, prev;
2482         while((prev=start)<length) {
2483             U16_NEXT(s, start, length, c);
2484             if(realSet.contains(c)!=spanCondition) {
2485                 break;
2486             }
2487         }
2488         return prev;
2489     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2490         UnicodeSetWithStringsIterator iter(set);
2491         UChar32 c;
2492         int32_t start, next;
2493         for(start=next=0; start<length;) {
2494             U16_NEXT(s, next, length, c);
2495             if(realSet.contains(c)) {
2496                 break;
2497             }
2498             const UnicodeString *str;
2499             iter.reset();
2500             while((str=iter.nextString())!=NULL) {
2501                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2502                     // spanNeedsStrings=TRUE;
2503                     return start;
2504                 }
2505             }
2506             start=next;
2507         }
2508         return start;
2509     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2510         UnicodeSetWithStringsIterator iter(set);
2511         UChar32 c;
2512         int32_t start, next, maxSpanLimit=0;
2513         for(start=next=0; start<length;) {
2514             U16_NEXT(s, next, length, c);
2515             if(!realSet.contains(c)) {
2516                 next=start;  // Do not span this single, not-contained code point.
2517             }
2518             const UnicodeString *str;
2519             iter.reset();
2520             while((str=iter.nextString())!=NULL) {
2521                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2522                     // spanNeedsStrings=TRUE;
2523                     int32_t matchLimit=start+str->length();
2524                     if(matchLimit==length) {
2525                         return length;
2526                     }
2527                     if(spanCondition==USET_SPAN_CONTAINED) {
2528                         // Iterate for the shortest match at each position.
2529                         // Recurse for each but the shortest match.
2530                         if(next==start) {
2531                             next=matchLimit;  // First match from start.
2532                         } else {
2533                             if(matchLimit<next) {
2534                                 // Remember shortest match from start for iteration.
2535                                 int32_t temp=next;
2536                                 next=matchLimit;
2537                                 matchLimit=temp;
2538                             }
2539                             // Recurse for non-shortest match from start.
2540                             int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
2541                                                                  USET_SPAN_CONTAINED);
2542                             if((matchLimit+spanLength)>maxSpanLimit) {
2543                                 maxSpanLimit=matchLimit+spanLength;
2544                                 if(maxSpanLimit==length) {
2545                                     return length;
2546                                 }
2547                             }
2548                         }
2549                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2550                         if(matchLimit>next) {
2551                             // Remember longest match from start.
2552                             next=matchLimit;
2553                         }
2554                     }
2555                 }
2556             }
2557             if(next==start) {
2558                 break;  // No match from start.
2559             }
2560             start=next;
2561         }
2562         if(start>maxSpanLimit) {
2563             return start;
2564         } else {
2565             return maxSpanLimit;
2566         }
2567     }
2568 }
2569
2570 static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2571                                      USetSpanCondition spanCondition) {
2572     if(length==0) {
2573         return 0;
2574     }
2575     const UnicodeSet &realSet(set.getSet());
2576     if(!set.hasStrings()) {
2577         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2578             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2579         }
2580
2581         UChar32 c;
2582         int32_t prev=length;
2583         do {
2584             U16_PREV(s, 0, length, c);
2585             if(realSet.contains(c)!=spanCondition) {
2586                 break;
2587             }
2588         } while((prev=length)>0);
2589         return prev;
2590     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2591         UnicodeSetWithStringsIterator iter(set);
2592         UChar32 c;
2593         int32_t prev=length, length0=length;
2594         do {
2595             U16_PREV(s, 0, length, c);
2596             if(realSet.contains(c)) {
2597                 break;
2598             }
2599             const UnicodeString *str;
2600             iter.reset();
2601             while((str=iter.nextString())!=NULL) {
2602                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2603                     // spanNeedsStrings=TRUE;
2604                     return prev;
2605                 }
2606             }
2607         } while((prev=length)>0);
2608         return prev;
2609     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2610         UnicodeSetWithStringsIterator iter(set);
2611         UChar32 c;
2612         int32_t prev=length, minSpanStart=length, length0=length;
2613         do {
2614             U16_PREV(s, 0, length, c);
2615             if(!realSet.contains(c)) {
2616                 length=prev;  // Do not span this single, not-contained code point.
2617             }
2618             const UnicodeString *str;
2619             iter.reset();
2620             while((str=iter.nextString())!=NULL) {
2621                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2622                     // spanNeedsStrings=TRUE;
2623                     int32_t matchStart=prev-str->length();
2624                     if(matchStart==0) {
2625                         return 0;
2626                     }
2627                     if(spanCondition==USET_SPAN_CONTAINED) {
2628                         // Iterate for the shortest match at each position.
2629                         // Recurse for each but the shortest match.
2630                         if(length==prev) {
2631                             length=matchStart;  // First match from prev.
2632                         } else {
2633                             if(matchStart>length) {
2634                                 // Remember shortest match from prev for iteration.
2635                                 int32_t temp=length;
2636                                 length=matchStart;
2637                                 matchStart=temp;
2638                             }
2639                             // Recurse for non-shortest match from prev.
2640                             int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
2641                                                                     USET_SPAN_CONTAINED);
2642                             if(spanStart<minSpanStart) {
2643                                 minSpanStart=spanStart;
2644                                 if(minSpanStart==0) {
2645                                     return 0;
2646                                 }
2647                             }
2648                         }
2649                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2650                         if(matchStart<length) {
2651                             // Remember longest match from prev.
2652                             length=matchStart;
2653                         }
2654                     }
2655                 }
2656             }
2657             if(length==prev) {
2658                 break;  // No match from prev.
2659             }
2660         } while((prev=length)>0);
2661         if(prev<minSpanStart) {
2662             return prev;
2663         } else {
2664             return minSpanStart;
2665         }
2666     }
2667 }
2668
2669 static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2670                                 USetSpanCondition spanCondition) {
2671     const UnicodeSet &realSet(set.getSet());
2672     if(!set.hasStrings()) {
2673         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2674             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2675         }
2676
2677         UChar32 c;
2678         int32_t start=0, prev;
2679         while((prev=start)<length) {
2680             U8_NEXT_OR_FFFD(s, start, length, c);
2681             if(realSet.contains(c)!=spanCondition) {
2682                 break;
2683             }
2684         }
2685         return prev;
2686     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2687         UnicodeSetWithStringsIterator iter(set);
2688         UChar32 c;
2689         int32_t start, next;
2690         for(start=next=0; start<length;) {
2691             U8_NEXT_OR_FFFD(s, next, length, c);
2692             if(realSet.contains(c)) {
2693                 break;
2694             }
2695             const char *s8;
2696             int32_t length8;
2697             iter.reset();
2698             while((s8=iter.nextUTF8(length8))!=NULL) {
2699                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2700                     // spanNeedsStrings=TRUE;
2701                     return start;
2702                 }
2703             }
2704             start=next;
2705         }
2706         return start;
2707     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2708         UnicodeSetWithStringsIterator iter(set);
2709         UChar32 c;
2710         int32_t start, next, maxSpanLimit=0;
2711         for(start=next=0; start<length;) {
2712             U8_NEXT_OR_FFFD(s, next, length, c);
2713             if(!realSet.contains(c)) {
2714                 next=start;  // Do not span this single, not-contained code point.
2715             }
2716             const char *s8;
2717             int32_t length8;
2718             iter.reset();
2719             while((s8=iter.nextUTF8(length8))!=NULL) {
2720                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2721                     // spanNeedsStrings=TRUE;
2722                     int32_t matchLimit=start+length8;
2723                     if(matchLimit==length) {
2724                         return length;
2725                     }
2726                     if(spanCondition==USET_SPAN_CONTAINED) {
2727                         // Iterate for the shortest match at each position.
2728                         // Recurse for each but the shortest match.
2729                         if(next==start) {
2730                             next=matchLimit;  // First match from start.
2731                         } else {
2732                             if(matchLimit<next) {
2733                                 // Remember shortest match from start for iteration.
2734                                 int32_t temp=next;
2735                                 next=matchLimit;
2736                                 matchLimit=temp;
2737                             }
2738                             // Recurse for non-shortest match from start.
2739                             int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
2740                                                                 USET_SPAN_CONTAINED);
2741                             if((matchLimit+spanLength)>maxSpanLimit) {
2742                                 maxSpanLimit=matchLimit+spanLength;
2743                                 if(maxSpanLimit==length) {
2744                                     return length;
2745                                 }
2746                             }
2747                         }
2748                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2749                         if(matchLimit>next) {
2750                             // Remember longest match from start.
2751                             next=matchLimit;
2752                         }
2753                     }
2754                 }
2755             }
2756             if(next==start) {
2757                 break;  // No match from start.
2758             }
2759             start=next;
2760         }
2761         if(start>maxSpanLimit) {
2762             return start;
2763         } else {
2764             return maxSpanLimit;
2765         }
2766     }
2767 }
2768
2769 static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2770                                     USetSpanCondition spanCondition) {
2771     if(length==0) {
2772         return 0;
2773     }
2774     const UnicodeSet &realSet(set.getSet());
2775     if(!set.hasStrings()) {
2776         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2777             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2778         }
2779
2780         UChar32 c;
2781         int32_t prev=length;
2782         do {
2783             U8_PREV_OR_FFFD(s, 0, length, c);
2784             if(realSet.contains(c)!=spanCondition) {
2785                 break;
2786             }
2787         } while((prev=length)>0);
2788         return prev;
2789     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2790         UnicodeSetWithStringsIterator iter(set);
2791         UChar32 c;
2792         int32_t prev=length;
2793         do {
2794             U8_PREV_OR_FFFD(s, 0, length, c);
2795             if(realSet.contains(c)) {
2796                 break;
2797             }
2798             const char *s8;
2799             int32_t length8;
2800             iter.reset();
2801             while((s8=iter.nextUTF8(length8))!=NULL) {
2802                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2803                     // spanNeedsStrings=TRUE;
2804                     return prev;
2805                 }
2806             }
2807         } while((prev=length)>0);
2808         return prev;
2809     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2810         UnicodeSetWithStringsIterator iter(set);
2811         UChar32 c;
2812         int32_t prev=length, minSpanStart=length;
2813         do {
2814             U8_PREV_OR_FFFD(s, 0, length, c);
2815             if(!realSet.contains(c)) {
2816                 length=prev;  // Do not span this single, not-contained code point.
2817             }
2818             const char *s8;
2819             int32_t length8;
2820             iter.reset();
2821             while((s8=iter.nextUTF8(length8))!=NULL) {
2822                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2823                     // spanNeedsStrings=TRUE;
2824                     int32_t matchStart=prev-length8;
2825                     if(matchStart==0) {
2826                         return 0;
2827                     }
2828                     if(spanCondition==USET_SPAN_CONTAINED) {
2829                         // Iterate for the shortest match at each position.
2830                         // Recurse for each but the shortest match.
2831                         if(length==prev) {
2832                             length=matchStart;  // First match from prev.
2833                         } else {
2834                             if(matchStart>length) {
2835                                 // Remember shortest match from prev for iteration.
2836                                 int32_t temp=length;
2837                                 length=matchStart;
2838                                 matchStart=temp;
2839                             }
2840                             // Recurse for non-shortest match from prev.
2841                             int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
2842                                                                    USET_SPAN_CONTAINED);
2843                             if(spanStart<minSpanStart) {
2844                                 minSpanStart=spanStart;
2845                                 if(minSpanStart==0) {
2846                                     return 0;
2847                                 }
2848                             }
2849                         }
2850                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2851                         if(matchStart<length) {
2852                             // Remember longest match from prev.
2853                             length=matchStart;
2854                         }
2855                     }
2856                 }
2857             }
2858             if(length==prev) {
2859                 break;  // No match from prev.
2860             }
2861         } while((prev=length)>0);
2862         if(prev<minSpanStart) {
2863             return prev;
2864         } else {
2865             return minSpanStart;
2866         }
2867     }
2868 }
2869
// Bit flags selecting which spans are performed and compared.
// Each group has its individual flags followed by a mask that combines them.
enum {
    // Text encodings.
    SPAN_UTF16          =1,
    SPAN_UTF8           =2,
    SPAN_UTFS           =SPAN_UTF16|SPAN_UTF8,

    // Original set and/or its complement.
    SPAN_SET            =4,
    SPAN_COMPLEMENT     =8,
    SPAN_POLARITY       =SPAN_SET|SPAN_COMPLEMENT,

    // Span directions.
    SPAN_FWD            =0x10,
    SPAN_BACK           =0x20,
    SPAN_DIRS           =SPAN_FWD|SPAN_BACK,

    // Flavor of the "contained" span condition.
    SPAN_CONTAINED      =0x100,
    SPAN_SIMPLE         =0x200,
    SPAN_CONDITION      =SPAN_CONTAINED|SPAN_SIMPLE,

    // Everything.
    SPAN_ALL            =SPAN_UTFS|SPAN_POLARITY|SPAN_DIRS|SPAN_CONDITION
};
2890
2891 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
2892     return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
2893 }
2894
2895 static inline int32_t slen(const void *s, UBool isUTF16) {
2896     return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
2897 }
2898
2899 /*
2900  * Count spans on a string with the method according to type and set the span limits.
2901  * The set may be the complement of the original.
2902  * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
2903  * according to the expected number of spans.
2904  * Sets typeName to an empty string if there is no such type.
2905  * Returns -1 if the span option is filtered out.
2906  */
2907 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
2908                         const void *s, int32_t length, UBool isUTF16,
2909                         uint32_t whichSpans,
2910                         int type, const char *&typeName,
2911                         int32_t limits[], int32_t limitsCapacity,
2912                         int32_t expectCount) {
2913     const UnicodeSet &realSet(set.getSet());
2914     int32_t start, count;
2915     USetSpanCondition spanCondition, firstSpanCondition, contained;
2916     UBool isForward;
2917
2918     if(type<0 || 7<type) {
2919         typeName="";
2920         return 0;
2921     }
2922
2923     static const char *const typeNames16[]={
2924         "contains", "contains(LM)",
2925         "span", "span(LM)",
2926         "containsBack", "containsBack(LM)",
2927         "spanBack", "spanBack(LM)"
2928     };
2929
2930     static const char *const typeNames8[]={
2931         "containsUTF8", "containsUTF8(LM)",
2932         "spanUTF8", "spanUTF8(LM)",
2933         "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
2934         "spanBackUTF8", "spanBackUTF8(LM)"
2935     };
2936
2937     typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
2938
2939     // filter span options
2940     if(type<=3) {
2941         // span forward
2942         if((whichSpans&SPAN_FWD)==0) {
2943             return -1;
2944         }
2945         isForward=TRUE;
2946     } else {
2947         // span backward
2948         if((whichSpans&SPAN_BACK)==0) {
2949             return -1;
2950         }
2951         isForward=FALSE;
2952     }
2953     if((type&1)==0) {
2954         // use USET_SPAN_CONTAINED
2955         if((whichSpans&SPAN_CONTAINED)==0) {
2956             return -1;
2957         }
2958         contained=USET_SPAN_CONTAINED;
2959     } else {
2960         // use USET_SPAN_SIMPLE
2961         if((whichSpans&SPAN_SIMPLE)==0) {
2962             return -1;
2963         }
2964         contained=USET_SPAN_SIMPLE;
2965     }
2966
2967     // Default first span condition for going forward with an uncomplemented set.
2968     spanCondition=USET_SPAN_NOT_CONTAINED;
2969     if(isComplement) {
2970         spanCondition=invertSpanCondition(spanCondition, contained);
2971     }
2972
2973     // First span condition for span(), used to terminate the spanBack() iteration.
2974     firstSpanCondition=spanCondition;
2975
2976     // spanBack(): Its initial span condition is span()'s last span condition,
2977     // which is the opposite of span()'s first span condition
2978     // if we expect an even number of spans.
2979     // (The loop inverts spanCondition (expectCount-1) times
2980     // before the expectCount'th span() call.)
2981     // If we do not compare forward and backward directions, then we do not have an
2982     // expectCount and just start with firstSpanCondition.
2983     if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
2984         spanCondition=invertSpanCondition(spanCondition, contained);
2985     }
2986
2987     count=0;
2988     switch(type) {
2989     case 0:
2990     case 1:
2991         start=0;
2992         if(length<0) {
2993             length=slen(s, isUTF16);
2994         }
2995         for(;;) {
2996             start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
2997                               containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
2998             if(count<limitsCapacity) {
2999                 limits[count]=start;
3000             }
3001             ++count;
3002             if(start>=length) {
3003                 break;
3004             }
3005             spanCondition=invertSpanCondition(spanCondition, contained);
3006         }
3007         break;
3008     case 2:
3009     case 3:
3010         start=0;
3011         for(;;) {
3012             start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
3013                               realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
3014             if(count<limitsCapacity) {
3015                 limits[count]=start;
3016             }
3017             ++count;
3018             if(length>=0 ? start>=length :
3019                            isUTF16 ? ((const UChar *)s)[start]==0 :
3020                                      ((const char *)s)[start]==0
3021             ) {
3022                 break;
3023             }
3024             spanCondition=invertSpanCondition(spanCondition, contained);
3025         }
3026         break;
3027     case 4:
3028     case 5:
3029         if(length<0) {
3030             length=slen(s, isUTF16);
3031         }
3032         for(;;) {
3033             ++count;
3034             if(count<=limitsCapacity) {
3035                 limits[limitsCapacity-count]=length;
3036             }
3037             length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
3038                               containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
3039             if(length==0 && spanCondition==firstSpanCondition) {
3040                 break;
3041             }
3042             spanCondition=invertSpanCondition(spanCondition, contained);
3043         }
3044         if(count<limitsCapacity) {
3045             memmove(limits, limits+(limitsCapacity-count), count*4);
3046         }
3047         break;
3048     case 6:
3049     case 7:
3050         for(;;) {
3051             ++count;
3052             if(count<=limitsCapacity) {
3053                 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
3054             }
3055             // Note: Length<0 is tested only for the first spanBack().
3056             // If we wanted to keep length<0 for all spanBack()s, we would have to
3057             // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
3058             length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
3059                               realSet.spanBackUTF8((const char *)s, length, spanCondition);
3060             if(length==0 && spanCondition==firstSpanCondition) {
3061                 break;
3062             }
3063             spanCondition=invertSpanCondition(spanCondition, contained);
3064         }
3065         if(count<limitsCapacity) {
3066             memmove(limits, limits+(limitsCapacity-count), count*4);
3067         }
3068         break;
3069     default:
3070         typeName="";
3071         return -1;
3072     }
3073
3074     return count;
3075 }
3076
// Indexes of the sets to be tested.
// An odd index means that the set is the complement (isComplement).
enum {
    SLOW        =0,
    SLOW_NOT    =1,  // complement of SLOW
    FAST        =2,
    FAST_NOT    =3,  // complement of FAST
    SET_COUNT   =4   // number of sets
};
3085
// Names of the tested sets for use in failure messages,
// indexed by the set constants SLOW..FAST_NOT.
static const char *const setNames[SET_COUNT]={
    "slow",      // SLOW
    "slow.not",  // SLOW_NOT
    "fast",      // FAST
    "fast.not"   // FAST_NOT
};
3092
3093 /*
3094  * Verify that we get the same results whether we look at text with contains(),
3095  * span() or spanBack(), using unfrozen or frozen versions of the set,
3096  * and using the set or its complement (switching the spanConditions accordingly).
3097  * The latter verifies that
3098  *   set.span(spanCondition) == set.complement().span(!spanCondition).
3099  *
3100  * The expectLimits[] are either provided by the caller (with expectCount>=0)
3101  * or returned to the caller (with an input expectCount<0).
3102  */
3103 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3104                               const void *s, int32_t length, UBool isUTF16,
3105                               uint32_t whichSpans,
3106                               int32_t expectLimits[], int32_t &expectCount,
3107                               const char *testName, int32_t index) {
3108     int32_t limits[500];
3109     int32_t limitsCount;
3110     int i, j;
3111
3112     const char *typeName;
3113     int type;
3114
3115     for(i=0; i<SET_COUNT; ++i) {
3116         if((i&1)==0) {
3117             // Even-numbered sets are original, uncomplemented sets.
3118             if((whichSpans&SPAN_SET)==0) {
3119                 continue;
3120             }
3121         } else {
3122             // Odd-numbered sets are complemented.
3123             if((whichSpans&SPAN_COMPLEMENT)==0) {
3124                 continue;
3125             }
3126         }
3127         for(type=0;; ++type) {
3128             limitsCount=getSpans(*sets[i], (UBool)(i&1),
3129                                  s, length, isUTF16,
3130                                  whichSpans,
3131                                  type, typeName,
3132                                  limits, UPRV_LENGTHOF(limits), expectCount);
3133             if(typeName[0]==0) {
3134                 break; // All types tried.
3135             }
3136             if(limitsCount<0) {
3137                 continue; // Span option filtered out.
3138             }
3139             if(expectCount<0) {
3140                 expectCount=limitsCount;
3141                 if(limitsCount>UPRV_LENGTHOF(limits)) {
3142                     errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3143                           testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)UPRV_LENGTHOF(limits));
3144                     return;
3145                 }
3146                 memcpy(expectLimits, limits, limitsCount*4);
3147             } else if(limitsCount!=expectCount) {
3148                 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3149                       testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
3150             } else {
3151                 for(j=0; j<limitsCount; ++j) {
3152                     if(limits[j]!=expectLimits[j]) {
3153                         errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3154                               testName, (long)index, setNames[i], typeName, (long)limitsCount,
3155                               j, (long)limits[j], (long)expectLimits[j]);
3156                         break;
3157                     }
3158                 }
3159             }
3160         }
3161     }
3162
3163     // Compare span() with containsAll()/containsNone(),
3164     // but only if we have expectLimits[] from the uncomplemented set.
3165     if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
3166         const UChar *s16=(const UChar *)s;
3167         UnicodeString string;
3168         int32_t prev=0, limit, length;
3169         for(i=0; i<expectCount; ++i) {
3170             limit=expectLimits[i];
3171             length=limit-prev;
3172             if(length>0) {
3173                 string.setTo(FALSE, s16+prev, length);  // read-only alias
3174                 if(i&1) {
3175                     if(!sets[SLOW]->getSet().containsAll(string)) {
3176                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3177                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3178                         return;
3179                     }
3180                     if(!sets[FAST]->getSet().containsAll(string)) {
3181                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3182                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3183                         return;
3184                     }
3185                 } else {
3186                     if(!sets[SLOW]->getSet().containsNone(string)) {
3187                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3188                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3189                         return;
3190                     }
3191                     if(!sets[FAST]->getSet().containsNone(string)) {
3192                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3193                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3194                         return;
3195                     }
3196                 }
3197             }
3198             prev=limit;
3199         }
3200     }
3201 }
3202
3203 // Specifically test either UTF-16 or UTF-8.
3204 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3205                               const void *s, int32_t length, UBool isUTF16,
3206                               uint32_t whichSpans,
3207                               const char *testName, int32_t index) {
3208     int32_t expectLimits[500];
3209     int32_t expectCount=-1;
3210     testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3211 }
3212
3213 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
3214     UChar c, c2;
3215
3216     if(length>=0) {
3217         while(length>0) {
3218             c=*s++;
3219             --length;
3220             if(0xd800<=c && c<0xe000) {
3221                 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3222                     return TRUE;
3223                 }
3224                 --length;
3225             }
3226         }
3227     } else {
3228         while((c=*s++)!=0) {
3229             if(0xd800<=c && c<0xe000) {
3230                 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3231                     return TRUE;
3232                 }
3233             }
3234         }
3235     }
3236     return FALSE;
3237 }
3238
3239 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3240 // unless either UTF is turned off in whichSpans.
3241 // Testing UTF-16 and UTF-8 together requires that surrogate code points
3242 // have the same contains(c) value as U+FFFD.
3243 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3244                                       const UChar *s16, int32_t length16,
3245                                       uint32_t whichSpans,
3246                                       const char *testName, int32_t index) {
3247     int32_t expectLimits[500];
3248     int32_t expectCount;
3249
3250     expectCount=-1;  // Get expectLimits[] from testSpan().
3251
3252     if((whichSpans&SPAN_UTF16)!=0) {
3253         testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
3254     }
3255     if((whichSpans&SPAN_UTF8)==0) {
3256         return;
3257     }
3258
3259     // Convert s16[] and expectLimits[] to UTF-8.
3260     uint8_t s8[3000];
3261     int32_t offsets[3000];
3262
3263     const UChar *s16Limit=s16+length16;
3264     char *t=(char *)s8;
3265     char *tLimit=t+sizeof(s8);
3266     int32_t *o=offsets;
3267     UErrorCode errorCode=U_ZERO_ERROR;
3268
3269     // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3270     ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
3271     if(U_FAILURE(errorCode)) {
3272         errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3273               testName, (long)index, u_errorName(errorCode));
3274         ucnv_resetFromUnicode(utf8Cnv);
3275         return;
3276     }
3277     int32_t length8=(int32_t)(t-(char *)s8);
3278
3279     // Convert expectLimits[].
3280     int32_t i, j, expect;
3281     for(i=j=0; i<expectCount; ++i) {
3282         expect=expectLimits[i];
3283         if(expect==length16) {
3284             expectLimits[i]=length8;
3285         } else {
3286             while(offsets[j]<expect) {
3287                 ++j;
3288             }
3289             expectLimits[i]=j;
3290         }
3291     }
3292
3293     testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
3294 }
3295
3296 static UChar32 nextCodePoint(UChar32 c) {
3297     // Skip some large and boring ranges.
3298     switch(c) {
3299     case 0x3441:
3300         return 0x4d7f;
3301     case 0x5100:
3302         return 0x9f00;
3303     case 0xb040:
3304         return 0xd780;
3305     case 0xe041:
3306         return 0xf8fe;
3307     case 0x10100:
3308         return 0x20000;
3309     case 0x20041:
3310         return 0xe0000;
3311     case 0xe0101:
3312         return 0x10fffd;
3313     default:
3314         return c+1;
3315     }
3316 }
3317
3318 // Verify that all implementations represent the same set.
3319 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3320     // contains(U+FFFD) is inconsistent with contains(some surrogates),
3321     // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3322     // Skip the UTF-8 part of the test - if the string contains surrogates -
3323     // because it is likely to produce a different result.
3324     UBool inconsistentSurrogates=
3325             (!(sets[0]->getSet().contains(0xfffd) ?
3326                sets[0]->getSet().contains(0xd800, 0xdfff) :
3327                sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3328              sets[0]->hasStringsWithSurrogates());
3329
3330     UChar s[1000];
3331     int32_t length=0;
3332     uint32_t localWhichSpans;
3333
3334     UChar32 c, first;
3335     for(first=c=0;; c=nextCodePoint(c)) {
3336         if(c>0x10ffff || length>(UPRV_LENGTHOF(s)-U16_MAX_LENGTH)) {
3337             localWhichSpans=whichSpans;
3338             if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3339                 localWhichSpans&=~SPAN_UTF8;
3340             }
3341             testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3342             if(c>0x10ffff) {
3343                 break;
3344             }
3345             length=0;
3346             first=c;
3347         }
3348         U16_APPEND_UNSAFE(s, length, c);
3349     }
3350 }
3351
3352 // Test with a particular, interesting string.
3353 // Specify length and try NUL-termination.
3354 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3355     static const UChar s[]={
3356         0x61, 0x62, 0x20,                       // Latin, space
3357         0x3b1, 0x3b2, 0x3b3,                    // Greek
3358         0xd900,                                 // lead surrogate
3359         0x3000, 0x30ab, 0x30ad,                 // wide space, Katakana
3360         0xdc05,                                 // trail surrogate
3361         0xa0, 0xac00, 0xd7a3,                   // nbsp, Hangul
3362         0xd900, 0xdc05,                         // unassigned supplementary
3363         0xd840, 0xdfff, 0xd860, 0xdffe,         // Han supplementary
3364         0xd7a4, 0xdc05, 0xd900, 0x2028,         // unassigned, surrogates in wrong order, LS
3365         0                                       // NUL
3366     };
3367
3368     if((whichSpans&SPAN_UTF16)==0) {
3369         return;
3370     }
3371     testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
3372     testSpan(sets, s, UPRV_LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
3373 }
3374
3375 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3376     static const char s[]={
3377         "abc"                                   // Latin
3378
3379         /* trail byte in lead position */
3380         "\x80"
3381
3382         " "                                     // space
3383
3384         /* truncated multi-byte sequences */
3385         "\xd0"
3386         "\xe0"
3387         "\xe1"
3388         "\xed"
3389         "\xee"
3390         "\xf0"
3391         "\xf1"
3392         "\xf4"
3393         "\xf8"
3394         "\xfc"
3395
3396         "\xCE\xB1\xCE\xB2\xCE\xB3"              // Greek
3397
3398         /* trail byte in lead position */
3399         "\x80"
3400
3401         "\xe0\x80"
3402         "\xe0\xa0"
3403         "\xe1\x80"
3404         "\xed\x80"
3405         "\xed\xa0"
3406         "\xee\x80"
3407         "\xf0\x80"
3408         "\xf0\x90"
3409         "\xf1\x80"
3410         "\xf4\x80"
3411         "\xf4\x90"
3412         "\xf8\x80"
3413         "\xfc\x80"
3414
3415         "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD"  // wide space, Katakana
3416
3417         /* trail byte in lead position */
3418         "\x80"
3419
3420         "\xf0\x80\x80"
3421         "\xf0\x90\x80"
3422         "\xf1\x80\x80"
3423         "\xf4\x80\x80"
3424         "\xf4\x90\x80"
3425         "\xf8\x80\x80"
3426         "\xfc\x80\x80"
3427
3428         "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3"      // nbsp, Hangul
3429
3430         /* trail byte in lead position */
3431         "\x80"
3432
3433         "\xf8\x80\x80\x80"
3434         "\xfc\x80\x80\x80"
3435
3436         "\xF1\x90\x80\x85"                      // unassigned supplementary
3437
3438         /* trail byte in lead position */
3439         "\x80"
3440
3441         "\xfc\x80\x80\x80\x80"
3442
3443         "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE"      // Han supplementary
3444
3445         /* trail byte in lead position */
3446         "\x80"
3447
3448         /* complete sequences but non-shortest forms or out of range etc. */
3449         "\xc0\x80"
3450         "\xe0\x80\x80"
3451         "\xed\xa0\x80"
3452         "\xf0\x80\x80\x80"
3453         "\xf4\x90\x80\x80"
3454         "\xf8\x80\x80\x80\x80"
3455         "\xfc\x80\x80\x80\x80\x80"
3456         "\xfe"
3457         "\xff"
3458
3459         /* trail byte in lead position */
3460         "\x80"
3461
3462         "\xED\x9E\xA4\xE2\x80\xA8"              // unassigned, LS, NUL-terminated
3463     };
3464
3465     if((whichSpans&SPAN_UTF8)==0) {
3466         return;
3467     }
3468     testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
3469     testSpan(sets, s, UPRV_LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
3470 }
3471
// Multiply a list of span option words so that each resulting word carries
// exactly one of the alternatives a, b and c.
// Every existing word is first reduced with mask; the reduced word combined
// with a replaces the original entry, the combination with b is stored
// whichSpansCount entries further on, and the combination with c
// another whichSpansCount entries after that.
// If b==0, then the words are only modified with mask and a (c is ignored).
// If b!=0 and c==0, then only the a and b variants are produced.
// The caller must provide room for all of the resulting entries.
// Returns the new number of entries: 1, 2 or 3 times whichSpansCount.
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    const bool haveB= b!=0;
    const bool haveC= haveB && c!=0;

    for(int32_t k=0; k<whichSpansCount; ++k) {
        const uint32_t kept=whichSpans[k]&mask;
        whichSpans[k]=kept|a;
        if(haveB) {
            whichSpans[whichSpansCount+k]=kept|b;
        }
        if(haveC) {
            whichSpans[2*whichSpansCount+k]=kept|c;
        }
    }

    if(!haveB) {
        return whichSpansCount;
    }
    return haveC ? 3*whichSpansCount : 2*whichSpansCount;
}
3494
3495 #define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
3496 #define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
3497 #define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3498 #define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3499
3500 void UnicodeSetTest::TestSpan() {
3501     // "[...]" is a UnicodeSet pattern.
3502     // "*" performs tests on all Unicode code points and on a selection of
3503     //   malformed UTF-8/16 strings.
3504     // "-options" limits the scope of testing for the current set.
3505     //   By default, the test verifies that equivalent boundaries are found
3506     //   for UTF-16 and UTF-8, going forward and backward,
3507     //   alternating USET_SPAN_NOT_CONTAINED with
3508     //   either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3509     //   Single-character options:
3510     //     8 -- UTF-16 and UTF-8 boundaries may differ.
3511     //          Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3512     //          or the set contains strings with unpaired surrogates
3513     //          which do not translate to valid UTF-8.
3514     //     c -- set.span() and set.complement().span() boundaries may differ.
3515     //          Cause: Set strings are not complemented.
3516     //     b -- span() and spanBack() boundaries may differ.
3517     //          Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3518     //          and spanBack(USET_SPAN_SIMPLE) are defined to
3519     //          match with non-overlapping substrings.
3520     //          For example, with a set containing "ab" and "ba",
3521     //          span() of "aba" yields boundaries { 0, 2, 3 }
3522     //          because the initial "ab" matches from 0 to 2,
3523     //          while spanBack() yields boundaries { 0, 1, 3 }
3524     //          because the final "ba" matches from 1 to 3.
3525     //     l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3526     //          Cause: Strings in the set overlap, and a longer match may
3527     //          require a sequence including non-longest substrings.
3528     //          For example, with a set containing "ab", "abc" and "cd",
3529     //          span(contained) of "abcd" spans the entire string
3530     //          but span(longest match) only spans the first 3 characters.
3531     //   Each "-options" first resets all options and then applies the specified options.
3532     //   A "-" without options resets the options.
3533     //   The options are also reset for each new set.
3534     // Other strings will be spanned.
3535     static const char *const testdata[]={
3536         "[:ID_Continue:]",
3537         "*",
3538         "[:White_Space:]",
3539         "*",
3540         "[]",
3541         "*",
3542         "[\\u0000-\\U0010FFFF]",
3543         "*",
3544         "[\\u0000\\u0080\\u0800\\U00010000]",
3545         "*",
3546         "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3547         "*",
3548         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3549         "-c",
3550         "*",
3551         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3552         "-c",
3553         "*",
3554
3555         // Overlapping strings cause overlapping attempts to match.
3556         "[x{xy}{xya}{axy}{ax}]",
3557         "-cl",
3558
3559         // More repetitions of "xya" would take too long with the recursive
3560         // reference implementation.
3561         // containsAll()=FALSE
3562         // test_string 0x14
3563         "xx"
3564         "xyaxyaxyaxya"  // set.complement().span(longest match) will stop here.
3565         "xx"            // set.complement().span(contained) will stop between the two 'x'es.
3566         "xyaxyaxyaxya"
3567         "xx"
3568         "xyaxyaxyaxya"  // span() ends here.
3569         "aaa",
3570
3571         // containsAll()=TRUE
3572         // test_string 0x15
3573         "xx"
3574         "xyaxyaxyaxya"
3575         "xx"
3576         "xyaxyaxyaxya"
3577         "xx"
3578         "xyaxyaxyaxy",
3579
3580         "-bc",
3581         // test_string 0x17
3582         "byayaxya",  // span() -> { 4, 7, 8 }  spanBack() -> { 5, 8 }
3583         "-c",
3584         "byayaxy",   // span() -> { 4, 7 }     complement.span() -> { 7 }
3585         "byayax",    // span() -> { 4, 6 }     complement.span() -> { 6 }
3586         "-",
3587         "byaya",     // span() -> { 5 }
3588         "byay",      // span() -> { 4 }
3589         "bya",       // span() -> { 3 }
3590
3591         // span(longest match) will not span the whole string.
3592         "[a{ab}{bc}]",
3593         "-cl",
3594         // test_string 0x21
3595         "abc",
3596
3597         "[a{ab}{abc}{cd}]",
3598         "-cl",
3599         "acdabcdabccd",
3600
3601         // spanBack(longest match) will not span the whole string.
3602         "[c{ab}{bc}]",
3603         "-cl",
3604         "abc",
3605
3606         "[d{cd}{bcd}{ab}]",
3607         "-cl",
3608         "abbcdabcdabd",
3609
3610         // Test with non-ASCII set strings - test proper handling of surrogate pairs
3611         // and UTF-8 trail bytes.
3612         // Copies of above test sets and strings, but transliterated to have
3613         // different code points with similar trail units.
3614         // Previous: a      b         c            d
3615         // Unicode:  042B   30AB      200AB        204AB
3616         // UTF-16:   042B   30AB      D840 DCAB    D841 DCAB
3617         // UTF-8:    D0 AB  E3 82 AB  F0 A0 82 AB  F0 A0 92 AB
3618         "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3619         "-cl",
3620         "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3621
3622         "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3623         "-cl",
3624         "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3625
3626         // Stress bookkeeping and recursion.
3627         // The following strings are barely doable with the recursive
3628         // reference implementation.
3629         // The not-contained character at the end prevents an early exit from the span().
3630         "[b{bb}]",
3631         "-c",
3632         // test_string 0x33
3633         "bbbbbbbbbbbbbbbbbbbbbbbb-",
3634         // On complement sets, span() and spanBack() get different results
3635         // because b is not in the complement set and there is an odd number of b's
3636         // in the test string.
3637         "-bc",
3638         "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3639
3640         // Test with set strings with an initial or final code point span
3641         // longer than 254.
3642         "[a{" _64_a _64_a _64_a _64_a "b}"
3643           "{a" _64_b _64_b _64_b _64_b "}]",
3644         "-c",
3645         _64_a _64_a _64_a _63_a "b",
3646         _64_a _64_a _64_a _64_a "b",
3647         _64_a _64_a _64_a _64_a "aaaabbbb",
3648         "a" _64_b _64_b _64_b _63_b,
3649         "a" _64_b _64_b _64_b _64_b,
3650         "aaaabbbb" _64_b _64_b _64_b _64_b,
3651
3652         // Test with strings containing unpaired surrogates.
3653         // They are not representable in UTF-8, and a leading trail surrogate
3654         // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3655         // U+20001 == \\uD840\\uDC01
3656         // U+20400 == \\uD841\\uDC00
3657         "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3658         "-8cl",
3659         "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3660     };
3661     uint32_t whichSpans[96]={ SPAN_ALL };
3662     int32_t whichSpansCount=1;
3663
3664     UnicodeSet *sets[SET_COUNT]={ NULL };
3665     const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
3666
3667     char testName[1024];
3668     char *testNameLimit=testName;
3669
3670     int32_t i, j;
3671     for(i=0; i<UPRV_LENGTHOF(testdata); ++i) {
3672         const char *s=testdata[i];
3673         if(s[0]=='[') {
3674             // Create new test sets from this pattern.
3675             for(j=0; j<SET_COUNT; ++j) {
3676                 delete sets_with_str[j];
3677                 delete sets[j];
3678             }
3679             UErrorCode errorCode=U_ZERO_ERROR;
3680             sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3681             if(U_FAILURE(errorCode)) {
3682                 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
3683                 break;
3684             }
3685             sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3686             sets[SLOW_NOT]->complement();
3687             // Intermediate set: Test cloning of a frozen set.
3688             UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3689             fast->freeze();
3690             sets[FAST]=(UnicodeSet *)fast->clone();
3691             delete fast;
3692             UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3693             fastNot->freeze();
3694             sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
3695             delete fastNot;
3696
3697             for(j=0; j<SET_COUNT; ++j) {
3698                 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3699             }
3700
3701             strcpy(testName, s);
3702             testNameLimit=strchr(testName, 0);
3703             *testNameLimit++=':';
3704             *testNameLimit=0;
3705
3706             whichSpans[0]=SPAN_ALL;
3707             whichSpansCount=1;
3708         } else if(s[0]=='-') {
3709             whichSpans[0]=SPAN_ALL;
3710             whichSpansCount=1;
3711
3712             while(*++s!=0) {
3713                 switch(*s) {
3714                 case 'c':
3715                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3716                                                    ~SPAN_POLARITY,
3717                                                    SPAN_SET,
3718                                                    SPAN_COMPLEMENT,
3719                                                    0);
3720                     break;
3721                 case 'b':
3722                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3723                                                    ~SPAN_DIRS,
3724                                                    SPAN_FWD,
3725                                                    SPAN_BACK,
3726                                                    0);
3727                     break;
3728                 case 'l':
3729                     // test USET_SPAN_CONTAINED FWD & BACK, and separately
3730                     // USET_SPAN_SIMPLE only FWD, and separately
3731                     // USET_SPAN_SIMPLE only BACK
3732                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3733                                                    ~(SPAN_DIRS|SPAN_CONDITION),
3734                                                    SPAN_DIRS|SPAN_CONTAINED,
3735                                                    SPAN_FWD|SPAN_SIMPLE,
3736                                                    SPAN_BACK|SPAN_SIMPLE);
3737                     break;
3738                 case '8':
3739                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3740                                                    ~SPAN_UTFS,
3741                                                    SPAN_UTF16,
3742                                                    SPAN_UTF8,
3743                                                    0);
3744                     break;
3745                 default:
3746                     errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3747                     break;
3748                 }
3749             }
3750         } else if(0==strcmp(s, "*")) {
3751             strcpy(testNameLimit, "bad_string");
3752             for(j=0; j<whichSpansCount; ++j) {
3753                 if(whichSpansCount>1) {
3754                     sprintf(testNameLimit+10 /* strlen("bad_string") */,
3755                             "%%0x%3x",
3756                             whichSpans[j]);
3757                 }
3758                 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3759                 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3760             }
3761
3762             strcpy(testNameLimit, "contents");
3763             for(j=0; j<whichSpansCount; ++j) {
3764                 if(whichSpansCount>1) {
3765                     sprintf(testNameLimit+8 /* strlen("contents") */,
3766                             "%%0x%3x",
3767                             whichSpans[j]);
3768                 }
3769                 testSpanContents(sets_with_str, whichSpans[j], testName);
3770             }
3771         } else {
3772             UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3773             strcpy(testNameLimit, "test_string");
3774             for(j=0; j<whichSpansCount; ++j) {
3775                 if(whichSpansCount>1) {
3776                     sprintf(testNameLimit+11 /* strlen("test_string") */,
3777                             "%%0x%3x",
3778                             whichSpans[j]);
3779                 }
3780                 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3781             }
3782         }
3783     }
3784     for(j=0; j<SET_COUNT; ++j) {
3785         delete sets_with_str[j];
3786         delete sets[j];
3787     }
3788 }
3789
3790 // Test select patterns and strings, and test USET_SPAN_SIMPLE.
3791 void UnicodeSetTest::TestStringSpan() {
3792     static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3793     static const char *const string=
3794         "xx"
3795         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3796         "xx"
3797         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3798         "xx"
3799         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3800         "aaaa";
3801
3802     UErrorCode errorCode=U_ZERO_ERROR;
3803     UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3804     UnicodeSet set(pattern16, errorCode);
3805     if(U_FAILURE(errorCode)) {
3806         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3807         return;
3808     }
3809
3810     UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3811
3812     if(set.containsAll(string16)) {
3813         errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
3814     }
3815
3816     // Remove trailing "aaaa".
3817     string16.truncate(string16.length()-4);
3818     if(!set.containsAll(string16)) {
3819         errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
3820     }
3821
3822     string16=UNICODE_STRING_SIMPLE("byayaxya");
3823     const UChar *s16=string16.getBuffer();
3824     int32_t length16=string16.length();
3825     (void)length16;   // Suppress set but not used warning.
3826     if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3827         set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3828         set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3829         set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3830         set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3831         set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3832     ) {
3833         errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3834     }
3835
3836     pattern="[a{ab}{abc}{cd}]";
3837     pattern16=UnicodeString(pattern, -1, US_INV);
3838     set.applyPattern(pattern16, errorCode);
3839     if(U_FAILURE(errorCode)) {
3840         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3841         return;
3842     }
3843     string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
3844     s16=string16.getBuffer();
3845     length16=string16.length();
3846     if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3847         set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3848         set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3849     ) {
3850         errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3851     }
3852
3853     pattern="[d{cd}{bcd}{ab}]";
3854     pattern16=UnicodeString(pattern, -1, US_INV);
3855     set.applyPattern(pattern16, errorCode).freeze();
3856     if(U_FAILURE(errorCode)) {
3857         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3858         return;
3859     }
3860     string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
3861     s16=string16.getBuffer();
3862     length16=string16.length();
3863     if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3864         set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3865         set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3866     ) {
3867         errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3868     }
3869 }
3870
/**
 * Including collationroot.h fails on Windows with
 *   c:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\include\driverspecs.h(142):
 *   error C2008: '$' : unexpected in macro definition
 * so this test is skipped on Windows.
 *
 * The cause is that intltest builds with /Za, which disables language extensions,
 * and that means that Windows header files cannot be used.
 */
3879 #if !UCONFIG_NO_COLLATION && !U_PLATFORM_HAS_WIN32_API
3880 #include "collationroot.h"
3881 #include "collationtailoring.h"
3882 #endif
3883
3884 void UnicodeSetTest::TestUCAUnsafeBackwards() {
3885 #if U_PLATFORM_HAS_WIN32_API
3886     infoln("Skipping TestUCAUnsafeBackwards() - can't include collationroot.h on Windows without language extensions!");
3887 #elif !UCONFIG_NO_COLLATION
3888     UErrorCode errorCode = U_ZERO_ERROR;
3889
3890     // Get the unsafeBackwardsSet
3891     const CollationCacheEntry *rootEntry = CollationRoot::getRootCacheEntry(errorCode);
3892     if(U_FAILURE(errorCode)) {
3893       dataerrln("FAIL: %s getting root cache entry", u_errorName(errorCode));
3894       return;
3895     }
3896     //const UVersionInfo &version = rootEntry->tailoring->version;
3897     const UnicodeSet *unsafeBackwardSet = rootEntry->tailoring->unsafeBackwardSet;
3898
3899     checkSerializeRoundTrip(*unsafeBackwardSet, errorCode);
3900
3901     if(!logKnownIssue("11891","UnicodeSet fails to round trip on CollationRoot...unsafeBackwards set")) {
3902         // simple test case
3903         // TODO(ticket #11891): Simplify this test function to this simple case. Rename it appropriately.
3904         // TODO(ticket #11891): Port test to Java. Is this a bug there, too?
3905         UnicodeSet surrogates;
3906         surrogates.add(0xd83a);  // a lead surrogate
3907         surrogates.add(0xdc00, 0xdfff);  // a range of trail surrogates
3908         UnicodeString pat;
3909         surrogates.toPattern(pat, FALSE);  // bad: [ 0xd83a, 0xdc00, 0x2d, 0xdfff ]
3910         // TODO: Probably fix either UnicodeSet::_generatePattern() or _appendToPat()
3911         // so that at least one type of surrogate code points are escaped,
3912         // or (minimally) so that adjacent lead+trail surrogate code points are escaped.
3913         errorCode = U_ZERO_ERROR;
3914         UnicodeSet s2;
3915         s2.applyPattern(pat, errorCode);  // looks like invalid range [ 0x1e800, 0x2d, 0xdfff ]
3916         if(U_FAILURE(errorCode)) {
3917             errln("FAIL: surrogates to/from pattern - %s", u_errorName(errorCode));
3918         } else {
3919             checkEqual(surrogates, s2, "surrogates to/from pattern");
3920         }
3921         // This occurs in the UCA unsafe-backwards set.
3922         checkRoundTrip(*unsafeBackwardSet);
3923     }
3924 #endif
3925 }