icuSources/test/intltest/usettest.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 ********************************************************************************
   5 *   Copyright (C) 1999-2016 International Business Machines Corporation and
   6 *   others. All Rights Reserved.
   7 ********************************************************************************
   8 *   Date        Name        Description
   9 *   10/20/99    alan        Creation.
  10 *   03/22/2000  Madhu       Added additional tests
  11 ********************************************************************************
  12 */
  13
  14 #include <stdio.h>
  15
  16 #include <string.h>
  17 #include "unicode/utypes.h"
  18 #include "usettest.h"
  19 #include "unicode/ucnv.h"
  20 #include "unicode/uniset.h"
  21 #include "unicode/uchar.h"
  22 #include "unicode/usetiter.h"
  23 #include "unicode/ustring.h"
  24 #include "unicode/parsepos.h"
  25 #include "unicode/symtable.h"
  26 #include "unicode/uversion.h"
  27 #include "cmemory.h"
  28 #include "hash.h"
  29
  30 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
  31     dataerrln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \
  32     u_errorName(status));}}
  33
  34 #define TEST_ASSERT(expr) {if (!(expr)) { \
  35     dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); }}
  36
  37 UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
  38     UnicodeString pat;
  39     set.toPattern(pat);
  40     return left + UnicodeSetTest::escape(pat);
  41 }
  42
  43 #define CASE(id,test) case id:                          \
  44                           name = #test;                 \
  45                           if (exec) {                   \
  46                               logln(#test "---");       \
  47                               logln();                  \
  48                               test();                   \
  49                           }                             \
  50                           break
  51
  52 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
  53 }
  54
  55 UConverter *UnicodeSetTest::openUTF8Converter() {
  56     if(utf8Cnv==NULL) {
  57         UErrorCode errorCode=U_ZERO_ERROR;
  58         utf8Cnv=ucnv_open("UTF-8", &errorCode);
  59     }
  60     return utf8Cnv;
  61 }
  62
  63 UnicodeSetTest::~UnicodeSetTest() {
  64     ucnv_close(utf8Cnv);
  65 }
  66
  67 void
  68 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
  69                                const char* &name, char* /*par*/) {
  70     // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
  71     switch (index) {
  72         CASE(0,TestPatterns);
  73         CASE(1,TestAddRemove);
  74         CASE(2,TestCategories);
  75         CASE(3,TestCloneEqualHash);
  76         CASE(4,TestMinimalRep);
  77         CASE(5,TestAPI);
  78         CASE(6,TestScriptSet);
  79         CASE(7,TestPropertySet);
  80         CASE(8,TestClone);
  81         CASE(9,TestExhaustive);
  82         CASE(10,TestToPattern);
  83         CASE(11,TestIndexOf);
  84         CASE(12,TestStrings);
  85         CASE(13,Testj2268);
  86         CASE(14,TestCloseOver);
  87         CASE(15,TestEscapePattern);
  88         CASE(16,TestInvalidCodePoint);
  89         CASE(17,TestSymbolTable);
  90         CASE(18,TestSurrogate);
  91         CASE(19,TestPosixClasses);
  92         CASE(20,TestIteration);
  93         CASE(21,TestFreezable);
  94         CASE(22,TestSpan);
  95         CASE(23,TestStringSpan);
  96         CASE(24,TestUCAUnsafeBackwards);
  97         default: name = ""; break;
  98     }
  99 }
 100
  // Sentinel entry placed inside the string lists passed to expectToPattern().
  // NOTE(review): presumably the strings listed after it are expected to be
  // absent from the set - confirm against expectToPattern().
  101 static const char NOT[] = "%%%%";
 102
 103 /**
 104  * UVector was improperly copying contents
 105  * This code will crash this is still true
 106  */
 107 void UnicodeSetTest::Testj2268() {
 108   UnicodeSet t;
 109   t.add(UnicodeString("abc"));
 110   UnicodeSet test(t);
 111   UnicodeString ustrPat;
 112   test.toPattern(ustrPat, TRUE);
 113 }
 114
 115 /**
 116  * Test toPattern().
 117  */
 118 void UnicodeSetTest::TestToPattern() {
 119     UErrorCode ec = U_ZERO_ERROR;
 120
 121     // Test that toPattern() round trips with syntax characters and
 122     // whitespace.
 123     {
 124         static const char* OTHER_TOPATTERN_TESTS[] = {
 125             "[[:latin:]&[:greek:]]",
 126             "[[:latin:]-[:greek:]]",
 127             "[:nonspacing mark:]",
 128             NULL
 129         };
 130
 131         for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
 132             ec = U_ZERO_ERROR;
 133             UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
 134             if (U_FAILURE(ec)) {
 135                 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
 136                 continue;
 137             }
 138             checkPat(OTHER_TOPATTERN_TESTS[j], s);
 139         }
 140
 141         for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
 142             if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
 143
 144                 // check various combinations to make sure they all work.
 145                 if (i != 0 && !toPatternAux(i, i)){
 146                     continue;
 147                 }
 148                 if (!toPatternAux(0, i)){
 149                     continue;
 150                 }
 151                 if (!toPatternAux(i, 0xFFFF)){
 152                     continue;
 153                 }
 154             }
 155         }
 156     }
 157
 158     // Test pattern behavior of multicharacter strings.
 159     {
 160         ec = U_ZERO_ERROR;
 161         UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
 162
 163         // This loop isn't a loop.  It's here to make the compiler happy.
 164         // If you're curious, try removing it and changing the 'break'
 165         // statements (except for the last) to goto's.
 166         for (;;) {
 167             if (U_FAILURE(ec)) break;
 168             const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
 169             expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
 170
 171             s->add("ac");
 172             const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
 173             expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
 174
 175             s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
 176             if (U_FAILURE(ec)) break;
 177             const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
 178             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
 179
 180             s->add("[]");
 181             const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
 182             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
 183
 184             s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
 185             if (U_FAILURE(ec)) break;
 186             const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
 187             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
 188
 189             // j2189
 190             s->clear();
 191             s->add(UnicodeString("abc", ""));
 192             s->add(UnicodeString("abc", ""));
 193             const char* exp6[] = {"abc", NOT, "ab", NULL};
 194             expectToPattern(*s, "[{abc}]", exp6);
 195
 196             break;
 197         }
 198
 199         if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
 200         delete s;
 201     }
 202
 203     // JB#3400: For 2 character ranges prefer [ab] to [a-b]
 204     UnicodeSet s;
 205     s.add((UChar)97, (UChar)98); // 'a', 'b'
 206     expectToPattern(s, "[ab]", NULL);
 207 }
 208
 209 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
 210
 211     // use Integer.toString because Utility.hex doesn't handle ints
 212     UnicodeString pat = "";
 213     // TODO do these in hex
 214     //String source = "0x" + Integer.toString(start,16).toUpperCase();
 215     //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
 216     UnicodeString source;
 217     source = source + (uint32_t)start;
 218     if (start != end)
 219         source = source + ".." + (uint32_t)end;
 220     UnicodeSet testSet;
 221     testSet.add(start, end);
 222     return checkPat(source, testSet);
 223 }
 224
 225 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
 226                                const UnicodeSet& testSet) {
 227     // What we want to make sure of is that a pattern generated
 228     // by toPattern(), with or without escaped unprintables, can
 229     // be passed back into the UnicodeSet constructor.
 230     UnicodeString pat0;
 231
 232     testSet.toPattern(pat0, TRUE);
 233
 234     if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
 235
 236     //String pat1 = unescapeLeniently(pat0);
 237     //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
 238
 239     UnicodeString pat2;
 240     testSet.toPattern(pat2, FALSE);
 241     if (!checkPat(source, testSet, pat2)) return FALSE;
 242
 243     //String pat3 = unescapeLeniently(pat2);
 244     // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
 245
 246     //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
 247     logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
 248     return TRUE;
 249 }
 250
 251 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
 252                                const UnicodeSet& testSet,
 253                                const UnicodeString& pat) {
 254     UErrorCode ec = U_ZERO_ERROR;
 255     UnicodeSet testSet2(pat, ec);
 256     if (testSet2 != testSet) {
 257         errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
 258         return FALSE;
 259     }
 260     return TRUE;
 261 }
 262
 263 void
 264 UnicodeSetTest::TestPatterns(void) {
 265     UnicodeSet set;
 266     expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""),  "km");
 267     expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""),  "aczz");
 268     expectPattern(set, UnicodeString("[a\\-z]", ""),  "--aazz");
 269     expectPattern(set, UnicodeString("[-az]", ""),  "--aazz");
 270     expectPattern(set, UnicodeString("[az-]", ""),  "--aazz");
 271     expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
 272
 273     // Throw in a test of complement
 274     set.complement();
 275     UnicodeString exp;
 276     exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
 277     expectPairs(set, exp);
 278 }
 279
 280 void
 281 UnicodeSetTest::TestCategories(void) {
 282     UErrorCode status = U_ZERO_ERROR;
 283     const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
 284     UnicodeSet set(pat, status);
 285     if (U_FAILURE(status)) {
 286         dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
 287         return;
 288     } else {
 289         expectContainment(set, pat, "ABC", "abc");
 290     }
 291
 292     UChar32 i;
 293     int32_t failures = 0;
 294     // Make sure generation of L doesn't pollute cached Lu set
 295     // First generate L, then Lu
 296     set.applyPattern("[:L:]", status);
 297     if (U_FAILURE(status)) { errln("FAIL"); return; }
 298     for (i=0; i<0x200; ++i) {
 299         UBool l = u_isalpha((UChar)i);
 300         if (l != set.contains(i)) {
 301             errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
 302                   set.contains(i));
 303             if (++failures == 10) break;
 304         }
 305     }
 306
 307     set.applyPattern("[:Lu:]", status);
 308     if (U_FAILURE(status)) { errln("FAIL"); return; }
 309     for (i=0; i<0x200; ++i) {
 310         UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
 311         if (lu != set.contains(i)) {
 312             errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
 313                   set.contains(i));
 314             if (++failures == 20) break;
 315         }
 316     }
 317 }
 318 void
 319 UnicodeSetTest::TestCloneEqualHash(void) {
 320     UErrorCode status = U_ZERO_ERROR;
 321     // set1 and set2 used to be built with the obsolete constructor taking
 322     // UCharCategory values; replaced with pattern constructors
 323     // markus 20030502
 324     UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); //  :Ll: Letter, lowercase
 325     UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); //  Letter, lowercase
 326     if (U_FAILURE(status)){
 327         dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
 328         return;
 329     }
 330     UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status);   //Number, Decimal digit
 331     UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status);   //Number, Decimal digit
 332     if (U_FAILURE(status)){
 333         errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
 334         return;
 335     }
 336
 337     if (*set1 != *set1a) {
 338         errln("FAIL: category constructor for Ll broken");
 339     }
 340     if (*set2 != *set2a) {
 341         errln("FAIL: category constructor for Nd broken");
 342     }
 343     delete set1a;
 344     delete set2a;
 345
 346     logln("Testing copy construction");
 347     UnicodeSet *set1copy=new UnicodeSet(*set1);
 348     if(*set1 != *set1copy || *set1 == *set2 ||
 349         getPairs(*set1) != getPairs(*set1copy) ||
 350         set1->hashCode() != set1copy->hashCode()){
 351         errln("FAIL : Error in copy construction");
 352         return;
 353     }
 354
 355     logln("Testing =operator");
 356     UnicodeSet set1equal=*set1;
 357     UnicodeSet set2equal=*set2;
 358     if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
 359         set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
 360         errln("FAIL: Error in =operator");
 361     }
 362
 363     logln("Testing clone()");
 364     UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
 365     UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
 366     if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
 367         *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
 368         *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
 369         errln("FAIL: Error in clone");
 370     }
 371
 372     logln("Testing hashcode");
 373     if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
 374         set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
 375         set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
 376         set1->hashCode() == set2->hashCode()  || set1copy->hashCode() == set2->hashCode() ||
 377         set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
 378         errln("FAIL: Error in hashCode()");
 379     }
 380
 381     delete set1;
 382     delete set1copy;
 383     delete set2;
 384     delete set1clone;
 385     delete set2clone;
 386
 387
 388 }
 389 void
 390 UnicodeSetTest::TestAddRemove(void) {
 391     UnicodeSet set; // Construct empty set
 392     doAssert(set.isEmpty() == TRUE, "set should be empty");
 393     doAssert(set.size() == 0, "size should be 0");
 394     set.complement();
 395     doAssert(set.size() == 0x110000, "size should be 0x110000");
 396     set.clear();
 397     set.add(0x0061, 0x007a);
 398     expectPairs(set, "az");
 399     doAssert(set.isEmpty() == FALSE, "set should not be empty");
 400     doAssert(set.size() != 0, "size should not be equal to 0");
 401     doAssert(set.size() == 26, "size should be equal to 26");
 402     set.remove(0x006d, 0x0070);
 403     expectPairs(set, "alqz");
 404     doAssert(set.size() == 22, "size should be equal to 22");
 405     set.remove(0x0065, 0x0067);
 406     expectPairs(set, "adhlqz");
 407     doAssert(set.size() == 19, "size should be equal to 19");
 408     set.remove(0x0064, 0x0069);
 409     expectPairs(set, "acjlqz");
 410     doAssert(set.size() == 16, "size should be equal to 16");
 411     set.remove(0x0063, 0x0072);
 412     expectPairs(set, "absz");
 413     doAssert(set.size() == 10, "size should be equal to 10");
 414     set.add(0x0066, 0x0071);
 415     expectPairs(set, "abfqsz");
 416     doAssert(set.size() == 22, "size should be equal to 22");
 417     set.remove(0x0061, 0x0067);
 418     expectPairs(set, "hqsz");
 419     set.remove(0x0061, 0x007a);
 420     expectPairs(set, "");
 421     doAssert(set.isEmpty() == TRUE, "set should be empty");
 422     doAssert(set.size() == 0, "size should be 0");
 423     set.add(0x0061);
 424     doAssert(set.isEmpty() == FALSE, "set should not be empty");
 425     doAssert(set.size() == 1, "size should not be equal to 1");
 426     set.add(0x0062);
 427     set.add(0x0063);
 428     expectPairs(set, "ac");
 429     doAssert(set.size() == 3, "size should not be equal to 3");
 430     set.add(0x0070);
 431     set.add(0x0071);
 432     expectPairs(set, "acpq");
 433     doAssert(set.size() == 5, "size should not be equal to 5");
 434     set.clear();
 435     expectPairs(set, "");
 436     doAssert(set.isEmpty() == TRUE, "set should be empty");
 437     doAssert(set.size() == 0, "size should be 0");
 438
 439     // Try removing an entire set from another set
 440     expectPattern(set, "[c-x]", "cx");
 441     UnicodeSet set2;
 442     expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
 443     set.removeAll(set2);
 444     expectPairs(set, "deluxx");
 445
 446     // Try adding an entire set to another set
 447     expectPattern(set, "[jackiemclean]", "aacceein");
 448     expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
 449     set.addAll(set2);
 450     expectPairs(set, "aacehort");
 451     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
 452
 453     // Try retaining an set of elements contained in another set (intersection)
 454     UnicodeSet set3;
 455     expectPattern(set3, "[a-c]", "ac");
 456     doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
 457     set3.remove(0x0062);
 458     expectPairs(set3, "aacc");
 459     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
 460     set.retainAll(set3);
 461     expectPairs(set, "aacc");
 462     doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
 463     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
 464     set.clear();
 465     doAssert(set.size() != set3.size(), "set.size() != set3.size()");
 466
 467     // Test commutativity
 468     expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
 469     expectPattern(set2, "[jackiemclean]", "aacceein");
 470     set.addAll(set2);
 471     expectPairs(set, "aacehort");
 472     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
 473
 474
 475
 476
 477 }
 478
 479 /**
 480  * Make sure minimal representation is maintained.
 481  */
 482 void UnicodeSetTest::TestMinimalRep() {
 483     UErrorCode status = U_ZERO_ERROR;
 484     // This is pretty thoroughly tested by checkCanonicalRep()
 485     // run against the exhaustive operation results.  Use the code
 486     // here for debugging specific spot problems.
 487
 488     // 1 overlap against 2
 489     UnicodeSet set("[h-km-q]", status);
 490     if (U_FAILURE(status)) { errln("FAIL"); return; }
 491     UnicodeSet set2("[i-o]", status);
 492     if (U_FAILURE(status)) { errln("FAIL"); return; }
 493     set.addAll(set2);
 494     expectPairs(set, "hq");
 495     // right
 496     set.applyPattern("[a-m]", status);
 497     if (U_FAILURE(status)) { errln("FAIL"); return; }
 498     set2.applyPattern("[e-o]", status);
 499     if (U_FAILURE(status)) { errln("FAIL"); return; }
 500     set.addAll(set2);
 501     expectPairs(set, "ao");
 502     // left
 503     set.applyPattern("[e-o]", status);
 504     if (U_FAILURE(status)) { errln("FAIL"); return; }
 505     set2.applyPattern("[a-m]", status);
 506     if (U_FAILURE(status)) { errln("FAIL"); return; }
 507     set.addAll(set2);
 508     expectPairs(set, "ao");
 509     // 1 overlap against 3
 510     set.applyPattern("[a-eg-mo-w]", status);
 511     if (U_FAILURE(status)) { errln("FAIL"); return; }
 512     set2.applyPattern("[d-q]", status);
 513     if (U_FAILURE(status)) { errln("FAIL"); return; }
 514     set.addAll(set2);
 515     expectPairs(set, "aw");
 516 }
 517
 518 void UnicodeSetTest::TestAPI() {
 519     UErrorCode status = U_ZERO_ERROR;
 520     // default ct
 521     UnicodeSet set;
 522     if (!set.isEmpty() || set.getRangeCount() != 0) {
 523         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
 524               set);
 525     }
 526
 527     // clear(), isEmpty()
 528     set.add(0x0061);
 529     if (set.isEmpty()) {
 530         errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
 531               set);
 532     }
 533     set.clear();
 534     if (!set.isEmpty()) {
 535         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
 536               set);
 537     }
 538
 539     // size()
 540     set.clear();
 541     if (set.size() != 0) {
 542         errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
 543               ": " + set);
 544     }
 545     set.add(0x0061);
 546     if (set.size() != 1) {
 547         errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
 548               ": " + set);
 549     }
 550     set.add(0x0031, 0x0039);
 551     if (set.size() != 10) {
 552         errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
 553               ": " + set);
 554     }
 555
 556     // contains(first, last)
 557     set.clear();
 558     set.applyPattern("[A-Y 1-8 b-d l-y]", status);
 559     if (U_FAILURE(status)) { errln("FAIL"); return; }
 560     for (int32_t i = 0; i<set.getRangeCount(); ++i) {
 561         UChar32 a = set.getRangeStart(i);
 562         UChar32 b = set.getRangeEnd(i);
 563         if (!set.contains(a, b)) {
 564             errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
 565                   " but doesn't: " + set);
 566         }
 567         if (set.contains((UChar32)(a-1), b)) {
 568             errln((UnicodeString)"FAIL, shouldn't contain " +
 569                   (unsigned short)(a-1) + '-' + (unsigned short)b +
 570                   " but does: " + set);
 571         }
 572         if (set.contains(a, (UChar32)(b+1))) {
 573             errln((UnicodeString)"FAIL, shouldn't contain " +
 574                   (unsigned short)a + '-' + (unsigned short)(b+1) +
 575                   " but does: " + set);
 576         }
 577     }
 578
 579     // Ported InversionList test.
 580     UnicodeSet a((UChar32)3,(UChar32)10);
 581     UnicodeSet b((UChar32)7,(UChar32)15);
 582     UnicodeSet c;
 583
 584     logln((UnicodeString)"a [3-10]: " + a);
 585     logln((UnicodeString)"b [7-15]: " + b);
 586     c = a;
 587     c.addAll(b);
 588     UnicodeSet exp((UChar32)3,(UChar32)15);
 589     if (c == exp) {
 590         logln((UnicodeString)"c.set(a).add(b): " + c);
 591     } else {
 592         errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
 593     }
 594     c.complement();
 595     exp.set((UChar32)0, (UChar32)2);
 596     exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
 597     if (c == exp) {
 598         logln((UnicodeString)"c.complement(): " + c);
 599     } else {
 600         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
 601     }
 602     c.complement();
 603     exp.set((UChar32)3, (UChar32)15);
 604     if (c == exp) {
 605         logln((UnicodeString)"c.complement(): " + c);
 606     } else {
 607         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
 608     }
 609     c = a;
 610     c.complementAll(b);
 611     exp.set((UChar32)3,(UChar32)6);
 612     exp.add((UChar32)11,(UChar32) 15);
 613     if (c == exp) {
 614         logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
 615     } else {
 616         errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
 617     }
 618
 619     exp = c;
 620     bitsToSet(setToBits(c), c);
 621     if (c == exp) {
 622         logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
 623     } else {
 624         errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
 625     }
 626
 627     // Additional tests for coverage JB#2118
 628     //UnicodeSet::complement(class UnicodeString const &)
 629     //UnicodeSet::complementAll(class UnicodeString const &)
 630     //UnicodeSet::containsNone(class UnicodeSet const &)
 631     //UnicodeSet::containsNone(long,long)
 632     //UnicodeSet::containsSome(class UnicodeSet const &)
 633     //UnicodeSet::containsSome(long,long)
 634     //UnicodeSet::removeAll(class UnicodeString const &)
 635     //UnicodeSet::retain(long)
 636     //UnicodeSet::retainAll(class UnicodeString const &)
 637     //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
 638     //UnicodeSetIterator::getString(void)
 639     set.clear();
 640     set.complement("ab");
 641     exp.applyPattern("[{ab}]", status);
 642     if (U_FAILURE(status)) { errln("FAIL"); return; }
 643     if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
 644
 645     UnicodeSetIterator iset(set);
 646     if (!iset.next() || !iset.isString()) {
 647         errln("FAIL: UnicodeSetIterator::next/isString");
 648     } else if (iset.getString() != "ab") {
 649         errln("FAIL: UnicodeSetIterator::getString");
 650     }
 651
 652     set.add((UChar32)0x61, (UChar32)0x7A);
 653     set.complementAll("alan");
 654     exp.applyPattern("[{ab}b-kmo-z]", status);
 655     if (U_FAILURE(status)) { errln("FAIL"); return; }
 656     if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
 657
 658     exp.applyPattern("[a-z]", status);
 659     if (U_FAILURE(status)) { errln("FAIL"); return; }
 660     if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
 661     if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
 662     exp.applyPattern("[aln]", status);
 663     if (U_FAILURE(status)) { errln("FAIL"); return; }
 664     if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
 665     if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
 666
 667     if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
 668         errln("FAIL: containsNone(UChar32, UChar32)");
 669     }
 670     if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
 671         errln("FAIL: containsSome(UChar32, UChar32)");
 672     }
 673     if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
 674         errln("FAIL: containsNone(UChar32, UChar32)");
 675     }
 676     if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
 677         errln("FAIL: containsSome(UChar32, UChar32)");
 678     }
 679
 680     set.removeAll("liu");
 681     exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
 682     if (U_FAILURE(status)) { errln("FAIL"); return; }
 683     if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
 684
 685     set.retainAll("star");
 686     exp.applyPattern("[rst]", status);
 687     if (U_FAILURE(status)) { errln("FAIL"); return; }
 688     if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
 689
 690     set.retain((UChar32)0x73);
 691     exp.applyPattern("[s]", status);
 692     if (U_FAILURE(status)) { errln("FAIL"); return; }
 693     if (set != exp) { errln("FAIL: retain('s')"); return; }
 694
 695     uint16_t buf[32];
 696     int32_t slen = set.serialize(buf, UPRV_LENGTHOF(buf), status);
 697     if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
 698     if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
 699         errln("FAIL: serialize");
 700         return;
 701     }
 702
 703     // Conversions to and from USet
 704     UnicodeSet *uniset = &set;
 705     USet *uset = uniset->toUSet();
 706     TEST_ASSERT((void *)uset == (void *)uniset);
 707     UnicodeSet *setx = UnicodeSet::fromUSet(uset);
 708     TEST_ASSERT((void *)setx == (void *)uset);
 709     const UnicodeSet *constSet = uniset;
 710     const USet *constUSet = constSet->toUSet();
 711     TEST_ASSERT((void *)constUSet == (void *)constSet);
 712     const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
 713     TEST_ASSERT((void *)constSetx == (void *)constUSet);
 714
 715     // span(UnicodeString) and spanBack(UnicodeString) convenience methods
 716     UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
 717     UnicodeSet ac(0x61, 0x63);
 718     ac.remove(0x62).freeze();
 719     if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
 720         ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
 721         ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
 722         ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
 723         ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
 724         ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
 725         ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
 726         ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
 727         ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
 728         ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
 729     ) {
 730         errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
 731     }
 732     if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
 733         ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
 734         ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
 735         ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
 736         ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
 737         ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
 738         ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
 739         ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
 740         ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
 741         ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
 742     ) {
 743         errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
 744     }
 745 }
 746
 747 void UnicodeSetTest::TestIteration() {
 748     UErrorCode ec = U_ZERO_ERROR;
 749     int i = 0;
 750     int outerLoop;
 751
 752     // 6 code points, 3 ranges, 2 strings, 8 total elements
 753     //   Iteration will access them in sorted order -  a, b, c, y, z, U0001abcd, "str1", "str2"
 754     UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
 755     TEST_ASSERT_SUCCESS(ec);
 756     UnicodeSetIterator it(set);
 757
 758     for (outerLoop=0; outerLoop<3; outerLoop++) {
 759         // Run the test multiple times, to check that iterator.reset() is working.
 760         for (i=0; i<10; i++) {
 761             UBool         nextv        = it.next();
 762             UBool         isString     = it.isString();
 763             int32_t       codePoint    = it.getCodepoint();
 764             //int32_t       codePointEnd = it.getCodepointEnd();
 765             UnicodeString s   = it.getString();
 766             switch (i) {
 767             case 0:
 768                 TEST_ASSERT(nextv == TRUE);
 769                 TEST_ASSERT(isString == FALSE);
 770                 TEST_ASSERT(codePoint==0x61);
 771                 TEST_ASSERT(s == "a");
 772                 break;
 773             case 1:
 774                 TEST_ASSERT(nextv == TRUE);
 775                 TEST_ASSERT(isString == FALSE);
 776                 TEST_ASSERT(codePoint==0x62);
 777                 TEST_ASSERT(s == "b");
 778                 break;
 779             case 2:
 780                 TEST_ASSERT(nextv == TRUE);
 781                 TEST_ASSERT(isString == FALSE);
 782                 TEST_ASSERT(codePoint==0x63);
 783                 TEST_ASSERT(s == "c");
 784                 break;
 785             case 3:
 786                 TEST_ASSERT(nextv == TRUE);
 787                 TEST_ASSERT(isString == FALSE);
 788                 TEST_ASSERT(codePoint==0x79);
 789                 TEST_ASSERT(s == "y");
 790                 break;
 791             case 4:
 792                 TEST_ASSERT(nextv == TRUE);
 793                 TEST_ASSERT(isString == FALSE);
 794                 TEST_ASSERT(codePoint==0x7a);
 795                 TEST_ASSERT(s == "z");
 796                 break;
 797             case 5:
 798                 TEST_ASSERT(nextv == TRUE);
 799                 TEST_ASSERT(isString == FALSE);
 800                 TEST_ASSERT(codePoint==0x1abcd);
 801                 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
 802                 break;
 803             case 6:
 804                 TEST_ASSERT(nextv == TRUE);
 805                 TEST_ASSERT(isString == TRUE);
 806                 TEST_ASSERT(s == "str1");
 807                 break;
 808             case 7:
 809                 TEST_ASSERT(nextv == TRUE);
 810                 TEST_ASSERT(isString == TRUE);
 811                 TEST_ASSERT(s == "str2");
 812                 break;
 813             case 8:
 814                 TEST_ASSERT(nextv == FALSE);
 815                 break;
 816             case 9:
 817                 TEST_ASSERT(nextv == FALSE);
 818                 break;
 819             }
 820         }
 821         it.reset();  // prepare to run the iteration again.
 822     }
 823 }
 824
 825
 826
 827
 828 void UnicodeSetTest::TestStrings() {
 829     UErrorCode ec = U_ZERO_ERROR;
 830
 831     UnicodeSet* testList[] = {
 832         UnicodeSet::createFromAll("abc"),
 833         new UnicodeSet("[a-c]", ec),
 834
 835         &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
 836         new UnicodeSet("[{ll}{ch}a-z]", ec),
 837
 838         UnicodeSet::createFrom("ab}c"),
 839         new UnicodeSet("[{ab\\}c}]", ec),
 840
 841         &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
 842         new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
 843
 844         NULL
 845     };
 846
 847     if (U_FAILURE(ec)) {
 848         errln("FAIL: couldn't construct test sets");
 849     }
 850
 851     for (int32_t i = 0; testList[i] != NULL; i+=2) {
 852         if (U_SUCCESS(ec)) {
 853             UnicodeString pat0, pat1;
 854             testList[i]->toPattern(pat0, TRUE);
 855             testList[i+1]->toPattern(pat1, TRUE);
 856             if (*testList[i] == *testList[i+1]) {
 857                 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
 858             } else {
 859                 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
 860             }
 861         }
 862         delete testList[i];
 863         delete testList[i+1];
 864     }
 865 }
 866
 867 /**
 868  * Test the [:Latin:] syntax.
 869  */
 870 void UnicodeSetTest::TestScriptSet() {
 871     expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
 872
 873     expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
 874
 875     /* Jitterbug 1423 */
 876     expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
 877
 878 }
 879
 880 /**
 881  * Test the [:Latin:] syntax.
 882  */
 883 void UnicodeSetTest::TestPropertySet() {
 884     static const char* const DATA[] = {
 885         // Pattern, Chars IN, Chars NOT in
 886
 887         "[:Latin:]",
 888         "aA",
 889         "\\u0391\\u03B1",
 890
 891         "[\\p{Greek}]",
 892         "\\u0391\\u03B1",
 893         "aA",
 894
 895         "\\P{ GENERAL Category = upper case letter }",
 896         "abc",
 897         "ABC",
 898
 899 #if !UCONFIG_NO_NORMALIZATION
 900         // Combining class: @since ICU 2.2
 901         // Check both symbolic and numeric
 902         "\\p{ccc=Nukta}",
 903         "\\u0ABC",
 904         "abc",
 905
 906         "\\p{Canonical Combining Class = 11}",
 907         "\\u05B1",
 908         "\\u05B2",
 909
 910         "[:c c c = iota subscript :]",
 911         "\\u0345",
 912         "xyz",
 913 #endif
 914
 915         // Bidi class: @since ICU 2.2
 916         "\\p{bidiclass=lefttoright}",
 917         "abc",
 918         "\\u0671\\u0672",
 919
 920         // Binary properties: @since ICU 2.2
 921         "\\p{ideographic}",
 922         "\\u4E0A",
 923         "x",
 924
 925         "[:math=false:]",
 926         "q)*(",
 927         // weiv: )(and * were removed from math in Unicode 4.0.1
 928         //"(*+)",
 929         "+<>^",
 930
 931         // JB#1767 \N{}, \p{ASCII}
 932         "[:Ascii:]",
 933         "abc\\u0000\\u007F",
 934         "\\u0080\\u4E00",
 935
 936         "[\\N{ latin small letter  a  }[:name= latin small letter z:]]",
 937         "az",
 938         "qrs",
 939
 940         // JB#2015
 941         "[:any:]",
 942         "a\\U0010FFFF",
 943         "",
 944
 945         "[:nv=0.5:]",
 946         "\\u00BD\\u0F2A",
 947         "\\u00BC",
 948
 949         // JB#2653: Age
 950         "[:Age=1.1:]",
 951         "\\u03D6", // 1.1
 952         "\\u03D8\\u03D9", // 3.2
 953
 954         "[:Age=3.1:]",
 955         "\\u1800\\u3400\\U0002f800",
 956         "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
 957
 958         // JB#2350: Case_Sensitive
 959         "[:Case Sensitive:]",
 960         "A\\u1FFC\\U00010410",
 961         ";\\u00B4\\U00010500",
 962
 963         // JB#2832: C99-compatibility props
 964         "[:blank:]",
 965         " \\u0009",
 966         "1-9A-Z",
 967
 968         "[:graph:]",
 969         "19AZ",
 970         " \\u0003\\u0007\\u0009\\u000A\\u000D",
 971
 972         "[:punct:]",
 973         "!@#%&*()[]{}-_\\/;:,.?'\"",
 974         "09azAZ",
 975
 976         "[:xdigit:]",
 977         "09afAF",
 978         "gG!",
 979
 980         // Regex compatibility test
 981         "[-b]", // leading '-' is literal
 982         "-b",
 983         "ac",
 984
 985         "[^-b]", // leading '-' is literal
 986         "ac",
 987         "-b",
 988
 989         "[b-]", // trailing '-' is literal
 990         "-b",
 991         "ac",
 992
 993         "[^b-]", // trailing '-' is literal
 994         "ac",
 995         "-b",
 996
 997         "[a-b-]", // trailing '-' is literal
 998         "ab-",
 999         "c=",
1000
1001         "[[a-q]&[p-z]-]", // trailing '-' is literal
1002         "pq-",
1003         "or=",
1004
1005         "[\\s|\\)|:|$|\\>]", // from regex tests
1006         "s|):$>",
1007         "abc",
1008
1009         "[\\uDC00cd]", // JB#2906: isolated trail at start
1010         "cd\\uDC00",
1011         "ab\\uD800\\U00010000",
1012
1013         "[ab\\uD800]", // JB#2906: isolated trail at start
1014         "ab\\uD800",
1015         "cd\\uDC00\\U00010000",
1016
1017         "[ab\\uD800cd]", // JB#2906: isolated lead in middle
1018         "abcd\\uD800",
1019         "ef\\uDC00\\U00010000",
1020
1021         "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
1022         "abcd\\uDC00",
1023         "ef\\uD800\\U00010000",
1024
1025 #if !UCONFIG_NO_NORMALIZATION
1026         "[:^lccc=0:]", // Lead canonical class
1027         "\\u0300\\u0301",
1028         "abcd\\u00c0\\u00c5",
1029
1030         "[:^tccc=0:]", // Trail canonical class
1031         "\\u0300\\u0301\\u00c0\\u00c5",
1032         "abcd",
1033
1034         "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
1035         "\\u0300\\u0301\\u00c0\\u00c5",
1036         "abcd",
1037
1038         "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
1039         "",
1040         "abcd\\u0300\\u0301\\u00c0\\u00c5",
1041
1042         "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
1043         "\\u0F73\\u0F75\\u0F81",
1044         "abcd\\u0300\\u0301\\u00c0\\u00c5",
1045 #endif /* !UCONFIG_NO_NORMALIZATION */
1046
1047         "[:Assigned:]",
1048         "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
1049         "\\u0888\\uFDD3\\uFFFE\\U00050005",
1050
1051         // Script_Extensions, new in Unicode 6.0
1052         "[:scx=Arab:]",
1053         "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
1054         "\\u061D\\uFDEF\\uFDFE",
1055
1056         // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
1057         // so scx-sc is missing U+FDF2.
1058         "[[:Script_Extensions=Arabic:]-[:Arab:]]",
1059         "\\u0640\\u064B\\u0650\\u0655",
1060         "\\uFDF2"
1061     };
1062
1063     static const int32_t DATA_LEN = UPRV_LENGTHOF(DATA);
1064
1065     for (int32_t i=0; i<DATA_LEN; i+=3) {
1066         expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
1067                           CharsToUnicodeString(DATA[i+2]));
1068     }
1069 }
1070
1071 /**
1072   * Test that Posix style character classes [:digit:], etc.
1073   *   have the Unicode definitions from TR 18.
1074   */
1075 void UnicodeSetTest::TestPosixClasses() {
1076     {
1077         UErrorCode status = U_ZERO_ERROR;
1078         UnicodeSet s1("[:alpha:]", status);
1079         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
1080         TEST_ASSERT_SUCCESS(status);
1081         TEST_ASSERT(s1==s2);
1082     }
1083     {
1084         UErrorCode status = U_ZERO_ERROR;
1085         UnicodeSet s1("[:lower:]", status);
1086         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
1087         TEST_ASSERT_SUCCESS(status);
1088         TEST_ASSERT(s1==s2);
1089     }
1090     {
1091         UErrorCode status = U_ZERO_ERROR;
1092         UnicodeSet s1("[:upper:]", status);
1093         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
1094         TEST_ASSERT_SUCCESS(status);
1095         TEST_ASSERT(s1==s2);
1096     }
1097     {
1098         UErrorCode status = U_ZERO_ERROR;
1099         UnicodeSet s1("[:punct:]", status);
1100         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
1101         TEST_ASSERT_SUCCESS(status);
1102         TEST_ASSERT(s1==s2);
1103     }
1104     {
1105         UErrorCode status = U_ZERO_ERROR;
1106         UnicodeSet s1("[:digit:]", status);
1107         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
1108         TEST_ASSERT_SUCCESS(status);
1109         TEST_ASSERT(s1==s2);
1110     }
1111     {
1112         UErrorCode status = U_ZERO_ERROR;
1113         UnicodeSet s1("[:xdigit:]", status);
1114         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
1115         TEST_ASSERT_SUCCESS(status);
1116         TEST_ASSERT(s1==s2);
1117     }
1118     {
1119         UErrorCode status = U_ZERO_ERROR;
1120         UnicodeSet s1("[:alnum:]", status);
1121         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
1122         TEST_ASSERT_SUCCESS(status);
1123         TEST_ASSERT(s1==s2);
1124     }
1125     {
1126         UErrorCode status = U_ZERO_ERROR;
1127         UnicodeSet s1("[:space:]", status);
1128         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
1129         TEST_ASSERT_SUCCESS(status);
1130         TEST_ASSERT(s1==s2);
1131     }
1132     {
1133         UErrorCode status = U_ZERO_ERROR;
1134         UnicodeSet s1("[:blank:]", status);
1135         TEST_ASSERT_SUCCESS(status);
1136         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
1137             status);
1138         TEST_ASSERT_SUCCESS(status);
1139         TEST_ASSERT(s1==s2);
1140     }
1141     {
1142         UErrorCode status = U_ZERO_ERROR;
1143         UnicodeSet s1("[:cntrl:]", status);
1144         TEST_ASSERT_SUCCESS(status);
1145         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
1146         TEST_ASSERT_SUCCESS(status);
1147         TEST_ASSERT(s1==s2);
1148     }
1149     {
1150         UErrorCode status = U_ZERO_ERROR;
1151         UnicodeSet s1("[:graph:]", status);
1152         TEST_ASSERT_SUCCESS(status);
1153         UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
1154         TEST_ASSERT_SUCCESS(status);
1155         TEST_ASSERT(s1==s2);
1156     }
1157     {
1158         UErrorCode status = U_ZERO_ERROR;
1159         UnicodeSet s1("[:print:]", status);
1160         TEST_ASSERT_SUCCESS(status);
1161         UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
1162         TEST_ASSERT_SUCCESS(status);
1163         TEST_ASSERT(s1==s2);
1164     }
1165 }
1166 /**
1167  * Test cloning of UnicodeSet.  For C++, we test the copy constructor.
1168  */
1169 void UnicodeSetTest::TestClone() {
1170     UErrorCode ec = U_ZERO_ERROR;
1171     UnicodeSet s("[abcxyz]", ec);
1172     UnicodeSet t(s);
1173     expectContainment(t, "abc", "def");
1174 }
1175
1176 /**
1177  * Test the indexOf() and charAt() methods.
1178  */
1179 void UnicodeSetTest::TestIndexOf() {
1180     UErrorCode ec = U_ZERO_ERROR;
1181     UnicodeSet set("[a-cx-y3578]", ec);
1182     if (U_FAILURE(ec)) {
1183         errln("FAIL: UnicodeSet constructor");
1184         return;
1185     }
1186     for (int32_t i=0; i<set.size(); ++i) {
1187         UChar32 c = set.charAt(i);
1188         if (set.indexOf(c) != i) {
1189             errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1190                 i, c, set.indexOf(c));
1191         }
1192     }
1193     UChar32 c = set.charAt(set.size());
1194     if (c != -1) {
1195         errln("FAIL: charAt(<out of range>) = %X", c);
1196     }
1197     int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
1198     if (j != -1) {
1199         errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1200     }
1201 }
1202
1203 /**
1204  * Test closure API.
1205  */
1206 void UnicodeSetTest::TestCloseOver() {
1207     UErrorCode ec = U_ZERO_ERROR;
1208
1209     char CASE[] = {(char)USET_CASE_INSENSITIVE};
1210     char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1211     const char* DATA[] = {
1212         // selector, input, output
1213         CASE,
1214         "[aq\\u00DF{Bc}{bC}{Fi}]",
1215         "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]",  // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
1216
1217         CASE,
1218         "[\\u01F1]", // 'DZ'
1219         "[\\u01F1\\u01F2\\u01F3]",
1220
1221         CASE,
1222         "[\\u1FB4]",
1223         "[\\u1FB4{\\u03AC\\u03B9}]",
1224
1225         CASE,
1226         "[{F\\uFB01}]",
1227         "[\\uFB03{ffi}]",
1228
1229         CASE, // make sure binary search finds limits
1230         "[a\\uFF3A]",
1231         "[aA\\uFF3A\\uFF5A]",
1232
1233         CASE,
1234         "[a-z]","[A-Za-z\\u017F\\u212A]",
1235         CASE,
1236         "[abc]","[A-Ca-c]",
1237         CASE,
1238         "[ABC]","[A-Ca-c]",
1239
1240         CASE, "[i]", "[iI]",
1241
1242         CASE, "[\\u0130]",          "[\\u0130{i\\u0307}]", // dotted I
1243         CASE, "[{i\\u0307}]",       "[\\u0130{i\\u0307}]", // i with dot
1244
1245         CASE, "[\\u0131]",          "[\\u0131]", // dotless i
1246
1247         CASE, "[\\u0390]",          "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1248
1249         CASE, "[\\u03c2]",          "[\\u03a3\\u03c2\\u03c3]", // sigmas
1250
1251         CASE, "[\\u03f2]",          "[\\u03f2\\u03f9]", // lunate sigmas
1252
1253         CASE, "[\\u03f7]",          "[\\u03f7\\u03f8]",
1254
1255         CASE, "[\\u1fe3]",          "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1256
1257         CASE, "[\\ufb05]",          "[\\ufb05\\ufb06{st}]",
1258         CASE, "[{st}]",             "[\\ufb05\\ufb06{st}]",
1259
1260         CASE, "[\\U0001044F]",      "[\\U00010427\\U0001044F]",
1261
1262         CASE, "[{a\\u02BE}]",       "[\\u1E9A{a\\u02BE}]", // first in sorted table
1263
1264         CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1265
1266 #if !UCONFIG_NO_FILE_IO
1267         CASE_MAPPINGS,
1268         "[aq\\u00DF{Bc}{bC}{Fi}]",
1269         "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1270 #endif
1271
1272         CASE_MAPPINGS,
1273         "[\\u01F1]", // 'DZ'
1274         "[\\u01F1\\u01F2\\u01F3]",
1275
1276         CASE_MAPPINGS,
1277         "[a-z]",
1278         "[A-Za-z]",
1279
1280         NULL
1281     };
1282
1283     UnicodeSet s;
1284     UnicodeSet t;
1285     UnicodeString buf;
1286     for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1287         int32_t selector = DATA[i][0];
1288         UnicodeString pat(DATA[i+1], -1, US_INV);
1289         UnicodeString exp(DATA[i+2], -1, US_INV);
1290         s.applyPattern(pat, ec);
1291         s.closeOver(selector);
1292         t.applyPattern(exp, ec);
1293         if (U_FAILURE(ec)) {
1294             errln("FAIL: applyPattern failed");
1295             continue;
1296         }
1297         if (s == t) {
1298             logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1299         } else {
1300             dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1301                   s.toPattern(buf, TRUE) + ", expected " + exp);
1302         }
1303     }
1304
1305 #if 0
1306     /*
1307      * Unused test code.
1308      * This was used to compare the old implementation (using USET_CASE)
1309      * with the new one (using 0x100 temporarily)
1310      * while transitioning from hardcoded case closure tables in uniset.cpp
1311      * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1312      * and using ucase.c functions for closure.
1313      * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1314      *
1315      * Note: The old and new implementation never fully matched because
1316      * the old implementation turned out to not map U+0130 and U+0131 correctly
1317      * (dotted I and dotless i) and because the old implementation's data tables
1318      * were outdated compared to Unicode 4.0.1 at the time of the change to the
1319      * new implementation. (So sigmas and some other characters were not handled
1320      * according to the newer Unicode version.)
1321      */
1322     UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1323     UnicodeSetIterator si(sens);
1324     UnicodeString str, buf2;
1325     const UnicodeString *pStr;
1326     UChar32 c;
1327     while(si.next()) {
1328         if(!si.isString()) {
1329             c=si.getCodepoint();
1330             s.clear();
1331             s.add(c);
1332
1333             str.setTo(c);
1334             str.foldCase();
1335             sens2.add(str);
1336
1337             t=s;
1338             s.closeOver(USET_CASE);
1339             t.closeOver(0x100);
1340             if(s!=t) {
1341                 errln("FAIL: closeOver(U+%04x) differs: ", c);
1342                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1343             }
1344         }
1345     }
1346     // remove all code points
1347     // should contain all full case folding mapping strings
1348     sens2.remove(0, 0x10ffff);
1349     si.reset(sens2);
1350     while(si.next()) {
1351         if(si.isString()) {
1352             pStr=&si.getString();
1353             s.clear();
1354             s.add(*pStr);
1355             t=s2=s;
1356             s.closeOver(USET_CASE);
1357             t.closeOver(0x100);
1358             if(s!=t) {
1359                 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1360                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1361             }
1362         }
1363     }
1364 #endif
1365
1366     // Test the pattern API
1367     s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1368     if (U_FAILURE(ec)) {
1369         errln("FAIL: applyPattern failed");
1370     } else {
1371         expectContainment(s, "abcABC", "defDEF");
1372     }
1373     UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1374     if (U_FAILURE(ec)) {
1375         errln("FAIL: constructor failed");
1376     } else {
1377         expectContainment(v, "defDEF", "abcABC");
1378     }
1379     UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1380     if (U_FAILURE(ec)) {
1381         errln("FAIL: construct w/case mappings failed");
1382     } else {
1383         expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1384     }
1385 }
1386
1387 void UnicodeSetTest::TestEscapePattern() {
1388     const char pattern[] =
1389         "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1390     const char exp[] =
1391         "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1392     // We test this with two passes; in the second pass we
1393     // pre-unescape the pattern.  Since U+200E is Pattern_White_Space,
1394     // this fails -- which is what we expect.
1395     for (int32_t pass=1; pass<=2; ++pass) {
1396         UErrorCode ec = U_ZERO_ERROR;
1397         UnicodeString pat(pattern, -1, US_INV);
1398         if (pass==2) {
1399             pat = pat.unescape();
1400         }
1401         // Pattern is only good for pass 1
1402         UBool isPatternValid = (pass==1);
1403
1404         UnicodeSet set(pat, ec);
1405         if (U_SUCCESS(ec) != isPatternValid){
1406             errln((UnicodeString)"FAIL: applyPattern(" +
1407                   escape(pat) + ") => " +
1408                   u_errorName(ec));
1409             continue;
1410         }
1411         if (U_FAILURE(ec)) {
1412             continue;
1413         }
1414         if (set.contains((UChar)0x0644)){
1415             errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1416         }
1417
1418         UnicodeString newpat;
1419         set.toPattern(newpat, TRUE);
1420         if (newpat == UnicodeString(exp, -1, US_INV)) {
1421             logln(escape(pat) + " => " + newpat);
1422         } else {
1423             errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1424         }
1425
1426         for (int32_t i=0; i<set.getRangeCount(); ++i) {
1427             UnicodeString str("Range ");
1428             str.append((UChar)(0x30 + i))
1429                 .append(": ")
1430                 .append((UChar32)set.getRangeStart(i))
1431                 .append(" - ")
1432                 .append((UChar32)set.getRangeEnd(i));
1433             str = str + " (" + set.getRangeStart(i) + " - " +
1434                 set.getRangeEnd(i) + ")";
1435             if (set.getRangeStart(i) < 0) {
1436                 errln((UnicodeString)"FAIL: " + escape(str));
1437             } else {
1438                 logln(escape(str));
1439             }
1440         }
1441     }
1442 }
1443
1444 void UnicodeSetTest::expectRange(const UnicodeString& label,
1445                                  const UnicodeSet& set,
1446                                  UChar32 start, UChar32 end) {
1447     UnicodeSet exp(start, end);
1448     UnicodeString pat;
1449     if (set == exp) {
1450         logln(label + " => " + set.toPattern(pat, TRUE));
1451     } else {
1452         UnicodeString xpat;
1453         errln((UnicodeString)"FAIL: " + label + " => " +
1454               set.toPattern(pat, TRUE) +
1455               ", expected " + exp.toPattern(xpat, TRUE));
1456     }
1457 }
1458
1459 void UnicodeSetTest::TestInvalidCodePoint() {
1460
1461     const UChar32 DATA[] = {
1462         // Test range             Expected range
1463         0, 0x10FFFF,              0, 0x10FFFF,
1464         (UChar32)-1, 8,           0, 8,
1465         8, 0x110000,              8, 0x10FFFF
1466     };
1467     const int32_t DATA_LENGTH = UPRV_LENGTHOF(DATA);
1468
1469     UnicodeString pat;
1470     int32_t i;
1471
1472     for (i=0; i<DATA_LENGTH; i+=4) {
1473         UChar32 start  = DATA[i];
1474         UChar32 end    = DATA[i+1];
1475         UChar32 xstart = DATA[i+2];
1476         UChar32 xend   = DATA[i+3];
1477
1478         // Try various API using the test code points
1479
1480         UnicodeSet set(start, end);
1481         expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1482                     set, xstart, xend);
1483
1484         set.clear();
1485         set.set(start, end);
1486         expectRange((UnicodeString)"set(" + start + "," + end + ")",
1487                     set, xstart, xend);
1488
1489         UBool b = set.contains(start);
1490         b = set.contains(start, end);
1491         b = set.containsNone(start, end);
1492         b = set.containsSome(start, end);
1493         (void)b;   // Suppress set but not used warning.
1494
1495         /*int32_t index = set.indexOf(start);*/
1496
1497         set.clear();
1498         set.add(start);
1499         set.add(start, end);
1500         expectRange((UnicodeString)"add(" + start + "," + end + ")",
1501                     set, xstart, xend);
1502
1503         set.set(0, 0x10FFFF);
1504         set.retain(start, end);
1505         expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1506                     set, xstart, xend);
1507         set.retain(start);
1508
1509         set.set(0, 0x10FFFF);
1510         set.remove(start);
1511         set.remove(start, end);
1512         set.complement();
1513         expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1514                     set, xstart, xend);
1515
1516         set.set(0, 0x10FFFF);
1517         set.complement(start, end);
1518         set.complement();
1519         expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1520                     set, xstart, xend);
1521         set.complement(start);
1522     }
1523
1524     const UChar32 DATA2[] = {
1525         0,
1526         0x10FFFF,
1527         (UChar32)-1,
1528         0x110000
1529     };
1530     const int32_t DATA2_LENGTH = UPRV_LENGTHOF(DATA2);
1531
1532     for (i=0; i<DATA2_LENGTH; ++i) {
1533         UChar32 c = DATA2[i], end = 0x10FFFF;
1534         UBool valid = (c >= 0 && c <= 0x10FFFF);
1535
1536         UnicodeSet set(0, 0x10FFFF);
1537
1538         // For single-codepoint contains, invalid codepoints are NOT contained
1539         UBool b = set.contains(c);
1540         if (b == valid) {
1541             logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1542                   ") = " + b);
1543         } else {
1544             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1545                   ") = " + b);
1546         }
1547
1548         // For codepoint range contains, containsNone, and containsSome,
1549         // invalid or empty (start > end) ranges have UNDEFINED behavior.
1550         b = set.contains(c, end);
1551         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1552               "," + end + ") = " + b);
1553
1554         b = set.containsNone(c, end);
1555         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1556               "," + end + ") = " + b);
1557
1558         b = set.containsSome(c, end);
1559         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1560               "," + end + ") = " + b);
1561
1562         int32_t index = set.indexOf(c);
1563         if ((index >= 0) == valid) {
1564             logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1565                   ") = " + index);
1566         } else {
1567             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1568                   ") = " + index);
1569         }
1570     }
1571 }
1572
1573 // Used by TestSymbolTable
1574 class TokenSymbolTable : public SymbolTable {
1575 public:
1576     Hashtable contents;
1577
1578     TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1579         contents.setValueDeleter(uprv_deleteUObject);
1580     }
1581
1582     ~TokenSymbolTable() {}
1583
1584     /**
1585      * (Non-SymbolTable API) Add the given variable and value to
1586      * the table.  Variable should NOT contain leading '$'.
1587      */
1588     void add(const UnicodeString& var, const UnicodeString& value,
1589              UErrorCode& ec) {
1590         if (U_SUCCESS(ec)) {
1591             contents.put(var, new UnicodeString(value), ec);
1592         }
1593     }
1594
1595     /**
1596      * SymbolTable API
1597      */
1598     virtual const UnicodeString* lookup(const UnicodeString& s) const {
1599         return (const UnicodeString*) contents.get(s);
1600     }
1601
1602     /**
1603      * SymbolTable API
1604      */
1605     virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1606         return NULL;
1607     }
1608
1609     /**
1610      * SymbolTable API
1611      */
1612     virtual UnicodeString parseReference(const UnicodeString& text,
1613                                          ParsePosition& pos, int32_t limit) const {
1614         int32_t start = pos.getIndex();
1615         int32_t i = start;
1616         UnicodeString result;
1617         while (i < limit) {
1618             UChar c = text.charAt(i);
1619             if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1620                 break;
1621             }
1622             ++i;
1623         }
1624         if (i == start) { // No valid name chars
1625             return result; // Indicate failure with empty string
1626         }
1627         pos.setIndex(i);
1628         text.extractBetween(start, i, result);
1629         return result;
1630     }
1631 };
1632
1633 void UnicodeSetTest::TestSymbolTable() {
1634     // Multiple test cases can be set up here.  Each test case
1635     // is terminated by null:
1636     // var, value, var, value,..., input pat., exp. output pat., null
1637     const char* DATA[] = {
1638         "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1639         "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1640         "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1641         NULL
1642     };
1643
1644     for (int32_t i=0; DATA[i]!=NULL; ++i) {
1645         UErrorCode ec = U_ZERO_ERROR;
1646         TokenSymbolTable sym(ec);
1647         if (U_FAILURE(ec)) {
1648             errln("FAIL: couldn't construct TokenSymbolTable");
1649             continue;
1650         }
1651
1652         // Set up variables
1653         while (DATA[i+2] != NULL) {
1654             sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
1655             if (U_FAILURE(ec)) {
1656                 errln("FAIL: couldn't add to TokenSymbolTable");
1657                 continue;
1658             }
1659             i += 2;
1660         }
1661
1662         // Input pattern and expected output pattern
1663         UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
1664         i += 2;
1665
1666         ParsePosition pos(0);
1667         UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1668         if (U_FAILURE(ec)) {
1669             errln("FAIL: couldn't construct UnicodeSet");
1670             continue;
1671         }
1672
1673         // results
1674         if (pos.getIndex() != inpat.length()) {
1675             errln((UnicodeString)"Failed to read to end of string \""
1676                   + inpat + "\": read to "
1677                   + pos.getIndex() + ", length is "
1678                   + inpat.length());
1679         }
1680
1681         UnicodeSet us2(exppat, ec);
1682         if (U_FAILURE(ec)) {
1683             errln("FAIL: couldn't construct expected UnicodeSet");
1684             continue;
1685         }
1686
1687         UnicodeString a, b;
1688         if (us != us2) {
1689             errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1690                   ", expected " + us2.toPattern(b, TRUE));
1691         } else {
1692             logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1693         }
1694     }
1695 }
1696
1697 void UnicodeSetTest::TestSurrogate() {
1698     const char* DATA[] = {
1699         // These should all behave identically
1700         "[abc\\uD800\\uDC00]",
1701         // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1702         "[abc\\U00010000]",
1703         0
1704     };
1705     for (int i=0; DATA[i] != 0; ++i) {
1706         UErrorCode ec = U_ZERO_ERROR;
1707         logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
1708         UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
1709         UnicodeSet set(str, ec);
1710         if (U_FAILURE(ec)) {
1711             errln("FAIL: UnicodeSet constructor");
1712             continue;
1713         }
1714         expectContainment(set,
1715                           CharsToUnicodeString("abc\\U00010000"),
1716                           CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1717         if (set.size() != 4) {
1718             errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
1719                   set.size() + ", expected 4");
1720         }
1721
1722         {
1723           UErrorCode subErr = U_ZERO_ERROR;
1724           checkRoundTrip(set);
1725           checkSerializeRoundTrip(set, subErr);
1726         }
1727     }
1728 }
1729
1730 void UnicodeSetTest::TestExhaustive() {
1731     // exhaustive tests. Simulate UnicodeSets with integers.
1732     // That gives us very solid tests (except for large memory tests).
1733
1734     int32_t limit = 128;
1735
1736     UnicodeSet x, y, z, aa;
1737
1738     for (int32_t i = 0; i < limit; ++i) {
1739         bitsToSet(i, x);
1740         logln((UnicodeString)"Testing " + i + ", " + x);
1741         _testComplement(i, x, y);
1742
1743         UnicodeSet &toTest = bitsToSet(i, aa);
1744
1745         // AS LONG AS WE ARE HERE, check roundtrip
1746         checkRoundTrip(toTest);
1747         UErrorCode ec = U_ZERO_ERROR;
1748         checkSerializeRoundTrip(toTest, ec);
1749
1750         for (int32_t j = 0; j < limit; ++j) {
1751             _testAdd(i,j,  x,y,z);
1752             _testXor(i,j,  x,y,z);
1753             _testRetain(i,j,  x,y,z);
1754             _testRemove(i,j,  x,y,z);
1755         }
1756     }
1757 }
1758
1759 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1760     bitsToSet(a, x);
1761     z = x;
1762     z.complement();
1763     int32_t c = setToBits(z);
1764     if (c != (~a)) {
1765         errln((UnicodeString)"FAILED: add: ~" + x +  " != " + z);
1766         errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1767     }
1768     checkCanonicalRep(z, (UnicodeString)"complement " + a);
1769 }
1770
1771 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1772     bitsToSet(a, x);
1773     bitsToSet(b, y);
1774     z = x;
1775     z.addAll(y);
1776     int32_t c = setToBits(z);
1777     if (c != (a | b)) {
1778         errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1779         errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1780     }
1781     checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1782 }
1783
1784 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1785     bitsToSet(a, x);
1786     bitsToSet(b, y);
1787     z = x;
1788     z.retainAll(y);
1789     int32_t c = setToBits(z);
1790     if (c != (a & b)) {
1791         errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1792         errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1793     }
1794     checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1795 }
1796
1797 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1798     bitsToSet(a, x);
1799     bitsToSet(b, y);
1800     z = x;
1801     z.removeAll(y);
1802     int32_t c = setToBits(z);
1803     if (c != (a &~ b)) {
1804         errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1805         errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1806     }
1807     checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1808 }
1809
1810 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1811     bitsToSet(a, x);
1812     bitsToSet(b, y);
1813     z = x;
1814     z.complementAll(y);
1815     int32_t c = setToBits(z);
1816     if (c != (a ^ b)) {
1817         errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1818         errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1819     }
1820     checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1821 }
1822
1823 /**
1824  * Check that ranges are monotonically increasing and non-
1825  * overlapping.
1826  */
1827 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1828     int32_t n = set.getRangeCount();
1829     if (n < 0) {
1830         errln((UnicodeString)"FAIL result of " + msg +
1831               ": range count should be >= 0 but is " +
1832               n /*+ " for " + set.toPattern())*/);
1833         return;
1834     }
1835     UChar32 last = 0;
1836     for (int32_t i=0; i<n; ++i) {
1837         UChar32 start = set.getRangeStart(i);
1838         UChar32 end = set.getRangeEnd(i);
1839         if (start > end) {
1840             errln((UnicodeString)"FAIL result of " + msg +
1841                   ": range " + (i+1) +
1842                   " start > end: " + (int)start + ", " + (int)end +
1843                   " for " + set);
1844         }
1845         if (i > 0 && start <= last) {
1846             errln((UnicodeString)"FAIL result of " + msg +
1847                   ": range " + (i+1) +
1848                   " overlaps previous range: " + (int)start + ", " + (int)end +
1849                   " for " + set);
1850         }
1851         last = end;
1852     }
1853 }
1854
1855 /**
1856  * Convert a bitmask to a UnicodeSet.
1857  */
1858 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1859     result.clear();
1860     for (UChar32 i = 0; i < 32; ++i) {
1861         if ((a & (1<<i)) != 0) {
1862             result.add(i);
1863         }
1864     }
1865     return result;
1866 }
1867
1868 /**
1869  * Convert a UnicodeSet to a bitmask.  Only the characters
1870  * U+0000 to U+0020 are represented in the bitmask.
1871  */
1872 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1873     int32_t result = 0;
1874     for (int32_t i = 0; i < 32; ++i) {
1875         if (x.contains((UChar32)i)) {
1876             result |= (1<<i);
1877         }
1878     }
1879     return result;
1880 }
1881
1882 /**
1883  * Return the representation of an inversion list based UnicodeSet
1884  * as a pairs list.  Ranges are listed in ascending Unicode order.
1885  * For example, the set [a-zA-M3] is represented as "33AMaz".
1886  */
1887 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1888     UnicodeString pairs;
1889     for (int32_t i=0; i<set.getRangeCount(); ++i) {
1890         UChar32 start = set.getRangeStart(i);
1891         UChar32 end = set.getRangeEnd(i);
1892         if (end > 0xFFFF) {
1893             end = 0xFFFF;
1894             i = set.getRangeCount(); // Should be unnecessary
1895         }
1896         pairs.append((UChar)start).append((UChar)end);
1897     }
1898     return pairs;
1899 }
1900
1901 /**
1902  * Basic consistency check for a few items.
1903  * That the iterator works, and that we can create a pattern and
1904  * get the same thing back
1905  */
1906 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1907     {
1908         UnicodeSet t(s);
1909         checkEqual(s, t, "copy ct");
1910     }
1911
1912     {
1913         UnicodeSet t(0xabcd, 0xdef0);  // dummy contents should be overwritten
1914         t = s;
1915         checkEqual(s, t, "operator=");
1916     }
1917
1918     {
1919         UnicodeSet t;
1920         copyWithIterator(t, s, FALSE);
1921         checkEqual(s, t, "iterator roundtrip");
1922     }
1923
1924     {
1925         UnicodeSet t;
1926         copyWithIterator(t, s, TRUE); // try range
1927         checkEqual(s, t, "iterator roundtrip");
1928     }
1929
1930     {
1931         UnicodeSet t;
1932         UnicodeString pat;
1933         UErrorCode ec = U_ZERO_ERROR;
1934         s.toPattern(pat, FALSE);
1935         t.applyPattern(pat, ec);
1936         if (U_FAILURE(ec)) {
1937             errln("FAIL: toPattern(escapeUnprintable=FALSE), applyPattern - %s", u_errorName(ec));
1938             return;
1939         } else {
1940             checkEqual(s, t, "toPattern(false)");
1941         }
1942     }
1943
1944     {
1945         UnicodeSet t;
1946         UnicodeString pat;
1947         UErrorCode ec = U_ZERO_ERROR;
1948         s.toPattern(pat, TRUE);
1949         t.applyPattern(pat, ec);
1950         if (U_FAILURE(ec)) {
1951             errln("FAIL: toPattern(escapeUnprintable=TRUE), applyPattern - %s", u_errorName(ec));
1952             return;
1953         } else {
1954             checkEqual(s, t, "toPattern(true)");
1955         }
1956     }
1957 }
1958
1959 void UnicodeSetTest::checkSerializeRoundTrip(const UnicodeSet& t, UErrorCode &status) {
1960   if(U_FAILURE(status)) return;
1961   int32_t len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
1962   if(status == U_BUFFER_OVERFLOW_ERROR) {
1963     status = U_ZERO_ERROR;
1964     serializeBuffer.resize(len);
1965     len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
1966     // let 2nd error stand
1967   }
1968   if(U_FAILURE(status)) {
1969     errln("checkSerializeRoundTrip: error %s serializing buffer\n", u_errorName(status));
1970     return;
1971   }
1972   UnicodeSet deserialized(serializeBuffer.getAlias(), len, UnicodeSet::kSerialized, status);
1973   if(U_FAILURE(status)) {
1974     errln("checkSerializeRoundTrip: error %s deserializing buffer: buf %p len %d, original %d\n", u_errorName(status), serializeBuffer.getAlias(), len, t.getRangeCount());
1975     return;
1976   }
1977
1978   checkEqual(t, deserialized, "Set was unequal when deserialized");
1979 }
1980
1981 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
1982     t.clear();
1983     UnicodeSetIterator it(s);
1984     if (withRange) {
1985         while (it.nextRange()) {
1986             if (it.isString()) {
1987                 t.add(it.getString());
1988             } else {
1989                 t.add(it.getCodepoint(), it.getCodepointEnd());
1990             }
1991         }
1992     } else {
1993         while (it.next()) {
1994             if (it.isString()) {
1995                 t.add(it.getString());
1996             } else {
1997                 t.add(it.getCodepoint());
1998             }
1999         }
2000     }
2001 }
2002
2003 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
2004   assertEquals(UnicodeString("RangeCount: ","") + message, s.getRangeCount(), t.getRangeCount());
2005   assertEquals(UnicodeString("size: ","") + message, s.size(), t.size());
2006     UnicodeString source; s.toPattern(source, TRUE);
2007     UnicodeString result; t.toPattern(result, TRUE);
2008     if (s != t) {
2009         errln((UnicodeString)"FAIL: " + message
2010               + "; source = " + source
2011               + "; result = " + result
2012               );
2013         return FALSE;
2014     } else {
2015         logln((UnicodeString)"Ok: " + message
2016               + "; source = " + source
2017               + "; result = " + result
2018               );
2019     }
2020     return TRUE;
2021 }
2022
2023 void
2024 UnicodeSetTest::expectContainment(const UnicodeString& pat,
2025                                   const UnicodeString& charsIn,
2026                                   const UnicodeString& charsOut) {
2027     UErrorCode ec = U_ZERO_ERROR;
2028     UnicodeSet set(pat, ec);
2029     if (U_FAILURE(ec)) {
2030         dataerrln((UnicodeString)"FAIL: pattern \"" +
2031               pat + "\" => " + u_errorName(ec));
2032         return;
2033     }
2034     expectContainment(set, pat, charsIn, charsOut);
2035 }
2036
2037 void
2038 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2039                                   const UnicodeString& charsIn,
2040                                   const UnicodeString& charsOut) {
2041     UnicodeString pat;
2042     set.toPattern(pat);
2043     expectContainment(set, pat, charsIn, charsOut);
2044 }
2045
2046 void
2047 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2048                                   const UnicodeString& setName,
2049                                   const UnicodeString& charsIn,
2050                                   const UnicodeString& charsOut) {
2051     UnicodeString bad;
2052     UChar32 c;
2053     int32_t i;
2054
2055     for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
2056         c = charsIn.char32At(i);
2057         if (!set.contains(c)) {
2058             bad.append(c);
2059         }
2060     }
2061     if (bad.length() > 0) {
2062         errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
2063               ", expected containment of " + prettify(charsIn));
2064     } else {
2065         logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
2066     }
2067
2068     bad.truncate(0);
2069     for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
2070         c = charsOut.char32At(i);
2071         if (set.contains(c)) {
2072             bad.append(c);
2073         }
2074     }
2075     if (bad.length() > 0) {
2076         errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
2077               ", expected non-containment of " + prettify(charsOut));
2078     } else {
2079         logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
2080     }
2081 }
2082
2083 void
2084 UnicodeSetTest::expectPattern(UnicodeSet& set,
2085                               const UnicodeString& pattern,
2086                               const UnicodeString& expectedPairs){
2087     UErrorCode status = U_ZERO_ERROR;
2088     set.applyPattern(pattern, status);
2089     if (U_FAILURE(status)) {
2090         errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2091               "\") failed");
2092         return;
2093     } else {
2094         if (getPairs(set) != expectedPairs ) {
2095             errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2096                   "\") => pairs \"" +
2097                   escape(getPairs(set)) + "\", expected \"" +
2098                   escape(expectedPairs) + "\"");
2099         } else {
2100             logln(UnicodeString("Ok:   applyPattern(\"") + pattern +
2101                   "\") => pairs \"" +
2102                   escape(getPairs(set)) + "\"");
2103         }
2104     }
2105     // the result of calling set.toPattern(), which is the string representation of
2106     // this set(set), is passed to a  UnicodeSet constructor, and tested that it
2107     // will produce another set that is equal to this one.
2108     UnicodeString temppattern;
2109     set.toPattern(temppattern);
2110     UnicodeSet *tempset=new UnicodeSet(temppattern, status);
2111     if (U_FAILURE(status)) {
2112         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
2113         return;
2114     }
2115     if(*tempset != set || getPairs(*tempset) != getPairs(set)){
2116         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
2117             escape(getPairs(set)) + "\""));
2118     } else{
2119         logln(UnicodeString("Ok:   applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2120     }
2121
2122     delete tempset;
2123
2124 }
2125
2126 void
2127 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2128     if (getPairs(set) != expectedPairs) {
2129         errln(UnicodeString("FAIL: Expected pair list \"") +
2130               escape(expectedPairs) + "\", got \"" +
2131               escape(getPairs(set)) + "\"");
2132     }
2133 }
2134
2135 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2136                                      const UnicodeString& expPat,
2137                                      const char** expStrings) {
2138     UnicodeString pat;
2139     set.toPattern(pat, TRUE);
2140     if (pat == expPat) {
2141         logln((UnicodeString)"Ok:   toPattern() => \"" + pat + "\"");
2142     } else {
2143         errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2144         return;
2145     }
2146     if (expStrings == NULL) {
2147         return;
2148     }
2149     UBool in = TRUE;
2150     for (int32_t i=0; expStrings[i] != NULL; ++i) {
2151         if (expStrings[i] == NOT) { // sic; pointer comparison
2152             in = FALSE;
2153             continue;
2154         }
2155         UnicodeString s = CharsToUnicodeString(expStrings[i]);
2156         UBool contained = set.contains(s);
2157         if (contained == in) {
2158             logln((UnicodeString)"Ok: " + expPat +
2159                   (contained ? " contains {" : " does not contain {") +
2160                   escape(expStrings[i]) + "}");
2161         } else {
2162             errln((UnicodeString)"FAIL: " + expPat +
2163                   (contained ? " contains {" : " does not contain {") +
2164                   escape(expStrings[i]) + "}");
2165         }
2166     }
2167 }
2168
2169 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
2170
2171 void
2172 UnicodeSetTest::doAssert(UBool condition, const char *message)
2173 {
2174     if (!condition) {
2175         errln(UnicodeString("ERROR : ") + message);
2176     }
2177 }
2178
2179 UnicodeString
2180 UnicodeSetTest::escape(const UnicodeString& s) {
2181     UnicodeString buf;
2182     for (int32_t i=0; i<s.length(); )
2183     {
2184         UChar32 c = s.char32At(i);
2185         if (0x0020 <= c && c <= 0x007F) {
2186             buf += c;
2187         } else {
2188             if (c <= 0xFFFF) {
2189                 buf += (UChar)0x5c; buf += (UChar)0x75;
2190             } else {
2191                 buf += (UChar)0x5c; buf += (UChar)0x55;
2192                 buf += toHexString((c & 0xF0000000) >> 28);
2193                 buf += toHexString((c & 0x0F000000) >> 24);
2194                 buf += toHexString((c & 0x00F00000) >> 20);
2195                 buf += toHexString((c & 0x000F0000) >> 16);
2196             }
2197             buf += toHexString((c & 0xF000) >> 12);
2198             buf += toHexString((c & 0x0F00) >> 8);
2199             buf += toHexString((c & 0x00F0) >> 4);
2200             buf += toHexString(c & 0x000F);
2201         }
2202         i += U16_LENGTH(c);
2203     }
2204     return buf;
2205 }
2206
2207 void UnicodeSetTest::TestFreezable() {
2208     UErrorCode errorCode=U_ZERO_ERROR;
2209     UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
2210     UnicodeSet idSet(idPattern, errorCode);
2211     if(U_FAILURE(errorCode)) {
2212         dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
2213         return;
2214     }
2215
2216     UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
2217     UnicodeSet wsSet(wsPattern, errorCode);
2218     if(U_FAILURE(errorCode)) {
2219         dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
2220         return;
2221     }
2222
2223     idSet.add(idPattern);
2224     UnicodeSet frozen(idSet);
2225     frozen.freeze();
2226
2227     if(idSet.isFrozen() || !frozen.isFrozen()) {
2228         errln("FAIL: isFrozen() is wrong");
2229     }
2230     if(frozen!=idSet || !(frozen==idSet)) {
2231         errln("FAIL: a copy-constructed frozen set differs from its original");
2232     }
2233
2234     frozen=wsSet;
2235     if(frozen!=idSet || !(frozen==idSet)) {
2236         errln("FAIL: a frozen set was modified by operator=");
2237     }
2238
2239     UnicodeSet frozen2(frozen);
2240     if(frozen2!=frozen || frozen2!=idSet) {
2241         errln("FAIL: a copied frozen set differs from its frozen original");
2242     }
2243     if(!frozen2.isFrozen()) {
2244         errln("FAIL: copy-constructing a frozen set results in a thawed one");
2245     }
2246     UnicodeSet frozen3(5, 55);  // Set to some values to really test assignment below, not copy construction.
2247     if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
2248         errln("FAIL: UnicodeSet(5, 55) failed");
2249     }
2250     frozen3=frozen;
2251     if(!frozen3.isFrozen()) {
2252         errln("FAIL: copying a frozen set results in a thawed one");
2253     }
2254
2255     UnicodeSet *cloned=(UnicodeSet *)frozen.clone();
2256     if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
2257         errln("FAIL: clone() failed");
2258     }
2259     cloned->add(0xd802, 0xd805);
2260     if(cloned->containsSome(0xd802, 0xd805)) {
2261         errln("FAIL: unable to modify clone");
2262     }
2263     delete cloned;
2264
2265     UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed();
2266     if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
2267         errln("FAIL: cloneAsThawed() failed");
2268     }
2269     thawed->add(0xd802, 0xd805);
2270     if(!thawed->contains(0xd802, 0xd805)) {
2271         errln("FAIL: unable to modify thawed clone");
2272     }
2273     delete thawed;
2274
2275     frozen.set(5, 55);
2276     if(frozen!=idSet || !(frozen==idSet)) {
2277         errln("FAIL: UnicodeSet::set() modified a frozen set");
2278     }
2279
2280     frozen.clear();
2281     if(frozen!=idSet || !(frozen==idSet)) {
2282         errln("FAIL: UnicodeSet::clear() modified a frozen set");
2283     }
2284
2285     frozen.closeOver(USET_CASE_INSENSITIVE);
2286     if(frozen!=idSet || !(frozen==idSet)) {
2287         errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
2288     }
2289
2290     frozen.compact();
2291     if(frozen!=idSet || !(frozen==idSet)) {
2292         errln("FAIL: UnicodeSet::compact() modified a frozen set");
2293     }
2294
2295     ParsePosition pos;
2296     frozen.
2297         applyPattern(wsPattern, errorCode).
2298         applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
2299         applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
2300         applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
2301         applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
2302     if(frozen!=idSet || !(frozen==idSet)) {
2303         errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
2304     }
2305
2306     frozen.
2307         add(0xd800).
2308         add(0xd802, 0xd805).
2309         add(wsPattern).
2310         addAll(idPattern).
2311         addAll(wsSet);
2312     if(frozen!=idSet || !(frozen==idSet)) {
2313         errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
2314     }
2315
2316     frozen.
2317         retain(0x62).
2318         retain(0x64, 0x69).
2319         retainAll(wsPattern).
2320         retainAll(wsSet);
2321     if(frozen!=idSet || !(frozen==idSet)) {
2322         errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
2323     }
2324
2325     frozen.
2326         remove(0x62).
2327         remove(0x64, 0x69).
2328         remove(idPattern).
2329         removeAll(idPattern).
2330         removeAll(idSet);
2331     if(frozen!=idSet || !(frozen==idSet)) {
2332         errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
2333     }
2334
2335     frozen.
2336         complement().
2337         complement(0x62).
2338         complement(0x64, 0x69).
2339         complement(idPattern).
2340         complementAll(idPattern).
2341         complementAll(idSet);
2342     if(frozen!=idSet || !(frozen==idSet)) {
2343         errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
2344     }
2345 }
2346
2347 // Test span() etc. -------------------------------------------------------- ***
2348
2349 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2350 static int32_t
2351 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
2352     UErrorCode errorCode=U_ZERO_ERROR;
2353     int32_t length8=0;
2354     u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2355     if(U_SUCCESS(errorCode)) {
2356         return length8;
2357     } else {
2358         // The string contains an unpaired surrogate.
2359         // Ignore this string.
2360         return 0;
2361     }
2362 }
2363
2364 class UnicodeSetWithStringsIterator;
2365
2366 // Make the strings in a UnicodeSet easily accessible.
2367 class UnicodeSetWithStrings {
2368 public:
2369     UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2370             set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
2371         int32_t size=set.size();
2372         if(size>0 && set.charAt(size-1)<0) {
2373             // If a set's last element is not a code point, then it must contain strings.
2374             // Iterate over the set, skip all code point ranges, and cache the strings.
2375             // Convert them to UTF-8 for spanUTF8().
2376             UnicodeSetIterator iter(set);
2377             const UnicodeString *s;
2378             char *s8=utf8;
2379             int32_t length8, utf8Count=0;
2380             while(iter.nextRange() && stringsLength<UPRV_LENGTHOF(strings)) {
2381                 if(iter.isString()) {
2382                     // Store the pointer to the set's string element
2383                     // which we happen to know is a stable pointer.
2384                     strings[stringsLength]=s=&iter.getString();
2385                     utf8Count+=
2386                         utf8Lengths[stringsLength]=length8=
2387                         appendUTF8(s->getBuffer(), s->length(),
2388                                    s8, (int32_t)(sizeof(utf8)-utf8Count));
2389                     if(length8==0) {
2390                         hasSurrogates=TRUE;  // Contains unpaired surrogates.
2391                     }
2392                     s8+=length8;
2393                     ++stringsLength;
2394                 }
2395             }
2396         }
2397     }
2398
2399     const UnicodeSet &getSet() const {
2400         return set;
2401     }
2402
2403     UBool hasStrings() const {
2404         return (UBool)(stringsLength>0);
2405     }
2406
2407     UBool hasStringsWithSurrogates() const {
2408         return hasSurrogates;
2409     }
2410
2411 private:
2412     friend class UnicodeSetWithStringsIterator;
2413
2414     const UnicodeSet &set;
2415
2416     const UnicodeString *strings[20];
2417     int32_t stringsLength;
2418     UBool hasSurrogates;
2419
2420     char utf8[1024];
2421     int32_t utf8Lengths[20];
2422 };
2423
2424 class UnicodeSetWithStringsIterator {
2425 public:
2426     UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2427             fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2428     }
2429
2430     void reset() {
2431         nextStringIndex=nextUTF8Start=0;
2432     }
2433
2434     const UnicodeString *nextString() {
2435         if(nextStringIndex<fSet.stringsLength) {
2436             return fSet.strings[nextStringIndex++];
2437         } else {
2438             return NULL;
2439         }
2440     }
2441
2442     // Do not mix with calls to nextString().
2443     const char *nextUTF8(int32_t &length) {
2444         if(nextStringIndex<fSet.stringsLength) {
2445             const char *s8=fSet.utf8+nextUTF8Start;
2446             nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2447             return s8;
2448         } else {
2449             length=0;
2450             return NULL;
2451         }
2452     }
2453
2454 private:
2455     const UnicodeSetWithStrings &fSet;
2456     int32_t nextStringIndex;
2457     int32_t nextUTF8Start;
2458 };
2459
2460 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
2461 // at code point boundaries.
2462 // That is, each edge of a match must not be in the middle of a surrogate pair.
2463 static inline UBool
2464 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
2465     s+=start;
2466     limit-=start;
2467     int32_t length=t.length();
2468     return 0==t.compare(s, length) &&
2469            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2470            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2471 }
2472
2473 // Implement span() with contains() for comparison.
2474 static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2475                                  USetSpanCondition spanCondition) {
2476     const UnicodeSet &realSet(set.getSet());
2477     if(!set.hasStrings()) {
2478         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2479             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2480         }
2481
2482         UChar32 c;
2483         int32_t start=0, prev;
2484         while((prev=start)<length) {
2485             U16_NEXT(s, start, length, c);
2486             if(realSet.contains(c)!=spanCondition) {
2487                 break;
2488             }
2489         }
2490         return prev;
2491     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2492         UnicodeSetWithStringsIterator iter(set);
2493         UChar32 c;
2494         int32_t start, next;
2495         for(start=next=0; start<length;) {
2496             U16_NEXT(s, next, length, c);
2497             if(realSet.contains(c)) {
2498                 break;
2499             }
2500             const UnicodeString *str;
2501             iter.reset();
2502             while((str=iter.nextString())!=NULL) {
2503                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2504                     // spanNeedsStrings=TRUE;
2505                     return start;
2506                 }
2507             }
2508             start=next;
2509         }
2510         return start;
2511     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2512         UnicodeSetWithStringsIterator iter(set);
2513         UChar32 c;
2514         int32_t start, next, maxSpanLimit=0;
2515         for(start=next=0; start<length;) {
2516             U16_NEXT(s, next, length, c);
2517             if(!realSet.contains(c)) {
2518                 next=start;  // Do not span this single, not-contained code point.
2519             }
2520             const UnicodeString *str;
2521             iter.reset();
2522             while((str=iter.nextString())!=NULL) {
2523                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2524                     // spanNeedsStrings=TRUE;
2525                     int32_t matchLimit=start+str->length();
2526                     if(matchLimit==length) {
2527                         return length;
2528                     }
2529                     if(spanCondition==USET_SPAN_CONTAINED) {
2530                         // Iterate for the shortest match at each position.
2531                         // Recurse for each but the shortest match.
2532                         if(next==start) {
2533                             next=matchLimit;  // First match from start.
2534                         } else {
2535                             if(matchLimit<next) {
2536                                 // Remember shortest match from start for iteration.
2537                                 int32_t temp=next;
2538                                 next=matchLimit;
2539                                 matchLimit=temp;
2540                             }
2541                             // Recurse for non-shortest match from start.
2542                             int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
2543                                                                  USET_SPAN_CONTAINED);
2544                             if((matchLimit+spanLength)>maxSpanLimit) {
2545                                 maxSpanLimit=matchLimit+spanLength;
2546                                 if(maxSpanLimit==length) {
2547                                     return length;
2548                                 }
2549                             }
2550                         }
2551                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2552                         if(matchLimit>next) {
2553                             // Remember longest match from start.
2554                             next=matchLimit;
2555                         }
2556                     }
2557                 }
2558             }
2559             if(next==start) {
2560                 break;  // No match from start.
2561             }
2562             start=next;
2563         }
2564         if(start>maxSpanLimit) {
2565             return start;
2566         } else {
2567             return maxSpanLimit;
2568         }
2569     }
2570 }
2571
2572 static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2573                                      USetSpanCondition spanCondition) {
2574     if(length==0) {
2575         return 0;
2576     }
2577     const UnicodeSet &realSet(set.getSet());
2578     if(!set.hasStrings()) {
2579         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2580             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2581         }
2582
2583         UChar32 c;
2584         int32_t prev=length;
2585         do {
2586             U16_PREV(s, 0, length, c);
2587             if(realSet.contains(c)!=spanCondition) {
2588                 break;
2589             }
2590         } while((prev=length)>0);
2591         return prev;
2592     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2593         UnicodeSetWithStringsIterator iter(set);
2594         UChar32 c;
2595         int32_t prev=length, length0=length;
2596         do {
2597             U16_PREV(s, 0, length, c);
2598             if(realSet.contains(c)) {
2599                 break;
2600             }
2601             const UnicodeString *str;
2602             iter.reset();
2603             while((str=iter.nextString())!=NULL) {
2604                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2605                     // spanNeedsStrings=TRUE;
2606                     return prev;
2607                 }
2608             }
2609         } while((prev=length)>0);
2610         return prev;
2611     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2612         UnicodeSetWithStringsIterator iter(set);
2613         UChar32 c;
2614         int32_t prev=length, minSpanStart=length, length0=length;
2615         do {
2616             U16_PREV(s, 0, length, c);
2617             if(!realSet.contains(c)) {
2618                 length=prev;  // Do not span this single, not-contained code point.
2619             }
2620             const UnicodeString *str;
2621             iter.reset();
2622             while((str=iter.nextString())!=NULL) {
2623                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2624                     // spanNeedsStrings=TRUE;
2625                     int32_t matchStart=prev-str->length();
2626                     if(matchStart==0) {
2627                         return 0;
2628                     }
2629                     if(spanCondition==USET_SPAN_CONTAINED) {
2630                         // Iterate for the shortest match at each position.
2631                         // Recurse for each but the shortest match.
2632                         if(length==prev) {
2633                             length=matchStart;  // First match from prev.
2634                         } else {
2635                             if(matchStart>length) {
2636                                 // Remember shortest match from prev for iteration.
2637                                 int32_t temp=length;
2638                                 length=matchStart;
2639                                 matchStart=temp;
2640                             }
2641                             // Recurse for non-shortest match from prev.
2642                             int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
2643                                                                     USET_SPAN_CONTAINED);
2644                             if(spanStart<minSpanStart) {
2645                                 minSpanStart=spanStart;
2646                                 if(minSpanStart==0) {
2647                                     return 0;
2648                                 }
2649                             }
2650                         }
2651                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2652                         if(matchStart<length) {
2653                             // Remember longest match from prev.
2654                             length=matchStart;
2655                         }
2656                     }
2657                 }
2658             }
2659             if(length==prev) {
2660                 break;  // No match from prev.
2661             }
2662         } while((prev=length)>0);
2663         if(prev<minSpanStart) {
2664             return prev;
2665         } else {
2666             return minSpanStart;
2667         }
2668     }
2669 }
2670
2671 static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2672                                 USetSpanCondition spanCondition) {
2673     const UnicodeSet &realSet(set.getSet());
2674     if(!set.hasStrings()) {
2675         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2676             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2677         }
2678
2679         UChar32 c;
2680         int32_t start=0, prev;
2681         while((prev=start)<length) {
2682             U8_NEXT_OR_FFFD(s, start, length, c);
2683             if(realSet.contains(c)!=spanCondition) {
2684                 break;
2685             }
2686         }
2687         return prev;
2688     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2689         UnicodeSetWithStringsIterator iter(set);
2690         UChar32 c;
2691         int32_t start, next;
2692         for(start=next=0; start<length;) {
2693             U8_NEXT_OR_FFFD(s, next, length, c);
2694             if(realSet.contains(c)) {
2695                 break;
2696             }
2697             const char *s8;
2698             int32_t length8;
2699             iter.reset();
2700             while((s8=iter.nextUTF8(length8))!=NULL) {
2701                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2702                     // spanNeedsStrings=TRUE;
2703                     return start;
2704                 }
2705             }
2706             start=next;
2707         }
2708         return start;
2709     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2710         UnicodeSetWithStringsIterator iter(set);
2711         UChar32 c;
2712         int32_t start, next, maxSpanLimit=0;
2713         for(start=next=0; start<length;) {
2714             U8_NEXT_OR_FFFD(s, next, length, c);
2715             if(!realSet.contains(c)) {
2716                 next=start;  // Do not span this single, not-contained code point.
2717             }
2718             const char *s8;
2719             int32_t length8;
2720             iter.reset();
2721             while((s8=iter.nextUTF8(length8))!=NULL) {
2722                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2723                     // spanNeedsStrings=TRUE;
2724                     int32_t matchLimit=start+length8;
2725                     if(matchLimit==length) {
2726                         return length;
2727                     }
2728                     if(spanCondition==USET_SPAN_CONTAINED) {
2729                         // Iterate for the shortest match at each position.
2730                         // Recurse for each but the shortest match.
2731                         if(next==start) {
2732                             next=matchLimit;  // First match from start.
2733                         } else {
2734                             if(matchLimit<next) {
2735                                 // Remember shortest match from start for iteration.
2736                                 int32_t temp=next;
2737                                 next=matchLimit;
2738                                 matchLimit=temp;
2739                             }
2740                             // Recurse for non-shortest match from start.
2741                             int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
2742                                                                 USET_SPAN_CONTAINED);
2743                             if((matchLimit+spanLength)>maxSpanLimit) {
2744                                 maxSpanLimit=matchLimit+spanLength;
2745                                 if(maxSpanLimit==length) {
2746                                     return length;
2747                                 }
2748                             }
2749                         }
2750                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2751                         if(matchLimit>next) {
2752                             // Remember longest match from start.
2753                             next=matchLimit;
2754                         }
2755                     }
2756                 }
2757             }
2758             if(next==start) {
2759                 break;  // No match from start.
2760             }
2761             start=next;
2762         }
2763         if(start>maxSpanLimit) {
2764             return start;
2765         } else {
2766             return maxSpanLimit;
2767         }
2768     }
2769 }
2770
2771 static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2772                                     USetSpanCondition spanCondition) {
2773     if(length==0) {
2774         return 0;
2775     }
2776     const UnicodeSet &realSet(set.getSet());
2777     if(!set.hasStrings()) {
2778         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2779             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2780         }
2781
2782         UChar32 c;
2783         int32_t prev=length;
2784         do {
2785             U8_PREV_OR_FFFD(s, 0, length, c);
2786             if(realSet.contains(c)!=spanCondition) {
2787                 break;
2788             }
2789         } while((prev=length)>0);
2790         return prev;
2791     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2792         UnicodeSetWithStringsIterator iter(set);
2793         UChar32 c;
2794         int32_t prev=length;
2795         do {
2796             U8_PREV_OR_FFFD(s, 0, length, c);
2797             if(realSet.contains(c)) {
2798                 break;
2799             }
2800             const char *s8;
2801             int32_t length8;
2802             iter.reset();
2803             while((s8=iter.nextUTF8(length8))!=NULL) {
2804                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2805                     // spanNeedsStrings=TRUE;
2806                     return prev;
2807                 }
2808             }
2809         } while((prev=length)>0);
2810         return prev;
2811     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2812         UnicodeSetWithStringsIterator iter(set);
2813         UChar32 c;
2814         int32_t prev=length, minSpanStart=length;
2815         do {
2816             U8_PREV_OR_FFFD(s, 0, length, c);
2817             if(!realSet.contains(c)) {
2818                 length=prev;  // Do not span this single, not-contained code point.
2819             }
2820             const char *s8;
2821             int32_t length8;
2822             iter.reset();
2823             while((s8=iter.nextUTF8(length8))!=NULL) {
2824                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2825                     // spanNeedsStrings=TRUE;
2826                     int32_t matchStart=prev-length8;
2827                     if(matchStart==0) {
2828                         return 0;
2829                     }
2830                     if(spanCondition==USET_SPAN_CONTAINED) {
2831                         // Iterate for the shortest match at each position.
2832                         // Recurse for each but the shortest match.
2833                         if(length==prev) {
2834                             length=matchStart;  // First match from prev.
2835                         } else {
2836                             if(matchStart>length) {
2837                                 // Remember shortest match from prev for iteration.
2838                                 int32_t temp=length;
2839                                 length=matchStart;
2840                                 matchStart=temp;
2841                             }
2842                             // Recurse for non-shortest match from prev.
2843                             int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
2844                                                                    USET_SPAN_CONTAINED);
2845                             if(spanStart<minSpanStart) {
2846                                 minSpanStart=spanStart;
2847                                 if(minSpanStart==0) {
2848                                     return 0;
2849                                 }
2850                             }
2851                         }
2852                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2853                         if(matchStart<length) {
2854                             // Remember longest match from prev.
2855                             length=matchStart;
2856                         }
2857                     }
2858                 }
2859             }
2860             if(length==prev) {
2861                 break;  // No match from prev.
2862             }
2863         } while((prev=length)>0);
2864         if(prev<minSpanStart) {
2865             return prev;
2866         } else {
2867             return minSpanStart;
2868         }
2869     }
2870 }
2871
// Bit flags that select which span variants are performed and compared.
// Each group has one flag per variant plus a mask that covers the whole group.
enum {
    // Text encoding of the input.
    SPAN_UTF16          =0x1,
    SPAN_UTF8           =0x2,
    SPAN_UTFS           =SPAN_UTF16|SPAN_UTF8,

    // Original set and/or its complement.
    SPAN_SET            =0x4,
    SPAN_COMPLEMENT     =0x8,
    SPAN_POLARITY       =SPAN_SET|SPAN_COMPLEMENT,

    // Forward and/or backward spans.
    SPAN_FWD            =0x10,
    SPAN_BACK           =0x20,
    SPAN_DIRS           =SPAN_FWD|SPAN_BACK,

    // Span condition used for "contained" spans.
    SPAN_CONTAINED      =0x100,
    SPAN_SIMPLE         =0x200,
    SPAN_CONDITION      =SPAN_CONTAINED|SPAN_SIMPLE,

    // Everything.
    SPAN_ALL            =SPAN_UTFS|SPAN_POLARITY|SPAN_DIRS|SPAN_CONDITION
};
2892
2893 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
2894     return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
2895 }
2896
2897 static inline int32_t slen(const void *s, UBool isUTF16) {
2898     return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
2899 }
2900
2901 /*
2902  * Count spans on a string with the method according to type and set the span limits.
2903  * The set may be the complement of the original.
2904  * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
2905  * according to the expected number of spans.
2906  * Sets typeName to an empty string if there is no such type.
2907  * Returns -1 if the span option is filtered out.
2908  */
2909 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
2910                         const void *s, int32_t length, UBool isUTF16,
2911                         uint32_t whichSpans,
2912                         int type, const char *&typeName,
2913                         int32_t limits[], int32_t limitsCapacity,
2914                         int32_t expectCount) {
2915     const UnicodeSet &realSet(set.getSet());
2916     int32_t start, count;
2917     USetSpanCondition spanCondition, firstSpanCondition, contained;
2918     UBool isForward;
2919
2920     if(type<0 || 7<type) {
2921         typeName="";
2922         return 0;
2923     }
2924
2925     static const char *const typeNames16[]={
2926         "contains", "contains(LM)",
2927         "span", "span(LM)",
2928         "containsBack", "containsBack(LM)",
2929         "spanBack", "spanBack(LM)"
2930     };
2931
2932     static const char *const typeNames8[]={
2933         "containsUTF8", "containsUTF8(LM)",
2934         "spanUTF8", "spanUTF8(LM)",
2935         "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
2936         "spanBackUTF8", "spanBackUTF8(LM)"
2937     };
2938
2939     typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
2940
2941     // filter span options
2942     if(type<=3) {
2943         // span forward
2944         if((whichSpans&SPAN_FWD)==0) {
2945             return -1;
2946         }
2947         isForward=TRUE;
2948     } else {
2949         // span backward
2950         if((whichSpans&SPAN_BACK)==0) {
2951             return -1;
2952         }
2953         isForward=FALSE;
2954     }
2955     if((type&1)==0) {
2956         // use USET_SPAN_CONTAINED
2957         if((whichSpans&SPAN_CONTAINED)==0) {
2958             return -1;
2959         }
2960         contained=USET_SPAN_CONTAINED;
2961     } else {
2962         // use USET_SPAN_SIMPLE
2963         if((whichSpans&SPAN_SIMPLE)==0) {
2964             return -1;
2965         }
2966         contained=USET_SPAN_SIMPLE;
2967     }
2968
2969     // Default first span condition for going forward with an uncomplemented set.
2970     spanCondition=USET_SPAN_NOT_CONTAINED;
2971     if(isComplement) {
2972         spanCondition=invertSpanCondition(spanCondition, contained);
2973     }
2974
2975     // First span condition for span(), used to terminate the spanBack() iteration.
2976     firstSpanCondition=spanCondition;
2977
2978     // spanBack(): Its initial span condition is span()'s last span condition,
2979     // which is the opposite of span()'s first span condition
2980     // if we expect an even number of spans.
2981     // (The loop inverts spanCondition (expectCount-1) times
2982     // before the expectCount'th span() call.)
2983     // If we do not compare forward and backward directions, then we do not have an
2984     // expectCount and just start with firstSpanCondition.
2985     if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
2986         spanCondition=invertSpanCondition(spanCondition, contained);
2987     }
2988
2989     count=0;
2990     switch(type) {
2991     case 0:
2992     case 1:
2993         start=0;
2994         if(length<0) {
2995             length=slen(s, isUTF16);
2996         }
2997         for(;;) {
2998             start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
2999                               containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
3000             if(count<limitsCapacity) {
3001                 limits[count]=start;
3002             }
3003             ++count;
3004             if(start>=length) {
3005                 break;
3006             }
3007             spanCondition=invertSpanCondition(spanCondition, contained);
3008         }
3009         break;
3010     case 2:
3011     case 3:
3012         start=0;
3013         for(;;) {
3014             start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
3015                               realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
3016             if(count<limitsCapacity) {
3017                 limits[count]=start;
3018             }
3019             ++count;
3020             if(length>=0 ? start>=length :
3021                            isUTF16 ? ((const UChar *)s)[start]==0 :
3022                                      ((const char *)s)[start]==0
3023             ) {
3024                 break;
3025             }
3026             spanCondition=invertSpanCondition(spanCondition, contained);
3027         }
3028         break;
3029     case 4:
3030     case 5:
3031         if(length<0) {
3032             length=slen(s, isUTF16);
3033         }
3034         for(;;) {
3035             ++count;
3036             if(count<=limitsCapacity) {
3037                 limits[limitsCapacity-count]=length;
3038             }
3039             length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
3040                               containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
3041             if(length==0 && spanCondition==firstSpanCondition) {
3042                 break;
3043             }
3044             spanCondition=invertSpanCondition(spanCondition, contained);
3045         }
3046         if(count<limitsCapacity) {
3047             memmove(limits, limits+(limitsCapacity-count), count*4);
3048         }
3049         break;
3050     case 6:
3051     case 7:
3052         for(;;) {
3053             ++count;
3054             if(count<=limitsCapacity) {
3055                 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
3056             }
3057             // Note: Length<0 is tested only for the first spanBack().
3058             // If we wanted to keep length<0 for all spanBack()s, we would have to
3059             // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
3060             length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
3061                               realSet.spanBackUTF8((const char *)s, length, spanCondition);
3062             if(length==0 && spanCondition==firstSpanCondition) {
3063                 break;
3064             }
3065             spanCondition=invertSpanCondition(spanCondition, contained);
3066         }
3067         if(count<limitsCapacity) {
3068             memmove(limits, limits+(limitsCapacity-count), count*4);
3069         }
3070         break;
3071     default:
3072         typeName="";
3073         return -1;
3074     }
3075
3076     return count;
3077 }
3078
// Indexes of the sets to be tested.
// Odd index = complemented set (isComplement), even index = original set.
enum {
    SLOW        =0,
    SLOW_NOT    =1,
    FAST        =2,
    FAST_NOT    =3,
    SET_COUNT   =4
};

// Names of the sets for failure messages, indexed by the constants above.
static const char *const setNames[SET_COUNT]={
    /* SLOW     */ "slow",
    /* SLOW_NOT */ "slow.not",
    /* FAST     */ "fast",
    /* FAST_NOT */ "fast.not"
};
3094
3095 /*
3096  * Verify that we get the same results whether we look at text with contains(),
3097  * span() or spanBack(), using unfrozen or frozen versions of the set,
3098  * and using the set or its complement (switching the spanConditions accordingly).
3099  * The latter verifies that
3100  *   set.span(spanCondition) == set.complement().span(!spanCondition).
3101  *
3102  * The expectLimits[] are either provided by the caller (with expectCount>=0)
3103  * or returned to the caller (with an input expectCount<0).
3104  */
3105 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3106                               const void *s, int32_t length, UBool isUTF16,
3107                               uint32_t whichSpans,
3108                               int32_t expectLimits[], int32_t &expectCount,
3109                               const char *testName, int32_t index) {
3110     int32_t limits[500];
3111     int32_t limitsCount;
3112     int i, j;
3113
3114     const char *typeName;
3115     int type;
3116
3117     for(i=0; i<SET_COUNT; ++i) {
3118         if((i&1)==0) {
3119             // Even-numbered sets are original, uncomplemented sets.
3120             if((whichSpans&SPAN_SET)==0) {
3121                 continue;
3122             }
3123         } else {
3124             // Odd-numbered sets are complemented.
3125             if((whichSpans&SPAN_COMPLEMENT)==0) {
3126                 continue;
3127             }
3128         }
3129         for(type=0;; ++type) {
3130             limitsCount=getSpans(*sets[i], (UBool)(i&1),
3131                                  s, length, isUTF16,
3132                                  whichSpans,
3133                                  type, typeName,
3134                                  limits, UPRV_LENGTHOF(limits), expectCount);
3135             if(typeName[0]==0) {
3136                 break; // All types tried.
3137             }
3138             if(limitsCount<0) {
3139                 continue; // Span option filtered out.
3140             }
3141             if(expectCount<0) {
3142                 expectCount=limitsCount;
3143                 if(limitsCount>UPRV_LENGTHOF(limits)) {
3144                     errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3145                           testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)UPRV_LENGTHOF(limits));
3146                     return;
3147                 }
3148                 memcpy(expectLimits, limits, limitsCount*4);
3149             } else if(limitsCount!=expectCount) {
3150                 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3151                       testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
3152             } else {
3153                 for(j=0; j<limitsCount; ++j) {
3154                     if(limits[j]!=expectLimits[j]) {
3155                         errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3156                               testName, (long)index, setNames[i], typeName, (long)limitsCount,
3157                               j, (long)limits[j], (long)expectLimits[j]);
3158                         break;
3159                     }
3160                 }
3161             }
3162         }
3163     }
3164
3165     // Compare span() with containsAll()/containsNone(),
3166     // but only if we have expectLimits[] from the uncomplemented set.
3167     if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
3168         const UChar *s16=(const UChar *)s;
3169         UnicodeString string;
3170         int32_t prev=0, limit, length;
3171         for(i=0; i<expectCount; ++i) {
3172             limit=expectLimits[i];
3173             length=limit-prev;
3174             if(length>0) {
3175                 string.setTo(FALSE, s16+prev, length);  // read-only alias
3176                 if(i&1) {
3177                     if(!sets[SLOW]->getSet().containsAll(string)) {
3178                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3179                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3180                         return;
3181                     }
3182                     if(!sets[FAST]->getSet().containsAll(string)) {
3183                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3184                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3185                         return;
3186                     }
3187                 } else {
3188                     if(!sets[SLOW]->getSet().containsNone(string)) {
3189                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3190                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3191                         return;
3192                     }
3193                     if(!sets[FAST]->getSet().containsNone(string)) {
3194                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3195                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3196                         return;
3197                     }
3198                 }
3199             }
3200             prev=limit;
3201         }
3202     }
3203 }
3204
3205 // Specifically test either UTF-16 or UTF-8.
3206 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3207                               const void *s, int32_t length, UBool isUTF16,
3208                               uint32_t whichSpans,
3209                               const char *testName, int32_t index) {
3210     int32_t expectLimits[500];
3211     int32_t expectCount=-1;
3212     testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3213 }
3214
3215 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
3216     UChar c, c2;
3217
3218     if(length>=0) {
3219         while(length>0) {
3220             c=*s++;
3221             --length;
3222             if(0xd800<=c && c<0xe000) {
3223                 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3224                     return TRUE;
3225                 }
3226                 --length;
3227             }
3228         }
3229     } else {
3230         while((c=*s++)!=0) {
3231             if(0xd800<=c && c<0xe000) {
3232                 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3233                     return TRUE;
3234                 }
3235             }
3236         }
3237     }
3238     return FALSE;
3239 }
3240
3241 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3242 // unless either UTF is turned off in whichSpans.
3243 // Testing UTF-16 and UTF-8 together requires that surrogate code points
3244 // have the same contains(c) value as U+FFFD.
3245 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3246                                       const UChar *s16, int32_t length16,
3247                                       uint32_t whichSpans,
3248                                       const char *testName, int32_t index) {
3249     int32_t expectLimits[500];
3250     int32_t expectCount;
3251
3252     expectCount=-1;  // Get expectLimits[] from testSpan().
3253
3254     if((whichSpans&SPAN_UTF16)!=0) {
3255         testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
3256     }
3257     if((whichSpans&SPAN_UTF8)==0) {
3258         return;
3259     }
3260
3261     // Convert s16[] and expectLimits[] to UTF-8.
3262     uint8_t s8[3000];
3263     int32_t offsets[3000];
3264
3265     const UChar *s16Limit=s16+length16;
3266     char *t=(char *)s8;
3267     char *tLimit=t+sizeof(s8);
3268     int32_t *o=offsets;
3269     UErrorCode errorCode=U_ZERO_ERROR;
3270
3271     // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3272     ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
3273     if(U_FAILURE(errorCode)) {
3274         errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3275               testName, (long)index, u_errorName(errorCode));
3276         ucnv_resetFromUnicode(utf8Cnv);
3277         return;
3278     }
3279     int32_t length8=(int32_t)(t-(char *)s8);
3280
3281     // Convert expectLimits[].
3282     int32_t i, j, expect;
3283     for(i=j=0; i<expectCount; ++i) {
3284         expect=expectLimits[i];
3285         if(expect==length16) {
3286             expectLimits[i]=length8;
3287         } else {
3288             while(offsets[j]<expect) {
3289                 ++j;
3290             }
3291             expectLimits[i]=j;
3292         }
3293     }
3294
3295     testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
3296 }
3297
3298 static UChar32 nextCodePoint(UChar32 c) {
3299     // Skip some large and boring ranges.
3300     switch(c) {
3301     case 0x3441:
3302         return 0x4d7f;
3303     case 0x5100:
3304         return 0x9f00;
3305     case 0xb040:
3306         return 0xd780;
3307     case 0xe041:
3308         return 0xf8fe;
3309     case 0x10100:
3310         return 0x20000;
3311     case 0x20041:
3312         return 0xe0000;
3313     case 0xe0101:
3314         return 0x10fffd;
3315     default:
3316         return c+1;
3317     }
3318 }
3319
3320 // Verify that all implementations represent the same set.
3321 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3322     // contains(U+FFFD) is inconsistent with contains(some surrogates),
3323     // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3324     // Skip the UTF-8 part of the test - if the string contains surrogates -
3325     // because it is likely to produce a different result.
3326     UBool inconsistentSurrogates=
3327             (!(sets[0]->getSet().contains(0xfffd) ?
3328                sets[0]->getSet().contains(0xd800, 0xdfff) :
3329                sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3330              sets[0]->hasStringsWithSurrogates());
3331
3332     UChar s[1000];
3333     int32_t length=0;
3334     uint32_t localWhichSpans;
3335
3336     UChar32 c, first;
3337     for(first=c=0;; c=nextCodePoint(c)) {
3338         if(c>0x10ffff || length>(UPRV_LENGTHOF(s)-U16_MAX_LENGTH)) {
3339             localWhichSpans=whichSpans;
3340             if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3341                 localWhichSpans&=~SPAN_UTF8;
3342             }
3343             testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3344             if(c>0x10ffff) {
3345                 break;
3346             }
3347             length=0;
3348             first=c;
3349         }
3350         U16_APPEND_UNSAFE(s, length, c);
3351     }
3352 }
3353
3354 // Test with a particular, interesting string.
3355 // Specify length and try NUL-termination.
3356 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3357     static const UChar s[]={
3358         0x61, 0x62, 0x20,                       // Latin, space
3359         0x3b1, 0x3b2, 0x3b3,                    // Greek
3360         0xd900,                                 // lead surrogate
3361         0x3000, 0x30ab, 0x30ad,                 // wide space, Katakana
3362         0xdc05,                                 // trail surrogate
3363         0xa0, 0xac00, 0xd7a3,                   // nbsp, Hangul
3364         0xd900, 0xdc05,                         // unassigned supplementary
3365         0xd840, 0xdfff, 0xd860, 0xdffe,         // Han supplementary
3366         0xd7a4, 0xdc05, 0xd900, 0x2028,         // unassigned, surrogates in wrong order, LS
3367         0                                       // NUL
3368     };
3369
3370     if((whichSpans&SPAN_UTF16)==0) {
3371         return;
3372     }
3373     testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
3374     testSpan(sets, s, UPRV_LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
3375 }
3376
3377 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3378     static const char s[]={
3379         "abc"                                   // Latin
3380
3381         /* trail byte in lead position */
3382         "\x80"
3383
3384         " "                                     // space
3385
3386         /* truncated multi-byte sequences */
3387         "\xd0"
3388         "\xe0"
3389         "\xe1"
3390         "\xed"
3391         "\xee"
3392         "\xf0"
3393         "\xf1"
3394         "\xf4"
3395         "\xf8"
3396         "\xfc"
3397
3398         "\xCE\xB1\xCE\xB2\xCE\xB3"              // Greek
3399
3400         /* trail byte in lead position */
3401         "\x80"
3402
3403         "\xe0\x80"
3404         "\xe0\xa0"
3405         "\xe1\x80"
3406         "\xed\x80"
3407         "\xed\xa0"
3408         "\xee\x80"
3409         "\xf0\x80"
3410         "\xf0\x90"
3411         "\xf1\x80"
3412         "\xf4\x80"
3413         "\xf4\x90"
3414         "\xf8\x80"
3415         "\xfc\x80"
3416
3417         "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD"  // wide space, Katakana
3418
3419         /* trail byte in lead position */
3420         "\x80"
3421
3422         "\xf0\x80\x80"
3423         "\xf0\x90\x80"
3424         "\xf1\x80\x80"
3425         "\xf4\x80\x80"
3426         "\xf4\x90\x80"
3427         "\xf8\x80\x80"
3428         "\xfc\x80\x80"
3429
3430         "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3"      // nbsp, Hangul
3431
3432         /* trail byte in lead position */
3433         "\x80"
3434
3435         "\xf8\x80\x80\x80"
3436         "\xfc\x80\x80\x80"
3437
3438         "\xF1\x90\x80\x85"                      // unassigned supplementary
3439
3440         /* trail byte in lead position */
3441         "\x80"
3442
3443         "\xfc\x80\x80\x80\x80"
3444
3445         "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE"      // Han supplementary
3446
3447         /* trail byte in lead position */
3448         "\x80"
3449
3450         /* complete sequences but non-shortest forms or out of range etc. */
3451         "\xc0\x80"
3452         "\xe0\x80\x80"
3453         "\xed\xa0\x80"
3454         "\xf0\x80\x80\x80"
3455         "\xf4\x90\x80\x80"
3456         "\xf8\x80\x80\x80\x80"
3457         "\xfc\x80\x80\x80\x80\x80"
3458         "\xfe"
3459         "\xff"
3460
3461         /* trail byte in lead position */
3462         "\x80"
3463
3464         "\xED\x9E\xA4\xE2\x80\xA8"              // unassigned, LS, NUL-terminated
3465     };
3466
3467     if((whichSpans&SPAN_UTF8)==0) {
3468         return;
3469     }
3470     testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
3471     testSpan(sets, s, UPRV_LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
3472 }
3473
// Multiply a list of span option sets by up to three alternatives.
// Every existing entry is reduced to (entry & mask); the original slot then
// gets that value combined with a. If b!=0, a second copy of the list combined
// with b is written right after the original entries, and if additionally c!=0,
// a third copy combined with c follows the second one.
// If b==0, then the options are just modified with mask and a (c is ignored).
// If b!=0 and c==0, then the options are just modified with mask, a and b.
// Returns the new number of entries: 1, 2 or 3 times whichSpansCount.
// The caller must provide an array which is large enough for all copies.
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    // How many copies of the list there will be afterwards.
    int32_t portions=1;
    if(b!=0) {
        portions= c!=0 ? 3 : 2;
    }

    for(int32_t k=0; k<whichSpansCount; ++k) {
        const uint32_t kept=whichSpans[k]&mask;
        whichSpans[k]=kept|a;
        if(portions>=2) {
            whichSpans[whichSpansCount+k]=kept|b;
        }
        if(portions==3) {
            whichSpans[2*whichSpansCount+k]=kept|c;
        }
    }
    return portions*whichSpansCount;
}
3496
3497 #define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
3498 #define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
3499 #define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3500 #define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3501
3502 void UnicodeSetTest::TestSpan() {
3503     // "[...]" is a UnicodeSet pattern.
3504     // "*" performs tests on all Unicode code points and on a selection of
3505     //   malformed UTF-8/16 strings.
3506     // "-options" limits the scope of testing for the current set.
3507     //   By default, the test verifies that equivalent boundaries are found
3508     //   for UTF-16 and UTF-8, going forward and backward,
3509     //   alternating USET_SPAN_NOT_CONTAINED with
3510     //   either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3511     //   Single-character options:
3512     //     8 -- UTF-16 and UTF-8 boundaries may differ.
3513     //          Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3514     //          or the set contains strings with unpaired surrogates
3515     //          which do not translate to valid UTF-8.
3516     //     c -- set.span() and set.complement().span() boundaries may differ.
3517     //          Cause: Set strings are not complemented.
3518     //     b -- span() and spanBack() boundaries may differ.
3519     //          Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3520     //          and spanBack(USET_SPAN_SIMPLE) are defined to
3521     //          match with non-overlapping substrings.
3522     //          For example, with a set containing "ab" and "ba",
3523     //          span() of "aba" yields boundaries { 0, 2, 3 }
3524     //          because the initial "ab" matches from 0 to 2,
3525     //          while spanBack() yields boundaries { 0, 1, 3 }
3526     //          because the final "ba" matches from 1 to 3.
3527     //     l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3528     //          Cause: Strings in the set overlap, and a longer match may
3529     //          require a sequence including non-longest substrings.
3530     //          For example, with a set containing "ab", "abc" and "cd",
3531     //          span(contained) of "abcd" spans the entire string
3532     //          but span(longest match) only spans the first 3 characters.
3533     //   Each "-options" first resets all options and then applies the specified options.
3534     //   A "-" without options resets the options.
3535     //   The options are also reset for each new set.
3536     // Other strings will be spanned.
3537     static const char *const testdata[]={
3538         "[:ID_Continue:]",
3539         "*",
3540         "[:White_Space:]",
3541         "*",
3542         "[]",
3543         "*",
3544         "[\\u0000-\\U0010FFFF]",
3545         "*",
3546         "[\\u0000\\u0080\\u0800\\U00010000]",
3547         "*",
3548         "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3549         "*",
3550         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3551         "-c",
3552         "*",
3553         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3554         "-c",
3555         "*",
3556
3557         // Overlapping strings cause overlapping attempts to match.
3558         "[x{xy}{xya}{axy}{ax}]",
3559         "-cl",
3560
3561         // More repetitions of "xya" would take too long with the recursive
3562         // reference implementation.
3563         // containsAll()=FALSE
3564         // test_string 0x14
3565         "xx"
3566         "xyaxyaxyaxya"  // set.complement().span(longest match) will stop here.
3567         "xx"            // set.complement().span(contained) will stop between the two 'x'es.
3568         "xyaxyaxyaxya"
3569         "xx"
3570         "xyaxyaxyaxya"  // span() ends here.
3571         "aaa",
3572
3573         // containsAll()=TRUE
3574         // test_string 0x15
3575         "xx"
3576         "xyaxyaxyaxya"
3577         "xx"
3578         "xyaxyaxyaxya"
3579         "xx"
3580         "xyaxyaxyaxy",
3581
3582         "-bc",
3583         // test_string 0x17
3584         "byayaxya",  // span() -> { 4, 7, 8 }  spanBack() -> { 5, 8 }
3585         "-c",
3586         "byayaxy",   // span() -> { 4, 7 }     complement.span() -> { 7 }
3587         "byayax",    // span() -> { 4, 6 }     complement.span() -> { 6 }
3588         "-",
3589         "byaya",     // span() -> { 5 }
3590         "byay",      // span() -> { 4 }
3591         "bya",       // span() -> { 3 }
3592
3593         // span(longest match) will not span the whole string.
3594         "[a{ab}{bc}]",
3595         "-cl",
3596         // test_string 0x21
3597         "abc",
3598
3599         "[a{ab}{abc}{cd}]",
3600         "-cl",
3601         "acdabcdabccd",
3602
3603         // spanBack(longest match) will not span the whole string.
3604         "[c{ab}{bc}]",
3605         "-cl",
3606         "abc",
3607
3608         "[d{cd}{bcd}{ab}]",
3609         "-cl",
3610         "abbcdabcdabd",
3611
3612         // Test with non-ASCII set strings - test proper handling of surrogate pairs
3613         // and UTF-8 trail bytes.
3614         // Copies of above test sets and strings, but transliterated to have
3615         // different code points with similar trail units.
3616         // Previous: a      b         c            d
3617         // Unicode:  042B   30AB      200AB        204AB
3618         // UTF-16:   042B   30AB      D840 DCAB    D841 DCAB
3619         // UTF-8:    D0 AB  E3 82 AB  F0 A0 82 AB  F0 A0 92 AB
3620         "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3621         "-cl",
3622         "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3623
3624         "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3625         "-cl",
3626         "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3627
3628         // Stress bookkeeping and recursion.
3629         // The following strings are barely doable with the recursive
3630         // reference implementation.
3631         // The not-contained character at the end prevents an early exit from the span().
3632         "[b{bb}]",
3633         "-c",
3634         // test_string 0x33
3635         "bbbbbbbbbbbbbbbbbbbbbbbb-",
3636         // On complement sets, span() and spanBack() get different results
3637         // because b is not in the complement set and there is an odd number of b's
3638         // in the test string.
3639         "-bc",
3640         "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3641
3642         // Test with set strings with an initial or final code point span
3643         // longer than 254.
3644         "[a{" _64_a _64_a _64_a _64_a "b}"
3645           "{a" _64_b _64_b _64_b _64_b "}]",
3646         "-c",
3647         _64_a _64_a _64_a _63_a "b",
3648         _64_a _64_a _64_a _64_a "b",
3649         _64_a _64_a _64_a _64_a "aaaabbbb",
3650         "a" _64_b _64_b _64_b _63_b,
3651         "a" _64_b _64_b _64_b _64_b,
3652         "aaaabbbb" _64_b _64_b _64_b _64_b,
3653
3654         // Test with strings containing unpaired surrogates.
3655         // They are not representable in UTF-8, and a leading trail surrogate
3656         // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3657         // U+20001 == \\uD840\\uDC01
3658         // U+20400 == \\uD841\\uDC00
3659         "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3660         "-8cl",
3661         "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3662     };
3663     uint32_t whichSpans[96]={ SPAN_ALL };
3664     int32_t whichSpansCount=1;
3665
3666     UnicodeSet *sets[SET_COUNT]={ NULL };
3667     const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
3668
3669     char testName[1024];
3670     char *testNameLimit=testName;
3671
3672     int32_t i, j;
3673     for(i=0; i<UPRV_LENGTHOF(testdata); ++i) {
3674         const char *s=testdata[i];
3675         if(s[0]=='[') {
3676             // Create new test sets from this pattern.
3677             for(j=0; j<SET_COUNT; ++j) {
3678                 delete sets_with_str[j];
3679                 delete sets[j];
3680             }
3681             UErrorCode errorCode=U_ZERO_ERROR;
3682             sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3683             if(U_FAILURE(errorCode)) {
3684                 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
3685                 break;
3686             }
3687             sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3688             sets[SLOW_NOT]->complement();
3689             // Intermediate set: Test cloning of a frozen set.
3690             UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3691             fast->freeze();
3692             sets[FAST]=(UnicodeSet *)fast->clone();
3693             delete fast;
3694             UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3695             fastNot->freeze();
3696             sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
3697             delete fastNot;
3698
3699             for(j=0; j<SET_COUNT; ++j) {
3700                 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3701             }
3702
3703             strcpy(testName, s);
3704             testNameLimit=strchr(testName, 0);
3705             *testNameLimit++=':';
3706             *testNameLimit=0;
3707
3708             whichSpans[0]=SPAN_ALL;
3709             whichSpansCount=1;
3710         } else if(s[0]=='-') {
3711             whichSpans[0]=SPAN_ALL;
3712             whichSpansCount=1;
3713
3714             while(*++s!=0) {
3715                 switch(*s) {
3716                 case 'c':
3717                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3718                                                    ~SPAN_POLARITY,
3719                                                    SPAN_SET,
3720                                                    SPAN_COMPLEMENT,
3721                                                    0);
3722                     break;
3723                 case 'b':
3724                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3725                                                    ~SPAN_DIRS,
3726                                                    SPAN_FWD,
3727                                                    SPAN_BACK,
3728                                                    0);
3729                     break;
3730                 case 'l':
3731                     // test USET_SPAN_CONTAINED FWD & BACK, and separately
3732                     // USET_SPAN_SIMPLE only FWD, and separately
3733                     // USET_SPAN_SIMPLE only BACK
3734                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3735                                                    ~(SPAN_DIRS|SPAN_CONDITION),
3736                                                    SPAN_DIRS|SPAN_CONTAINED,
3737                                                    SPAN_FWD|SPAN_SIMPLE,
3738                                                    SPAN_BACK|SPAN_SIMPLE);
3739                     break;
3740                 case '8':
3741                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3742                                                    ~SPAN_UTFS,
3743                                                    SPAN_UTF16,
3744                                                    SPAN_UTF8,
3745                                                    0);
3746                     break;
3747                 default:
3748                     errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3749                     break;
3750                 }
3751             }
3752         } else if(0==strcmp(s, "*")) {
3753             strcpy(testNameLimit, "bad_string");
3754             for(j=0; j<whichSpansCount; ++j) {
3755                 if(whichSpansCount>1) {
3756                     sprintf(testNameLimit+10 /* strlen("bad_string") */,
3757                             "%%0x%3x",
3758                             whichSpans[j]);
3759                 }
3760                 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3761                 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3762             }
3763
3764             strcpy(testNameLimit, "contents");
3765             for(j=0; j<whichSpansCount; ++j) {
3766                 if(whichSpansCount>1) {
3767                     sprintf(testNameLimit+8 /* strlen("contents") */,
3768                             "%%0x%3x",
3769                             whichSpans[j]);
3770                 }
3771                 testSpanContents(sets_with_str, whichSpans[j], testName);
3772             }
3773         } else {
3774             UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3775             strcpy(testNameLimit, "test_string");
3776             for(j=0; j<whichSpansCount; ++j) {
3777                 if(whichSpansCount>1) {
3778                     sprintf(testNameLimit+11 /* strlen("test_string") */,
3779                             "%%0x%3x",
3780                             whichSpans[j]);
3781                 }
3782                 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3783             }
3784         }
3785     }
3786     for(j=0; j<SET_COUNT; ++j) {
3787         delete sets_with_str[j];
3788         delete sets[j];
3789     }
3790 }
3791
3792 // Test select patterns and strings, and test USET_SPAN_SIMPLE.
3793 void UnicodeSetTest::TestStringSpan() {
3794     static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3795     static const char *const string=
3796         "xx"
3797         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3798         "xx"
3799         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3800         "xx"
3801         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3802         "aaaa";
3803
3804     UErrorCode errorCode=U_ZERO_ERROR;
3805     UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3806     UnicodeSet set(pattern16, errorCode);
3807     if(U_FAILURE(errorCode)) {
3808         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3809         return;
3810     }
3811
3812     UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3813
3814     if(set.containsAll(string16)) {
3815         errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
3816     }
3817
3818     // Remove trailing "aaaa".
3819     string16.truncate(string16.length()-4);
3820     if(!set.containsAll(string16)) {
3821         errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
3822     }
3823
3824     string16=UNICODE_STRING_SIMPLE("byayaxya");
3825     const UChar *s16=string16.getBuffer();
3826     int32_t length16=string16.length();
3827     (void)length16;   // Suppress set but not used warning.
3828     if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3829         set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3830         set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3831         set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3832         set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3833         set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3834     ) {
3835         errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3836     }
3837
3838     pattern="[a{ab}{abc}{cd}]";
3839     pattern16=UnicodeString(pattern, -1, US_INV);
3840     set.applyPattern(pattern16, errorCode);
3841     if(U_FAILURE(errorCode)) {
3842         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3843         return;
3844     }
3845     string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
3846     s16=string16.getBuffer();
3847     length16=string16.length();
3848     if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3849         set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3850         set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3851     ) {
3852         errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3853     }
3854
3855     pattern="[d{cd}{bcd}{ab}]";
3856     pattern16=UnicodeString(pattern, -1, US_INV);
3857     set.applyPattern(pattern16, errorCode).freeze();
3858     if(U_FAILURE(errorCode)) {
3859         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3860         return;
3861     }
3862     string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
3863     s16=string16.getBuffer();
3864     length16=string16.length();
3865     if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3866         set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3867         set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3868     ) {
3869         errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3870     }
3871 }
3872
/**
 * Including collationroot.h fails here on Windows with
 *   1>c:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\include\driverspecs.h(142): error C2008: '$' : unexpected in macro definition
 * so we skip this test on Windows.
 *
 * The cause is that intltest builds with /Za, which disables language extensions,
 * which means that Windows header files cannot be used.
 */
3881 #if !UCONFIG_NO_COLLATION && !U_PLATFORM_HAS_WIN32_API
3882 #include "collationroot.h"
3883 #include "collationtailoring.h"
3884 #endif
3885
3886 void UnicodeSetTest::TestUCAUnsafeBackwards() {
3887 #if U_PLATFORM_HAS_WIN32_API
3888     infoln("Skipping TestUCAUnsafeBackwards() - can't include collationroot.h on Windows without language extensions!");
3889 #elif !UCONFIG_NO_COLLATION
3890     UErrorCode errorCode = U_ZERO_ERROR;
3891
3892     // Get the unsafeBackwardsSet
3893     const CollationCacheEntry *rootEntry = CollationRoot::getRootCacheEntry(errorCode);
3894     if(U_FAILURE(errorCode)) {
3895       dataerrln("FAIL: %s getting root cache entry", u_errorName(errorCode));
3896       return;
3897     }
3898     //const UVersionInfo &version = rootEntry->tailoring->version;
3899     const UnicodeSet *unsafeBackwardSet = rootEntry->tailoring->unsafeBackwardSet;
3900
3901     checkSerializeRoundTrip(*unsafeBackwardSet, errorCode);
3902
3903     if(!logKnownIssue("11891","UnicodeSet fails to round trip on CollationRoot...unsafeBackwards set")) {
3904         // simple test case
3905         // TODO(ticket #11891): Simplify this test function to this simple case. Rename it appropriately.
3906         // TODO(ticket #11891): Port test to Java. Is this a bug there, too?
3907         UnicodeSet surrogates;
3908         surrogates.add(0xd83a);  // a lead surrogate
3909         surrogates.add(0xdc00, 0xdfff);  // a range of trail surrogates
3910         UnicodeString pat;
3911         surrogates.toPattern(pat, FALSE);  // bad: [ 0xd83a, 0xdc00, 0x2d, 0xdfff ]
3912         // TODO: Probably fix either UnicodeSet::_generatePattern() or _appendToPat()
3913         // so that at least one type of surrogate code points are escaped,
3914         // or (minimally) so that adjacent lead+trail surrogate code points are escaped.
3915         errorCode = U_ZERO_ERROR;
3916         UnicodeSet s2;
3917         s2.applyPattern(pat, errorCode);  // looks like invalid range [ 0x1e800, 0x2d, 0xdfff ]
3918         if(U_FAILURE(errorCode)) {
3919             errln("FAIL: surrogates to/from pattern - %s", u_errorName(errorCode));
3920         } else {
3921             checkEqual(surrogates, s2, "surrogates to/from pattern");
3922         }
3923         // This occurs in the UCA unsafe-backwards set.
3924         checkRoundTrip(*unsafeBackwardSet);
3925     }
3926 #endif
3927 }