icuSources/test/intltest/usettest.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 ********************************************************************************
   5 *   Copyright (C) 1999-2016 International Business Machines Corporation and
   6 *   others. All Rights Reserved.
   7 ********************************************************************************
   8 *   Date        Name        Description
   9 *   10/20/99    alan        Creation.
  10 *   03/22/2000  Madhu       Added additional tests
  11 ********************************************************************************
  12 */
  13
  14 #include <stdio.h>
  15
  16 #include <string.h>
  17 #include "unicode/utypes.h"
  18 #include "usettest.h"
  19 #include "unicode/ucnv.h"
  20 #include "unicode/uniset.h"
  21 #include "unicode/uchar.h"
  22 #include "unicode/usetiter.h"
  23 #include "unicode/ustring.h"
  24 #include "unicode/parsepos.h"
  25 #include "unicode/symtable.h"
  26 #include "unicode/utf8.h"
  27 #include "unicode/utf16.h"
  28 #include "unicode/uversion.h"
  29 #include "cmemory.h"
  30 #include "hash.h"
  31
  32 #define TEST_ASSERT_SUCCESS(status) UPRV_BLOCK_MACRO_BEGIN { \
  33     if (U_FAILURE(status)) { \
  34         dataerrln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \
  35                   u_errorName(status)); \
  36     } \
  37 } UPRV_BLOCK_MACRO_END
  38
  39 #define TEST_ASSERT(expr) UPRV_BLOCK_MACRO_BEGIN { \
  40     if (!(expr)) { \
  41         dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); \
  42     } \
  43 } UPRV_BLOCK_MACRO_END
  44
  45 UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
  46     UnicodeString pat;
  47     set.toPattern(pat);
  48     return left + UnicodeSetTest::escape(pat);
  49 }
  50
  51 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
  52 }
  53
  54 UConverter *UnicodeSetTest::openUTF8Converter() {
  55     if(utf8Cnv==NULL) {
  56         UErrorCode errorCode=U_ZERO_ERROR;
  57         utf8Cnv=ucnv_open("UTF-8", &errorCode);
  58     }
  59     return utf8Cnv;
  60 }
  61
  62 UnicodeSetTest::~UnicodeSetTest() {
  63     ucnv_close(utf8Cnv);
  64 }
  65
  66 void
  67 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
  68                                const char* &name, char* /*par*/) {
  69     if (exec) {
  70         logln(u"TestSuite UnicodeSetTest");
  71     }
  72     TESTCASE_AUTO_BEGIN;
  73     TESTCASE_AUTO(TestPatterns);
  74     TESTCASE_AUTO(TestAddRemove);
  75     TESTCASE_AUTO(TestCategories);
  76     TESTCASE_AUTO(TestCloneEqualHash);
  77     TESTCASE_AUTO(TestMinimalRep);
  78     TESTCASE_AUTO(TestAPI);
  79     TESTCASE_AUTO(TestScriptSet);
  80     TESTCASE_AUTO(TestPropertySet);
  81     TESTCASE_AUTO(TestClone);
  82     TESTCASE_AUTO(TestExhaustive);
  83     TESTCASE_AUTO(TestToPattern);
  84     TESTCASE_AUTO(TestIndexOf);
  85     TESTCASE_AUTO(TestStrings);
  86     TESTCASE_AUTO(Testj2268);
  87     TESTCASE_AUTO(TestCloseOver);
  88     TESTCASE_AUTO(TestEscapePattern);
  89     TESTCASE_AUTO(TestInvalidCodePoint);
  90     TESTCASE_AUTO(TestSymbolTable);
  91     TESTCASE_AUTO(TestSurrogate);
  92     TESTCASE_AUTO(TestPosixClasses);
  93     TESTCASE_AUTO(TestIteration);
  94     TESTCASE_AUTO(TestFreezable);
  95     TESTCASE_AUTO(TestSpan);
  96     TESTCASE_AUTO(TestStringSpan);
  97     TESTCASE_AUTO(TestUCAUnsafeBackwards);
  98     TESTCASE_AUTO(TestIntOverflow);
  99     TESTCASE_AUTO(TestUnusedCcc);
 100     TESTCASE_AUTO(TestDeepPattern);
 101     TESTCASE_AUTO_END;
 102 }
 103
 104 static const char NOT[] = "%%%%";
 105
 106 /**
 107  * UVector was improperly copying contents
 108  * This code will crash this is still true
 109  */
 110 void UnicodeSetTest::Testj2268() {
 111   UnicodeSet t;
 112   t.add(UnicodeString("abc"));
 113   UnicodeSet test(t);
 114   UnicodeString ustrPat;
 115   test.toPattern(ustrPat, TRUE);
 116 }
 117
 118 /**
 119  * Test toPattern().
 120  */
 121 void UnicodeSetTest::TestToPattern() {
 122     UErrorCode ec = U_ZERO_ERROR;
 123
 124     // Test that toPattern() round trips with syntax characters and
 125     // whitespace.
 126     {
 127         static const char* OTHER_TOPATTERN_TESTS[] = {
 128             "[[:latin:]&[:greek:]]",
 129             "[[:latin:]-[:greek:]]",
 130             "[:nonspacing mark:]",
 131             NULL
 132         };
 133
 134         for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
 135             ec = U_ZERO_ERROR;
 136             UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
 137             if (U_FAILURE(ec)) {
 138                 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
 139                 continue;
 140             }
 141             checkPat(OTHER_TOPATTERN_TESTS[j], s);
 142         }
 143
 144         for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
 145             if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
 146
 147                 // check various combinations to make sure they all work.
 148                 if (i != 0 && !toPatternAux(i, i)){
 149                     continue;
 150                 }
 151                 if (!toPatternAux(0, i)){
 152                     continue;
 153                 }
 154                 if (!toPatternAux(i, 0xFFFF)){
 155                     continue;
 156                 }
 157             }
 158         }
 159     }
 160
 161     // Test pattern behavior of multicharacter strings.
 162     {
 163         ec = U_ZERO_ERROR;
 164         UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
 165
 166         // This loop isn't a loop.  It's here to make the compiler happy.
 167         // If you're curious, try removing it and changing the 'break'
 168         // statements (except for the last) to goto's.
 169         for (;;) {
 170             if (U_FAILURE(ec)) break;
 171             const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
 172             expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
 173
 174             s->add("ac");
 175             const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
 176             expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
 177
 178             s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
 179             if (U_FAILURE(ec)) break;
 180             const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
 181             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
 182
 183             s->add("[]");
 184             const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
 185             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
 186
 187             s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
 188             if (U_FAILURE(ec)) break;
 189             const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
 190             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
 191
 192             // j2189
 193             s->clear();
 194             s->add(UnicodeString("abc", ""));
 195             s->add(UnicodeString("abc", ""));
 196             const char* exp6[] = {"abc", NOT, "ab", NULL};
 197             expectToPattern(*s, "[{abc}]", exp6);
 198
 199             break;
 200         }
 201
 202         if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
 203         delete s;
 204     }
 205
 206     // JB#3400: For 2 character ranges prefer [ab] to [a-b]
 207     UnicodeSet s;
 208     s.add((UChar)97, (UChar)98); // 'a', 'b'
 209     expectToPattern(s, "[ab]", NULL);
 210 }
 211
 212 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
 213
 214     // use Integer.toString because Utility.hex doesn't handle ints
 215     UnicodeString pat = "";
 216     // TODO do these in hex
 217     //String source = "0x" + Integer.toString(start,16).toUpperCase();
 218     //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
 219     UnicodeString source;
 220     source = source + (uint32_t)start;
 221     if (start != end)
 222         source = source + ".." + (uint32_t)end;
 223     UnicodeSet testSet;
 224     testSet.add(start, end);
 225     return checkPat(source, testSet);
 226 }
 227
 228 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
 229                                const UnicodeSet& testSet) {
 230     // What we want to make sure of is that a pattern generated
 231     // by toPattern(), with or without escaped unprintables, can
 232     // be passed back into the UnicodeSet constructor.
 233     UnicodeString pat0;
 234
 235     testSet.toPattern(pat0, TRUE);
 236
 237     if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
 238
 239     //String pat1 = unescapeLeniently(pat0);
 240     //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
 241
 242     UnicodeString pat2;
 243     testSet.toPattern(pat2, FALSE);
 244     if (!checkPat(source, testSet, pat2)) return FALSE;
 245
 246     //String pat3 = unescapeLeniently(pat2);
 247     // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
 248
 249     //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
 250     logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
 251     return TRUE;
 252 }
 253
 254 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
 255                                const UnicodeSet& testSet,
 256                                const UnicodeString& pat) {
 257     UErrorCode ec = U_ZERO_ERROR;
 258     UnicodeSet testSet2(pat, ec);
 259     if (testSet2 != testSet) {
 260         errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
 261         return FALSE;
 262     }
 263     return TRUE;
 264 }
 265
 266 void
 267 UnicodeSetTest::TestPatterns(void) {
 268     UnicodeSet set;
 269     expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""),  "km");
 270     expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""),  "aczz");
 271     expectPattern(set, UnicodeString("[a\\-z]", ""),  "--aazz");
 272     expectPattern(set, UnicodeString("[-az]", ""),  "--aazz");
 273     expectPattern(set, UnicodeString("[az-]", ""),  "--aazz");
 274     expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
 275
 276     // Throw in a test of complement
 277     set.complement();
 278     UnicodeString exp;
 279     exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
 280     expectPairs(set, exp);
 281 }
 282
 283 void
 284 UnicodeSetTest::TestCategories(void) {
 285     UErrorCode status = U_ZERO_ERROR;
 286     const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
 287     UnicodeSet set(pat, status);
 288     if (U_FAILURE(status)) {
 289         dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
 290         return;
 291     } else {
 292         expectContainment(set, pat, "ABC", "abc");
 293     }
 294
 295     UChar32 i;
 296     int32_t failures = 0;
 297     // Make sure generation of L doesn't pollute cached Lu set
 298     // First generate L, then Lu
 299     set.applyPattern("[:L:]", status);
 300     if (U_FAILURE(status)) { errln("FAIL"); return; }
 301     for (i=0; i<0x200; ++i) {
 302         UBool l = u_isalpha((UChar)i);
 303         if (l != set.contains(i)) {
 304             errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
 305                   set.contains(i));
 306             if (++failures == 10) break;
 307         }
 308     }
 309
 310     set.applyPattern("[:Lu:]", status);
 311     if (U_FAILURE(status)) { errln("FAIL"); return; }
 312     for (i=0; i<0x200; ++i) {
 313         UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
 314         if (lu != set.contains(i)) {
 315             errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
 316                   set.contains(i));
 317             if (++failures == 20) break;
 318         }
 319     }
 320 }
 321 void
 322 UnicodeSetTest::TestCloneEqualHash(void) {
 323     UErrorCode status = U_ZERO_ERROR;
 324     // set1 and set2 used to be built with the obsolete constructor taking
 325     // UCharCategory values; replaced with pattern constructors
 326     // markus 20030502
 327     UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); //  :Ll: Letter, lowercase
 328     UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); //  Letter, lowercase
 329     if (U_FAILURE(status)){
 330         dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
 331         return;
 332     }
 333     UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status);   //Number, Decimal digit
 334     UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status);   //Number, Decimal digit
 335     if (U_FAILURE(status)){
 336         errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
 337         return;
 338     }
 339
 340     if (*set1 != *set1a) {
 341         errln("FAIL: category constructor for Ll broken");
 342     }
 343     if (*set2 != *set2a) {
 344         errln("FAIL: category constructor for Nd broken");
 345     }
 346     delete set1a;
 347     delete set2a;
 348
 349     logln("Testing copy construction");
 350     UnicodeSet *set1copy=new UnicodeSet(*set1);
 351     if(*set1 != *set1copy || *set1 == *set2 ||
 352         getPairs(*set1) != getPairs(*set1copy) ||
 353         set1->hashCode() != set1copy->hashCode()){
 354         errln("FAIL : Error in copy construction");
 355         return;
 356     }
 357
 358     logln("Testing =operator");
 359     UnicodeSet set1equal=*set1;
 360     UnicodeSet set2equal=*set2;
 361     if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
 362         set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
 363         errln("FAIL: Error in =operator");
 364     }
 365
 366     logln("Testing clone()");
 367     UnicodeSet *set1clone=set1->clone();
 368     UnicodeSet *set2clone=set2->clone();
 369     if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
 370         *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
 371         *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
 372         errln("FAIL: Error in clone");
 373     }
 374
 375     logln("Testing hashcode");
 376     if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
 377         set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
 378         set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
 379         set1->hashCode() == set2->hashCode()  || set1copy->hashCode() == set2->hashCode() ||
 380         set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
 381         errln("FAIL: Error in hashCode()");
 382     }
 383
 384     delete set1;
 385     delete set1copy;
 386     delete set2;
 387     delete set1clone;
 388     delete set2clone;
 389
 390
 391 }
 392 void
 393 UnicodeSetTest::TestAddRemove(void) {
 394     UnicodeSet set; // Construct empty set
 395     doAssert(set.isEmpty() == TRUE, "set should be empty");
 396     doAssert(set.size() == 0, "size should be 0");
 397     set.complement();
 398     doAssert(set.size() == 0x110000, "size should be 0x110000");
 399     set.clear();
 400     set.add(0x0061, 0x007a);
 401     expectPairs(set, "az");
 402     doAssert(set.isEmpty() == FALSE, "set should not be empty");
 403     doAssert(set.size() != 0, "size should not be equal to 0");
 404     doAssert(set.size() == 26, "size should be equal to 26");
 405     set.remove(0x006d, 0x0070);
 406     expectPairs(set, "alqz");
 407     doAssert(set.size() == 22, "size should be equal to 22");
 408     set.remove(0x0065, 0x0067);
 409     expectPairs(set, "adhlqz");
 410     doAssert(set.size() == 19, "size should be equal to 19");
 411     set.remove(0x0064, 0x0069);
 412     expectPairs(set, "acjlqz");
 413     doAssert(set.size() == 16, "size should be equal to 16");
 414     set.remove(0x0063, 0x0072);
 415     expectPairs(set, "absz");
 416     doAssert(set.size() == 10, "size should be equal to 10");
 417     set.add(0x0066, 0x0071);
 418     expectPairs(set, "abfqsz");
 419     doAssert(set.size() == 22, "size should be equal to 22");
 420     set.remove(0x0061, 0x0067);
 421     expectPairs(set, "hqsz");
 422     set.remove(0x0061, 0x007a);
 423     expectPairs(set, "");
 424     doAssert(set.isEmpty() == TRUE, "set should be empty");
 425     doAssert(set.size() == 0, "size should be 0");
 426     set.add(0x0061);
 427     doAssert(set.isEmpty() == FALSE, "set should not be empty");
 428     doAssert(set.size() == 1, "size should not be equal to 1");
 429     set.add(0x0062);
 430     set.add(0x0063);
 431     expectPairs(set, "ac");
 432     doAssert(set.size() == 3, "size should not be equal to 3");
 433     set.add(0x0070);
 434     set.add(0x0071);
 435     expectPairs(set, "acpq");
 436     doAssert(set.size() == 5, "size should not be equal to 5");
 437     set.clear();
 438     expectPairs(set, "");
 439     doAssert(set.isEmpty() == TRUE, "set should be empty");
 440     doAssert(set.size() == 0, "size should be 0");
 441
 442     // Try removing an entire set from another set
 443     expectPattern(set, "[c-x]", "cx");
 444     UnicodeSet set2;
 445     expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
 446     set.removeAll(set2);
 447     expectPairs(set, "deluxx");
 448
 449     // Try adding an entire set to another set
 450     expectPattern(set, "[jackiemclean]", "aacceein");
 451     expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
 452     set.addAll(set2);
 453     expectPairs(set, "aacehort");
 454     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
 455
 456     // Try retaining an set of elements contained in another set (intersection)
 457     UnicodeSet set3;
 458     expectPattern(set3, "[a-c]", "ac");
 459     doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
 460     set3.remove(0x0062);
 461     expectPairs(set3, "aacc");
 462     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
 463     set.retainAll(set3);
 464     expectPairs(set, "aacc");
 465     doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
 466     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
 467     set.clear();
 468     doAssert(set.size() != set3.size(), "set.size() != set3.size()");
 469
 470     // Test commutativity
 471     expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
 472     expectPattern(set2, "[jackiemclean]", "aacceein");
 473     set.addAll(set2);
 474     expectPairs(set, "aacehort");
 475     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
 476
 477
 478
 479
 480 }
 481
 482 /**
 483  * Make sure minimal representation is maintained.
 484  */
 485 void UnicodeSetTest::TestMinimalRep() {
 486     UErrorCode status = U_ZERO_ERROR;
 487     // This is pretty thoroughly tested by checkCanonicalRep()
 488     // run against the exhaustive operation results.  Use the code
 489     // here for debugging specific spot problems.
 490
 491     // 1 overlap against 2
 492     UnicodeSet set("[h-km-q]", status);
 493     if (U_FAILURE(status)) { errln("FAIL"); return; }
 494     UnicodeSet set2("[i-o]", status);
 495     if (U_FAILURE(status)) { errln("FAIL"); return; }
 496     set.addAll(set2);
 497     expectPairs(set, "hq");
 498     // right
 499     set.applyPattern("[a-m]", status);
 500     if (U_FAILURE(status)) { errln("FAIL"); return; }
 501     set2.applyPattern("[e-o]", status);
 502     if (U_FAILURE(status)) { errln("FAIL"); return; }
 503     set.addAll(set2);
 504     expectPairs(set, "ao");
 505     // left
 506     set.applyPattern("[e-o]", status);
 507     if (U_FAILURE(status)) { errln("FAIL"); return; }
 508     set2.applyPattern("[a-m]", status);
 509     if (U_FAILURE(status)) { errln("FAIL"); return; }
 510     set.addAll(set2);
 511     expectPairs(set, "ao");
 512     // 1 overlap against 3
 513     set.applyPattern("[a-eg-mo-w]", status);
 514     if (U_FAILURE(status)) { errln("FAIL"); return; }
 515     set2.applyPattern("[d-q]", status);
 516     if (U_FAILURE(status)) { errln("FAIL"); return; }
 517     set.addAll(set2);
 518     expectPairs(set, "aw");
 519 }
 520
 521 void UnicodeSetTest::TestAPI() {
 522     UErrorCode status = U_ZERO_ERROR;
 523     // default ct
 524     UnicodeSet set;
 525     if (!set.isEmpty() || set.getRangeCount() != 0) {
 526         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
 527               set);
 528     }
 529
 530     // clear(), isEmpty()
 531     set.add(0x0061);
 532     if (set.isEmpty()) {
 533         errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
 534               set);
 535     }
 536     set.clear();
 537     if (!set.isEmpty()) {
 538         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
 539               set);
 540     }
 541
 542     // size()
 543     set.clear();
 544     if (set.size() != 0) {
 545         errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
 546               ": " + set);
 547     }
 548     set.add(0x0061);
 549     if (set.size() != 1) {
 550         errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
 551               ": " + set);
 552     }
 553     set.add(0x0031, 0x0039);
 554     if (set.size() != 10) {
 555         errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
 556               ": " + set);
 557     }
 558
 559     // contains(first, last)
 560     set.clear();
 561     set.applyPattern("[A-Y 1-8 b-d l-y]", status);
 562     if (U_FAILURE(status)) { errln("FAIL"); return; }
 563     for (int32_t i = 0; i<set.getRangeCount(); ++i) {
 564         UChar32 a = set.getRangeStart(i);
 565         UChar32 b = set.getRangeEnd(i);
 566         if (!set.contains(a, b)) {
 567             errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
 568                   " but doesn't: " + set);
 569         }
 570         if (set.contains((UChar32)(a-1), b)) {
 571             errln((UnicodeString)"FAIL, shouldn't contain " +
 572                   (unsigned short)(a-1) + '-' + (unsigned short)b +
 573                   " but does: " + set);
 574         }
 575         if (set.contains(a, (UChar32)(b+1))) {
 576             errln((UnicodeString)"FAIL, shouldn't contain " +
 577                   (unsigned short)a + '-' + (unsigned short)(b+1) +
 578                   " but does: " + set);
 579         }
 580     }
 581
 582     // Ported InversionList test.
 583     UnicodeSet a((UChar32)3,(UChar32)10);
 584     UnicodeSet b((UChar32)7,(UChar32)15);
 585     UnicodeSet c;
 586
 587     logln((UnicodeString)"a [3-10]: " + a);
 588     logln((UnicodeString)"b [7-15]: " + b);
 589     c = a;
 590     c.addAll(b);
 591     UnicodeSet exp((UChar32)3,(UChar32)15);
 592     if (c == exp) {
 593         logln((UnicodeString)"c.set(a).add(b): " + c);
 594     } else {
 595         errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
 596     }
 597     c.complement();
 598     exp.set((UChar32)0, (UChar32)2);
 599     exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
 600     if (c == exp) {
 601         logln((UnicodeString)"c.complement(): " + c);
 602     } else {
 603         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
 604     }
 605     c.complement();
 606     exp.set((UChar32)3, (UChar32)15);
 607     if (c == exp) {
 608         logln((UnicodeString)"c.complement(): " + c);
 609     } else {
 610         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
 611     }
 612     c = a;
 613     c.complementAll(b);
 614     exp.set((UChar32)3,(UChar32)6);
 615     exp.add((UChar32)11,(UChar32) 15);
 616     if (c == exp) {
 617         logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
 618     } else {
 619         errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
 620     }
 621
 622     exp = c;
 623     bitsToSet(setToBits(c), c);
 624     if (c == exp) {
 625         logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
 626     } else {
 627         errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
 628     }
 629
 630     // Additional tests for coverage JB#2118
 631     //UnicodeSet::complement(class UnicodeString const &)
 632     //UnicodeSet::complementAll(class UnicodeString const &)
 633     //UnicodeSet::containsNone(class UnicodeSet const &)
 634     //UnicodeSet::containsNone(long,long)
 635     //UnicodeSet::containsSome(class UnicodeSet const &)
 636     //UnicodeSet::containsSome(long,long)
 637     //UnicodeSet::removeAll(class UnicodeString const &)
 638     //UnicodeSet::retain(long)
 639     //UnicodeSet::retainAll(class UnicodeString const &)
 640     //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
 641     //UnicodeSetIterator::getString(void)
 642     set.clear();
 643     set.complement("ab");
 644     exp.applyPattern("[{ab}]", status);
 645     if (U_FAILURE(status)) { errln("FAIL"); return; }
 646     if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
 647
 648     UnicodeSetIterator iset(set);
 649     if (!iset.next() || !iset.isString()) {
 650         errln("FAIL: UnicodeSetIterator::next/isString");
 651     } else if (iset.getString() != "ab") {
 652         errln("FAIL: UnicodeSetIterator::getString");
 653     }
 654
 655     set.add((UChar32)0x61, (UChar32)0x7A);
 656     set.complementAll("alan");
 657     exp.applyPattern("[{ab}b-kmo-z]", status);
 658     if (U_FAILURE(status)) { errln("FAIL"); return; }
 659     if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
 660
 661     exp.applyPattern("[a-z]", status);
 662     if (U_FAILURE(status)) { errln("FAIL"); return; }
 663     if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
 664     if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
 665     exp.applyPattern("[aln]", status);
 666     if (U_FAILURE(status)) { errln("FAIL"); return; }
 667     if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
 668     if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
 669
 670     if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
 671         errln("FAIL: containsNone(UChar32, UChar32)");
 672     }
 673     if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
 674         errln("FAIL: containsSome(UChar32, UChar32)");
 675     }
 676     if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
 677         errln("FAIL: containsNone(UChar32, UChar32)");
 678     }
 679     if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
 680         errln("FAIL: containsSome(UChar32, UChar32)");
 681     }
 682
 683     set.removeAll("liu");
 684     exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
 685     if (U_FAILURE(status)) { errln("FAIL"); return; }
 686     if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
 687
 688     set.retainAll("star");
 689     exp.applyPattern("[rst]", status);
 690     if (U_FAILURE(status)) { errln("FAIL"); return; }
 691     if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
 692
 693     set.retain((UChar32)0x73);
 694     exp.applyPattern("[s]", status);
 695     if (U_FAILURE(status)) { errln("FAIL"); return; }
 696     if (set != exp) { errln("FAIL: retain('s')"); return; }
 697
 698     uint16_t buf[32];
 699     int32_t slen = set.serialize(buf, UPRV_LENGTHOF(buf), status);
 700     if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
 701     if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
 702         errln("FAIL: serialize");
 703         return;
 704     }
 705
 706     // Conversions to and from USet
 707     UnicodeSet *uniset = &set;
 708     USet *uset = uniset->toUSet();
 709     TEST_ASSERT((void *)uset == (void *)uniset);
 710     UnicodeSet *setx = UnicodeSet::fromUSet(uset);
 711     TEST_ASSERT((void *)setx == (void *)uset);
 712     const UnicodeSet *constSet = uniset;
 713     const USet *constUSet = constSet->toUSet();
 714     TEST_ASSERT((void *)constUSet == (void *)constSet);
 715     const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
 716     TEST_ASSERT((void *)constSetx == (void *)constUSet);
 717
 718     // span(UnicodeString) and spanBack(UnicodeString) convenience methods
 719     UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
 720     UnicodeSet ac(0x61, 0x63);
 721     ac.remove(0x62).freeze();
 722     if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
 723         ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
 724         ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
 725         ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
 726         ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
 727         ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
 728         ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
 729         ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
 730         ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
 731         ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
 732     ) {
 733         errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
 734     }
 735     if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
 736         ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
 737         ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
 738         ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
 739         ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
 740         ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
 741         ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
 742         ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
 743         ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
 744         ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
 745     ) {
 746         errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
 747     }
 748 }
 749
 750 void UnicodeSetTest::TestIteration() {
 751     UErrorCode ec = U_ZERO_ERROR;
 752     int i = 0;
 753     int outerLoop;
 754
 755     // 6 code points, 3 ranges, 2 strings, 8 total elements
 756     //   Iteration will access them in sorted order -  a, b, c, y, z, U0001abcd, "str1", "str2"
 757     UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
 758     TEST_ASSERT_SUCCESS(ec);
 759     UnicodeSetIterator it(set);
 760
 761     for (outerLoop=0; outerLoop<3; outerLoop++) {
 762         // Run the test multiple times, to check that iterator.reset() is working.
 763         for (i=0; i<10; i++) {
 764             UBool         nextv        = it.next();
 765             UBool         isString     = it.isString();
 766             int32_t       codePoint    = it.getCodepoint();
 767             //int32_t       codePointEnd = it.getCodepointEnd();
 768             UnicodeString s   = it.getString();
 769             switch (i) {
 770             case 0:
 771                 TEST_ASSERT(nextv == TRUE);
 772                 TEST_ASSERT(isString == FALSE);
 773                 TEST_ASSERT(codePoint==0x61);
 774                 TEST_ASSERT(s == "a");
 775                 break;
 776             case 1:
 777                 TEST_ASSERT(nextv == TRUE);
 778                 TEST_ASSERT(isString == FALSE);
 779                 TEST_ASSERT(codePoint==0x62);
 780                 TEST_ASSERT(s == "b");
 781                 break;
 782             case 2:
 783                 TEST_ASSERT(nextv == TRUE);
 784                 TEST_ASSERT(isString == FALSE);
 785                 TEST_ASSERT(codePoint==0x63);
 786                 TEST_ASSERT(s == "c");
 787                 break;
 788             case 3:
 789                 TEST_ASSERT(nextv == TRUE);
 790                 TEST_ASSERT(isString == FALSE);
 791                 TEST_ASSERT(codePoint==0x79);
 792                 TEST_ASSERT(s == "y");
 793                 break;
 794             case 4:
 795                 TEST_ASSERT(nextv == TRUE);
 796                 TEST_ASSERT(isString == FALSE);
 797                 TEST_ASSERT(codePoint==0x7a);
 798                 TEST_ASSERT(s == "z");
 799                 break;
 800             case 5:
 801                 TEST_ASSERT(nextv == TRUE);
 802                 TEST_ASSERT(isString == FALSE);
 803                 TEST_ASSERT(codePoint==0x1abcd);
 804                 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
 805                 break;
 806             case 6:
 807                 TEST_ASSERT(nextv == TRUE);
 808                 TEST_ASSERT(isString == TRUE);
 809                 TEST_ASSERT(s == "str1");
 810                 break;
 811             case 7:
 812                 TEST_ASSERT(nextv == TRUE);
 813                 TEST_ASSERT(isString == TRUE);
 814                 TEST_ASSERT(s == "str2");
 815                 break;
 816             case 8:
 817                 TEST_ASSERT(nextv == FALSE);
 818                 break;
 819             case 9:
 820                 TEST_ASSERT(nextv == FALSE);
 821                 break;
 822             }
 823         }
 824         it.reset();  // prepare to run the iteration again.
 825     }
 826 }
 827
 828
 829
 830
 831 void UnicodeSetTest::TestStrings() {
 832     UErrorCode ec = U_ZERO_ERROR;
 833
 834     UnicodeSet* testList[] = {
 835         UnicodeSet::createFromAll("abc"),
 836         new UnicodeSet("[a-c]", ec),
 837
 838         &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
 839         new UnicodeSet("[{ll}{ch}a-z]", ec),
 840
 841         UnicodeSet::createFrom("ab}c"),
 842         new UnicodeSet("[{ab\\}c}]", ec),
 843
 844         &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
 845         new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
 846
 847         NULL
 848     };
 849
 850     if (U_FAILURE(ec)) {
 851         errln("FAIL: couldn't construct test sets");
 852     }
 853
 854     for (int32_t i = 0; testList[i] != NULL; i+=2) {
 855         if (U_SUCCESS(ec)) {
 856             UnicodeString pat0, pat1;
 857             testList[i]->toPattern(pat0, TRUE);
 858             testList[i+1]->toPattern(pat1, TRUE);
 859             if (*testList[i] == *testList[i+1]) {
 860                 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
 861             } else {
 862                 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
 863             }
 864         }
 865         delete testList[i];
 866         delete testList[i+1];
 867     }
 868 }
 869
 870 /**
 871  * Test the [:Latin:] syntax.
 872  */
 873 void UnicodeSetTest::TestScriptSet() {
 874     expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
 875
 876     expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
 877
 878     /* Jitterbug 1423 */
 879     expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
 880
 881 }
 882
/**
 * Test property-based set patterns ([:prop:], \p{...}, \P{...}, \N{...}),
 * together with some regex-compatibility and isolated-surrogate patterns.
 *
 * The DATA table is read three entries at a time: a pattern, characters
 * listed as "in", and characters listed as "not in". Each triple is handed
 * to expectContainment().
 */
void UnicodeSetTest::TestPropertySet() {
    static const char* const DATA[] = {
        // Pattern, Chars IN, Chars NOT in

        "[:Latin:]",
        "aA",
        "\\u0391\\u03B1",

        "[\\p{Greek}]",
        "\\u0391\\u03B1",
        "aA",

        "\\P{ GENERAL Category = upper case letter }",
        "abc",
        "ABC",

#if !UCONFIG_NO_NORMALIZATION
        // Combining class: @since ICU 2.2
        // Check both symbolic and numeric
        "\\p{ccc=Nukta}",
        "\\u0ABC",
        "abc",

        "\\p{Canonical Combining Class = 11}",
        "\\u05B1",
        "\\u05B2",

        "[:c c c = iota subscript :]",
        "\\u0345",
        "xyz",
#endif

        // Bidi class: @since ICU 2.2
        "\\p{bidiclass=lefttoright}",
        "abc",
        "\\u0671\\u0672",

        // Binary properties: @since ICU 2.2
        "\\p{ideographic}",
        "\\u4E0A",
        "x",

        "[:math=false:]",
        "q)*(",
        // weiv: )(and * were removed from math in Unicode 4.0.1
        //"(*+)",
        "+<>^",

        // JB#1767 \N{}, \p{ASCII}
        "[:Ascii:]",
        "abc\\u0000\\u007F",
        "\\u0080\\u4E00",

        "[\\N{ latin small letter  a  }[:name= latin small letter z:]]",
        "az",
        "qrs",

        // JB#2015
        "[:any:]",
        "a\\U0010FFFF",
        "",

        "[:nv=0.5:]",
        "\\u00BD\\u0F2A",
        "\\u00BC",

        // JB#2653: Age
        "[:Age=1.1:]",
        "\\u03D6", // 1.1
        "\\u03D8\\u03D9", // 3.2

        "[:Age=3.1:]",
        "\\u1800\\u3400\\U0002f800",
        "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",

        // JB#2350: Case_Sensitive
        "[:Case Sensitive:]",
        "A\\u1FFC\\U00010410",
        ";\\u00B4\\U00010500",

        // JB#2832: C99-compatibility props
        "[:blank:]",
        " \\u0009",
        "1-9A-Z",

        "[:graph:]",
        "19AZ",
        " \\u0003\\u0007\\u0009\\u000A\\u000D",

        "[:punct:]",
        "!@#%&*()[]{}-_\\/;:,.?'\"",
        "09azAZ",

        "[:xdigit:]",
        "09afAF",
        "gG!",

        // Regex compatibility test
        "[-b]", // leading '-' is literal
        "-b",
        "ac",

        "[^-b]", // leading '-' is literal
        "ac",
        "-b",

        "[b-]", // trailing '-' is literal
        "-b",
        "ac",

        "[^b-]", // trailing '-' is literal
        "ac",
        "-b",

        "[a-b-]", // trailing '-' is literal
        "ab-",
        "c=",

        "[[a-q]&[p-z]-]", // trailing '-' is literal
        "pq-",
        "or=",

        "[\\s|\\)|:|$|\\>]", // from regex tests
        "s|):$>",
        "abc",

        "[\\uDC00cd]", // JB#2906: isolated trail at start
        "cd\\uDC00",
        "ab\\uD800\\U00010000",

        "[ab\\uD800]", // JB#2906: isolated lead at end
        "ab\\uD800",
        "cd\\uDC00\\U00010000",

        "[ab\\uD800cd]", // JB#2906: isolated lead in middle
        "abcd\\uD800",
        "ef\\uDC00\\U00010000",

        "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
        "abcd\\uDC00",
        "ef\\uD800\\U00010000",

#if !UCONFIG_NO_NORMALIZATION
        "[:^lccc=0:]", // Lead canonical class
        "\\u0300\\u0301",
        "abcd\\u00c0\\u00c5",

        "[:^tccc=0:]", // Trail canonical class
        "\\u0300\\u0301\\u00c0\\u00c5",
        "abcd",

        "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
        "\\u0300\\u0301\\u00c0\\u00c5",
        "abcd",

        "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
        "",
        "abcd\\u0300\\u0301\\u00c0\\u00c5",

        "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
        "\\u0F73\\u0F75\\u0F81",
        "abcd\\u0300\\u0301\\u00c0\\u00c5",
#endif /* !UCONFIG_NO_NORMALIZATION */

        "[:Assigned:]",
        "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
        "\\u0888\\uFDD3\\uFFFE\\U00050005",

        // Script_Extensions, new in Unicode 6.0
        "[:scx=Arab:]",
        "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
        "\\u061D\\uFDEF\\uFDFE",

        // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
        // so scx-sc is missing U+FDF2.
        "[[:Script_Extensions=Arabic:]-[:Arab:]]",
        "\\u0640\\u064B\\u0650\\u0655",
        "\\uFDF2"
    };

    static const int32_t DATA_LEN = UPRV_LENGTHOF(DATA);

    // Step through the table one (pattern, in, not-in) triple at a time.
    // The table length is expected to be a multiple of 3.
    for (int32_t i=0; i<DATA_LEN; i+=3) {
        expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
                          CharsToUnicodeString(DATA[i+2]));
    }
}
1073
1074 /**
1075   * Test that Posix style character classes [:digit:], etc.
1076   *   have the Unicode definitions from TR 18.
1077   */
1078 void UnicodeSetTest::TestPosixClasses() {
1079     {
1080         UErrorCode status = U_ZERO_ERROR;
1081         UnicodeSet s1("[:alpha:]", status);
1082         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
1083         TEST_ASSERT_SUCCESS(status);
1084         TEST_ASSERT(s1==s2);
1085     }
1086     {
1087         UErrorCode status = U_ZERO_ERROR;
1088         UnicodeSet s1("[:lower:]", status);
1089         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
1090         TEST_ASSERT_SUCCESS(status);
1091         TEST_ASSERT(s1==s2);
1092     }
1093     {
1094         UErrorCode status = U_ZERO_ERROR;
1095         UnicodeSet s1("[:upper:]", status);
1096         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
1097         TEST_ASSERT_SUCCESS(status);
1098         TEST_ASSERT(s1==s2);
1099     }
1100     {
1101         UErrorCode status = U_ZERO_ERROR;
1102         UnicodeSet s1("[:punct:]", status);
1103         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
1104         TEST_ASSERT_SUCCESS(status);
1105         TEST_ASSERT(s1==s2);
1106     }
1107     {
1108         UErrorCode status = U_ZERO_ERROR;
1109         UnicodeSet s1("[:digit:]", status);
1110         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
1111         TEST_ASSERT_SUCCESS(status);
1112         TEST_ASSERT(s1==s2);
1113     }
1114     {
1115         UErrorCode status = U_ZERO_ERROR;
1116         UnicodeSet s1("[:xdigit:]", status);
1117         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
1118         TEST_ASSERT_SUCCESS(status);
1119         TEST_ASSERT(s1==s2);
1120     }
1121     {
1122         UErrorCode status = U_ZERO_ERROR;
1123         UnicodeSet s1("[:alnum:]", status);
1124         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
1125         TEST_ASSERT_SUCCESS(status);
1126         TEST_ASSERT(s1==s2);
1127     }
1128     {
1129         UErrorCode status = U_ZERO_ERROR;
1130         UnicodeSet s1("[:space:]", status);
1131         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
1132         TEST_ASSERT_SUCCESS(status);
1133         TEST_ASSERT(s1==s2);
1134     }
1135     {
1136         UErrorCode status = U_ZERO_ERROR;
1137         UnicodeSet s1("[:blank:]", status);
1138         TEST_ASSERT_SUCCESS(status);
1139         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
1140             status);
1141         TEST_ASSERT_SUCCESS(status);
1142         TEST_ASSERT(s1==s2);
1143     }
1144     {
1145         UErrorCode status = U_ZERO_ERROR;
1146         UnicodeSet s1("[:cntrl:]", status);
1147         TEST_ASSERT_SUCCESS(status);
1148         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
1149         TEST_ASSERT_SUCCESS(status);
1150         TEST_ASSERT(s1==s2);
1151     }
1152     {
1153         UErrorCode status = U_ZERO_ERROR;
1154         UnicodeSet s1("[:graph:]", status);
1155         TEST_ASSERT_SUCCESS(status);
1156         UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
1157         TEST_ASSERT_SUCCESS(status);
1158         TEST_ASSERT(s1==s2);
1159     }
1160     {
1161         UErrorCode status = U_ZERO_ERROR;
1162         UnicodeSet s1("[:print:]", status);
1163         TEST_ASSERT_SUCCESS(status);
1164         UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
1165         TEST_ASSERT_SUCCESS(status);
1166         TEST_ASSERT(s1==s2);
1167     }
1168 }
1169 /**
1170  * Test cloning of UnicodeSet.  For C++, we test the copy constructor.
1171  */
1172 void UnicodeSetTest::TestClone() {
1173     UErrorCode ec = U_ZERO_ERROR;
1174     UnicodeSet s("[abcxyz]", ec);
1175     UnicodeSet t(s);
1176     expectContainment(t, "abc", "def");
1177 }
1178
1179 /**
1180  * Test the indexOf() and charAt() methods.
1181  */
1182 void UnicodeSetTest::TestIndexOf() {
1183     UErrorCode ec = U_ZERO_ERROR;
1184     UnicodeSet set("[a-cx-y3578]", ec);
1185     if (U_FAILURE(ec)) {
1186         errln("FAIL: UnicodeSet constructor");
1187         return;
1188     }
1189     for (int32_t i=0; i<set.size(); ++i) {
1190         UChar32 c = set.charAt(i);
1191         if (set.indexOf(c) != i) {
1192             errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1193                 i, c, set.indexOf(c));
1194         }
1195     }
1196     UChar32 c = set.charAt(set.size());
1197     if (c != -1) {
1198         errln("FAIL: charAt(<out of range>) = %X", c);
1199     }
1200     int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
1201     if (j != -1) {
1202         errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1203     }
1204 }
1205
1206 /**
1207  * Test closure API.
1208  */
1209 void UnicodeSetTest::TestCloseOver() {
1210     UErrorCode ec = U_ZERO_ERROR;
1211
1212     char CASE[] = {(char)USET_CASE_INSENSITIVE};
1213     char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1214     const char* DATA[] = {
1215         // selector, input, output
1216         CASE,
1217         "[aq\\u00DF{Bc}{bC}{Fi}]",
1218         "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]",  // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
1219
1220         CASE,
1221         "[\\u01F1]", // 'DZ'
1222         "[\\u01F1\\u01F2\\u01F3]",
1223
1224         CASE,
1225         "[\\u1FB4]",
1226         "[\\u1FB4{\\u03AC\\u03B9}]",
1227
1228         CASE,
1229         "[{F\\uFB01}]",
1230         "[\\uFB03{ffi}]",
1231
1232         CASE, // make sure binary search finds limits
1233         "[a\\uFF3A]",
1234         "[aA\\uFF3A\\uFF5A]",
1235
1236         CASE,
1237         "[a-z]","[A-Za-z\\u017F\\u212A]",
1238         CASE,
1239         "[abc]","[A-Ca-c]",
1240         CASE,
1241         "[ABC]","[A-Ca-c]",
1242
1243         CASE, "[i]", "[iI]",
1244
1245         CASE, "[\\u0130]",          "[\\u0130{i\\u0307}]", // dotted I
1246         CASE, "[{i\\u0307}]",       "[\\u0130{i\\u0307}]", // i with dot
1247
1248         CASE, "[\\u0131]",          "[\\u0131]", // dotless i
1249
1250         CASE, "[\\u0390]",          "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1251
1252         CASE, "[\\u03c2]",          "[\\u03a3\\u03c2\\u03c3]", // sigmas
1253
1254         CASE, "[\\u03f2]",          "[\\u03f2\\u03f9]", // lunate sigmas
1255
1256         CASE, "[\\u03f7]",          "[\\u03f7\\u03f8]",
1257
1258         CASE, "[\\u1fe3]",          "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1259
1260         CASE, "[\\ufb05]",          "[\\ufb05\\ufb06{st}]",
1261         CASE, "[{st}]",             "[\\ufb05\\ufb06{st}]",
1262
1263         CASE, "[\\U0001044F]",      "[\\U00010427\\U0001044F]",
1264
1265         CASE, "[{a\\u02BE}]",       "[\\u1E9A{a\\u02BE}]", // first in sorted table
1266
1267         CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1268
1269 #if !UCONFIG_NO_FILE_IO
1270         CASE_MAPPINGS,
1271         "[aq\\u00DF{Bc}{bC}{Fi}]",
1272         "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1273 #endif
1274
1275         CASE_MAPPINGS,
1276         "[\\u01F1]", // 'DZ'
1277         "[\\u01F1\\u01F2\\u01F3]",
1278
1279         CASE_MAPPINGS,
1280         "[a-z]",
1281         "[A-Za-z]",
1282
1283         NULL
1284     };
1285
1286     UnicodeSet s;
1287     UnicodeSet t;
1288     UnicodeString buf;
1289     for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1290         int32_t selector = DATA[i][0];
1291         UnicodeString pat(DATA[i+1], -1, US_INV);
1292         UnicodeString exp(DATA[i+2], -1, US_INV);
1293         s.applyPattern(pat, ec);
1294         s.closeOver(selector);
1295         t.applyPattern(exp, ec);
1296         if (U_FAILURE(ec)) {
1297             errln("FAIL: applyPattern failed");
1298             continue;
1299         }
1300         if (s == t) {
1301             logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1302         } else {
1303             dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1304                   s.toPattern(buf, TRUE) + ", expected " + exp);
1305         }
1306     }
1307
1308 #if 0
1309     /*
1310      * Unused test code.
1311      * This was used to compare the old implementation (using USET_CASE)
1312      * with the new one (using 0x100 temporarily)
1313      * while transitioning from hardcoded case closure tables in uniset.cpp
1314      * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1315      * and using ucase.c functions for closure.
1316      * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1317      *
1318      * Note: The old and new implementation never fully matched because
1319      * the old implementation turned out to not map U+0130 and U+0131 correctly
1320      * (dotted I and dotless i) and because the old implementation's data tables
1321      * were outdated compared to Unicode 4.0.1 at the time of the change to the
1322      * new implementation. (So sigmas and some other characters were not handled
1323      * according to the newer Unicode version.)
1324      */
1325     UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1326     UnicodeSetIterator si(sens);
1327     UnicodeString str, buf2;
1328     const UnicodeString *pStr;
1329     UChar32 c;
1330     while(si.next()) {
1331         if(!si.isString()) {
1332             c=si.getCodepoint();
1333             s.clear();
1334             s.add(c);
1335
1336             str.setTo(c);
1337             str.foldCase();
1338             sens2.add(str);
1339
1340             t=s;
1341             s.closeOver(USET_CASE);
1342             t.closeOver(0x100);
1343             if(s!=t) {
1344                 errln("FAIL: closeOver(U+%04x) differs: ", c);
1345                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1346             }
1347         }
1348     }
1349     // remove all code points
1350     // should contain all full case folding mapping strings
1351     sens2.remove(0, 0x10ffff);
1352     si.reset(sens2);
1353     while(si.next()) {
1354         if(si.isString()) {
1355             pStr=&si.getString();
1356             s.clear();
1357             s.add(*pStr);
1358             t=s2=s;
1359             s.closeOver(USET_CASE);
1360             t.closeOver(0x100);
1361             if(s!=t) {
1362                 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1363                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1364             }
1365         }
1366     }
1367 #endif
1368
1369     // Test the pattern API
1370     s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1371     if (U_FAILURE(ec)) {
1372         errln("FAIL: applyPattern failed");
1373     } else {
1374         expectContainment(s, "abcABC", "defDEF");
1375     }
1376     UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1377     if (U_FAILURE(ec)) {
1378         errln("FAIL: constructor failed");
1379     } else {
1380         expectContainment(v, "defDEF", "abcABC");
1381     }
1382     UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1383     if (U_FAILURE(ec)) {
1384         errln("FAIL: construct w/case mappings failed");
1385     } else {
1386         expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1387     }
1388 }
1389
1390 void UnicodeSetTest::TestEscapePattern() {
1391     const char pattern[] =
1392         "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1393     const char exp[] =
1394         "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1395     // We test this with two passes; in the second pass we
1396     // pre-unescape the pattern.  Since U+200E is Pattern_White_Space,
1397     // this fails -- which is what we expect.
1398     for (int32_t pass=1; pass<=2; ++pass) {
1399         UErrorCode ec = U_ZERO_ERROR;
1400         UnicodeString pat(pattern, -1, US_INV);
1401         if (pass==2) {
1402             pat = pat.unescape();
1403         }
1404         // Pattern is only good for pass 1
1405         UBool isPatternValid = (pass==1);
1406
1407         UnicodeSet set(pat, ec);
1408         if (U_SUCCESS(ec) != isPatternValid){
1409             errln((UnicodeString)"FAIL: applyPattern(" +
1410                   escape(pat) + ") => " +
1411                   u_errorName(ec));
1412             continue;
1413         }
1414         if (U_FAILURE(ec)) {
1415             continue;
1416         }
1417         if (set.contains((UChar)0x0644)){
1418             errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1419         }
1420
1421         UnicodeString newpat;
1422         set.toPattern(newpat, TRUE);
1423         if (newpat == UnicodeString(exp, -1, US_INV)) {
1424             logln(escape(pat) + " => " + newpat);
1425         } else {
1426             errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1427         }
1428
1429         for (int32_t i=0; i<set.getRangeCount(); ++i) {
1430             UnicodeString str("Range ");
1431             str.append((UChar)(0x30 + i))
1432                 .append(": ")
1433                 .append((UChar32)set.getRangeStart(i))
1434                 .append(" - ")
1435                 .append((UChar32)set.getRangeEnd(i));
1436             str = str + " (" + set.getRangeStart(i) + " - " +
1437                 set.getRangeEnd(i) + ")";
1438             if (set.getRangeStart(i) < 0) {
1439                 errln((UnicodeString)"FAIL: " + escape(str));
1440             } else {
1441                 logln(escape(str));
1442             }
1443         }
1444     }
1445 }
1446
1447 void UnicodeSetTest::expectRange(const UnicodeString& label,
1448                                  const UnicodeSet& set,
1449                                  UChar32 start, UChar32 end) {
1450     UnicodeSet exp(start, end);
1451     UnicodeString pat;
1452     if (set == exp) {
1453         logln(label + " => " + set.toPattern(pat, TRUE));
1454     } else {
1455         UnicodeString xpat;
1456         errln((UnicodeString)"FAIL: " + label + " => " +
1457               set.toPattern(pat, TRUE) +
1458               ", expected " + exp.toPattern(xpat, TRUE));
1459     }
1460 }
1461
1462 void UnicodeSetTest::TestInvalidCodePoint() {
1463
1464     const UChar32 DATA[] = {
1465         // Test range             Expected range
1466         0, 0x10FFFF,              0, 0x10FFFF,
1467         (UChar32)-1, 8,           0, 8,
1468         8, 0x110000,              8, 0x10FFFF
1469     };
1470     const int32_t DATA_LENGTH = UPRV_LENGTHOF(DATA);
1471
1472     UnicodeString pat;
1473     int32_t i;
1474
1475     for (i=0; i<DATA_LENGTH; i+=4) {
1476         UChar32 start  = DATA[i];
1477         UChar32 end    = DATA[i+1];
1478         UChar32 xstart = DATA[i+2];
1479         UChar32 xend   = DATA[i+3];
1480
1481         // Try various API using the test code points
1482
1483         UnicodeSet set(start, end);
1484         expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1485                     set, xstart, xend);
1486
1487         set.clear();
1488         set.set(start, end);
1489         expectRange((UnicodeString)"set(" + start + "," + end + ")",
1490                     set, xstart, xend);
1491
1492         UBool b = set.contains(start);
1493         b = set.contains(start, end);
1494         b = set.containsNone(start, end);
1495         b = set.containsSome(start, end);
1496         (void)b;   // Suppress set but not used warning.
1497
1498         /*int32_t index = set.indexOf(start);*/
1499
1500         set.clear();
1501         set.add(start);
1502         set.add(start, end);
1503         expectRange((UnicodeString)"add(" + start + "," + end + ")",
1504                     set, xstart, xend);
1505
1506         set.set(0, 0x10FFFF);
1507         set.retain(start, end);
1508         expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1509                     set, xstart, xend);
1510         set.retain(start);
1511
1512         set.set(0, 0x10FFFF);
1513         set.remove(start);
1514         set.remove(start, end);
1515         set.complement();
1516         expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1517                     set, xstart, xend);
1518
1519         set.set(0, 0x10FFFF);
1520         set.complement(start, end);
1521         set.complement();
1522         expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1523                     set, xstart, xend);
1524         set.complement(start);
1525     }
1526
1527     const UChar32 DATA2[] = {
1528         0,
1529         0x10FFFF,
1530         (UChar32)-1,
1531         0x110000
1532     };
1533     const int32_t DATA2_LENGTH = UPRV_LENGTHOF(DATA2);
1534
1535     for (i=0; i<DATA2_LENGTH; ++i) {
1536         UChar32 c = DATA2[i], end = 0x10FFFF;
1537         UBool valid = (c >= 0 && c <= 0x10FFFF);
1538
1539         UnicodeSet set(0, 0x10FFFF);
1540
1541         // For single-codepoint contains, invalid codepoints are NOT contained
1542         UBool b = set.contains(c);
1543         if (b == valid) {
1544             logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1545                   ") = " + b);
1546         } else {
1547             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1548                   ") = " + b);
1549         }
1550
1551         // For codepoint range contains, containsNone, and containsSome,
1552         // invalid or empty (start > end) ranges have UNDEFINED behavior.
1553         b = set.contains(c, end);
1554         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1555               "," + end + ") = " + b);
1556
1557         b = set.containsNone(c, end);
1558         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1559               "," + end + ") = " + b);
1560
1561         b = set.containsSome(c, end);
1562         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1563               "," + end + ") = " + b);
1564
1565         int32_t index = set.indexOf(c);
1566         if ((index >= 0) == valid) {
1567             logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1568                   ") = " + index);
1569         } else {
1570             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1571                   ") = " + index);
1572         }
1573     }
1574 }
1575
1576 // Used by TestSymbolTable
1577 class TokenSymbolTable : public SymbolTable {
1578 public:
1579     Hashtable contents;
1580
1581     TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1582         contents.setValueDeleter(uprv_deleteUObject);
1583     }
1584
1585     ~TokenSymbolTable() {}
1586
1587     /**
1588      * (Non-SymbolTable API) Add the given variable and value to
1589      * the table.  Variable should NOT contain leading '$'.
1590      */
1591     void add(const UnicodeString& var, const UnicodeString& value,
1592              UErrorCode& ec) {
1593         if (U_SUCCESS(ec)) {
1594             contents.put(var, new UnicodeString(value), ec);
1595         }
1596     }
1597
1598     /**
1599      * SymbolTable API
1600      */
1601     virtual const UnicodeString* lookup(const UnicodeString& s) const {
1602         return (const UnicodeString*) contents.get(s);
1603     }
1604
1605     /**
1606      * SymbolTable API
1607      */
1608     virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1609         return NULL;
1610     }
1611
1612     /**
1613      * SymbolTable API
1614      */
1615     virtual UnicodeString parseReference(const UnicodeString& text,
1616                                          ParsePosition& pos, int32_t limit) const {
1617         int32_t start = pos.getIndex();
1618         int32_t i = start;
1619         UnicodeString result;
1620         while (i < limit) {
1621             UChar c = text.charAt(i);
1622             if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1623                 break;
1624             }
1625             ++i;
1626         }
1627         if (i == start) { // No valid name chars
1628             return result; // Indicate failure with empty string
1629         }
1630         pos.setIndex(i);
1631         text.extractBetween(start, i, result);
1632         return result;
1633     }
1634 };
1635
1636 void UnicodeSetTest::TestSymbolTable() {
1637     // Multiple test cases can be set up here.  Each test case
1638     // is terminated by null:
1639     // var, value, var, value,..., input pat., exp. output pat., null
1640     const char* DATA[] = {
1641         "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1642         "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1643         "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1644         NULL
1645     };
1646
1647     for (int32_t i=0; DATA[i]!=NULL; ++i) {
1648         UErrorCode ec = U_ZERO_ERROR;
1649         TokenSymbolTable sym(ec);
1650         if (U_FAILURE(ec)) {
1651             errln("FAIL: couldn't construct TokenSymbolTable");
1652             continue;
1653         }
1654
1655         // Set up variables
1656         while (DATA[i+2] != NULL) {
1657             sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
1658             if (U_FAILURE(ec)) {
1659                 errln("FAIL: couldn't add to TokenSymbolTable");
1660                 continue;
1661             }
1662             i += 2;
1663         }
1664
1665         // Input pattern and expected output pattern
1666         UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
1667         i += 2;
1668
1669         ParsePosition pos(0);
1670         UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1671         if (U_FAILURE(ec)) {
1672             errln("FAIL: couldn't construct UnicodeSet");
1673             continue;
1674         }
1675
1676         // results
1677         if (pos.getIndex() != inpat.length()) {
1678             errln((UnicodeString)"Failed to read to end of string \""
1679                   + inpat + "\": read to "
1680                   + pos.getIndex() + ", length is "
1681                   + inpat.length());
1682         }
1683
1684         UnicodeSet us2(exppat, ec);
1685         if (U_FAILURE(ec)) {
1686             errln("FAIL: couldn't construct expected UnicodeSet");
1687             continue;
1688         }
1689
1690         UnicodeString a, b;
1691         if (us != us2) {
1692             errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1693                   ", expected " + us2.toPattern(b, TRUE));
1694         } else {
1695             logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1696         }
1697     }
1698 }
1699
1700 void UnicodeSetTest::TestSurrogate() {
1701     const char* DATA[] = {
1702         // These should all behave identically
1703         "[abc\\uD800\\uDC00]",
1704         // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1705         "[abc\\U00010000]",
1706         0
1707     };
1708     for (int i=0; DATA[i] != 0; ++i) {
1709         UErrorCode ec = U_ZERO_ERROR;
1710         logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
1711         UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
1712         UnicodeSet set(str, ec);
1713         if (U_FAILURE(ec)) {
1714             errln("FAIL: UnicodeSet constructor");
1715             continue;
1716         }
1717         expectContainment(set,
1718                           CharsToUnicodeString("abc\\U00010000"),
1719                           CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1720         if (set.size() != 4) {
1721             errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
1722                   set.size() + ", expected 4");
1723         }
1724
1725         {
1726           UErrorCode subErr = U_ZERO_ERROR;
1727           checkRoundTrip(set);
1728           checkSerializeRoundTrip(set, subErr);
1729         }
1730     }
1731 }
1732
1733 void UnicodeSetTest::TestExhaustive() {
1734     // exhaustive tests. Simulate UnicodeSets with integers.
1735     // That gives us very solid tests (except for large memory tests).
1736
1737     int32_t limit = 128;
1738
1739     UnicodeSet x, y, z, aa;
1740
1741     for (int32_t i = 0; i < limit; ++i) {
1742         bitsToSet(i, x);
1743         logln((UnicodeString)"Testing " + i + ", " + x);
1744         _testComplement(i, x, y);
1745
1746         UnicodeSet &toTest = bitsToSet(i, aa);
1747
1748         // AS LONG AS WE ARE HERE, check roundtrip
1749         checkRoundTrip(toTest);
1750         UErrorCode ec = U_ZERO_ERROR;
1751         checkSerializeRoundTrip(toTest, ec);
1752
1753         for (int32_t j = 0; j < limit; ++j) {
1754             _testAdd(i,j,  x,y,z);
1755             _testXor(i,j,  x,y,z);
1756             _testRetain(i,j,  x,y,z);
1757             _testRemove(i,j,  x,y,z);
1758         }
1759     }
1760 }
1761
1762 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1763     bitsToSet(a, x);
1764     z = x;
1765     z.complement();
1766     int32_t c = setToBits(z);
1767     if (c != (~a)) {
1768         errln((UnicodeString)"FAILED: add: ~" + x +  " != " + z);
1769         errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1770     }
1771     checkCanonicalRep(z, (UnicodeString)"complement " + a);
1772 }
1773
1774 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1775     bitsToSet(a, x);
1776     bitsToSet(b, y);
1777     z = x;
1778     z.addAll(y);
1779     int32_t c = setToBits(z);
1780     if (c != (a | b)) {
1781         errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1782         errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1783     }
1784     checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1785 }
1786
1787 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1788     bitsToSet(a, x);
1789     bitsToSet(b, y);
1790     z = x;
1791     z.retainAll(y);
1792     int32_t c = setToBits(z);
1793     if (c != (a & b)) {
1794         errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1795         errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1796     }
1797     checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1798 }
1799
1800 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1801     bitsToSet(a, x);
1802     bitsToSet(b, y);
1803     z = x;
1804     z.removeAll(y);
1805     int32_t c = setToBits(z);
1806     if (c != (a &~ b)) {
1807         errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1808         errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1809     }
1810     checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1811 }
1812
1813 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1814     bitsToSet(a, x);
1815     bitsToSet(b, y);
1816     z = x;
1817     z.complementAll(y);
1818     int32_t c = setToBits(z);
1819     if (c != (a ^ b)) {
1820         errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1821         errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1822     }
1823     checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1824 }
1825
1826 /**
1827  * Check that ranges are monotonically increasing and non-
1828  * overlapping.
1829  */
1830 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1831     int32_t n = set.getRangeCount();
1832     if (n < 0) {
1833         errln((UnicodeString)"FAIL result of " + msg +
1834               ": range count should be >= 0 but is " +
1835               n /*+ " for " + set.toPattern())*/);
1836         return;
1837     }
1838     UChar32 last = 0;
1839     for (int32_t i=0; i<n; ++i) {
1840         UChar32 start = set.getRangeStart(i);
1841         UChar32 end = set.getRangeEnd(i);
1842         if (start > end) {
1843             errln((UnicodeString)"FAIL result of " + msg +
1844                   ": range " + (i+1) +
1845                   " start > end: " + (int)start + ", " + (int)end +
1846                   " for " + set);
1847         }
1848         if (i > 0 && start <= last) {
1849             errln((UnicodeString)"FAIL result of " + msg +
1850                   ": range " + (i+1) +
1851                   " overlaps previous range: " + (int)start + ", " + (int)end +
1852                   " for " + set);
1853         }
1854         last = end;
1855     }
1856 }
1857
1858 /**
1859  * Convert a bitmask to a UnicodeSet.
1860  */
1861 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1862     result.clear();
1863     for (UChar32 i = 0; i < 32; ++i) {
1864         if ((a & (1<<i)) != 0) {
1865             result.add(i);
1866         }
1867     }
1868     return result;
1869 }
1870
1871 /**
1872  * Convert a UnicodeSet to a bitmask.  Only the characters
1873  * U+0000 to U+0020 are represented in the bitmask.
1874  */
1875 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1876     int32_t result = 0;
1877     for (int32_t i = 0; i < 32; ++i) {
1878         if (x.contains((UChar32)i)) {
1879             result |= (1<<i);
1880         }
1881     }
1882     return result;
1883 }
1884
1885 /**
1886  * Return the representation of an inversion list based UnicodeSet
1887  * as a pairs list.  Ranges are listed in ascending Unicode order.
1888  * For example, the set [a-zA-M3] is represented as "33AMaz".
1889  */
1890 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1891     UnicodeString pairs;
1892     for (int32_t i=0; i<set.getRangeCount(); ++i) {
1893         UChar32 start = set.getRangeStart(i);
1894         UChar32 end = set.getRangeEnd(i);
1895         if (end > 0xFFFF) {
1896             end = 0xFFFF;
1897             i = set.getRangeCount(); // Should be unnecessary
1898         }
1899         pairs.append((UChar)start).append((UChar)end);
1900     }
1901     return pairs;
1902 }
1903
1904 /**
1905  * Basic consistency check for a few items.
1906  * That the iterator works, and that we can create a pattern and
1907  * get the same thing back
1908  */
1909 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1910     {
1911         UnicodeSet t(s);
1912         checkEqual(s, t, "copy ct");
1913     }
1914
1915     {
1916         UnicodeSet t(0xabcd, 0xdef0);  // dummy contents should be overwritten
1917         t = s;
1918         checkEqual(s, t, "operator=");
1919     }
1920
1921     {
1922         UnicodeSet t;
1923         copyWithIterator(t, s, FALSE);
1924         checkEqual(s, t, "iterator roundtrip");
1925     }
1926
1927     {
1928         UnicodeSet t;
1929         copyWithIterator(t, s, TRUE); // try range
1930         checkEqual(s, t, "iterator roundtrip");
1931     }
1932
1933     {
1934         UnicodeSet t;
1935         UnicodeString pat;
1936         UErrorCode ec = U_ZERO_ERROR;
1937         s.toPattern(pat, FALSE);
1938         t.applyPattern(pat, ec);
1939         if (U_FAILURE(ec)) {
1940             errln("FAIL: toPattern(escapeUnprintable=FALSE), applyPattern - %s", u_errorName(ec));
1941             return;
1942         } else {
1943             checkEqual(s, t, "toPattern(false)");
1944         }
1945     }
1946
1947     {
1948         UnicodeSet t;
1949         UnicodeString pat;
1950         UErrorCode ec = U_ZERO_ERROR;
1951         s.toPattern(pat, TRUE);
1952         t.applyPattern(pat, ec);
1953         if (U_FAILURE(ec)) {
1954             errln("FAIL: toPattern(escapeUnprintable=TRUE), applyPattern - %s", u_errorName(ec));
1955             return;
1956         } else {
1957             checkEqual(s, t, "toPattern(true)");
1958         }
1959     }
1960 }
1961
1962 void UnicodeSetTest::checkSerializeRoundTrip(const UnicodeSet& t, UErrorCode &status) {
1963   if(U_FAILURE(status)) return;
1964   int32_t len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
1965   if(status == U_BUFFER_OVERFLOW_ERROR) {
1966     status = U_ZERO_ERROR;
1967     serializeBuffer.resize(len);
1968     len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
1969     // let 2nd error stand
1970   }
1971   if(U_FAILURE(status)) {
1972     errln("checkSerializeRoundTrip: error %s serializing buffer\n", u_errorName(status));
1973     return;
1974   }
1975   UnicodeSet deserialized(serializeBuffer.getAlias(), len, UnicodeSet::kSerialized, status);
1976   if(U_FAILURE(status)) {
1977     errln("checkSerializeRoundTrip: error %s deserializing buffer: buf %p len %d, original %d\n", u_errorName(status), serializeBuffer.getAlias(), len, t.getRangeCount());
1978     return;
1979   }
1980
1981   checkEqual(t, deserialized, "Set was unequal when deserialized");
1982 }
1983
1984 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
1985     t.clear();
1986     UnicodeSetIterator it(s);
1987     if (withRange) {
1988         while (it.nextRange()) {
1989             if (it.isString()) {
1990                 t.add(it.getString());
1991             } else {
1992                 t.add(it.getCodepoint(), it.getCodepointEnd());
1993             }
1994         }
1995     } else {
1996         while (it.next()) {
1997             if (it.isString()) {
1998                 t.add(it.getString());
1999             } else {
2000                 t.add(it.getCodepoint());
2001             }
2002         }
2003     }
2004 }
2005
2006 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
2007   assertEquals(UnicodeString("RangeCount: ","") + message, s.getRangeCount(), t.getRangeCount());
2008   assertEquals(UnicodeString("size: ","") + message, s.size(), t.size());
2009     UnicodeString source; s.toPattern(source, TRUE);
2010     UnicodeString result; t.toPattern(result, TRUE);
2011     if (s != t) {
2012         errln((UnicodeString)"FAIL: " + message
2013               + "; source = " + source
2014               + "; result = " + result
2015               );
2016         return FALSE;
2017     } else {
2018         logln((UnicodeString)"Ok: " + message
2019               + "; source = " + source
2020               + "; result = " + result
2021               );
2022     }
2023     return TRUE;
2024 }
2025
2026 void
2027 UnicodeSetTest::expectContainment(const UnicodeString& pat,
2028                                   const UnicodeString& charsIn,
2029                                   const UnicodeString& charsOut) {
2030     UErrorCode ec = U_ZERO_ERROR;
2031     UnicodeSet set(pat, ec);
2032     if (U_FAILURE(ec)) {
2033         dataerrln((UnicodeString)"FAIL: pattern \"" +
2034               pat + "\" => " + u_errorName(ec));
2035         return;
2036     }
2037     expectContainment(set, pat, charsIn, charsOut);
2038 }
2039
2040 void
2041 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2042                                   const UnicodeString& charsIn,
2043                                   const UnicodeString& charsOut) {
2044     UnicodeString pat;
2045     set.toPattern(pat);
2046     expectContainment(set, pat, charsIn, charsOut);
2047 }
2048
2049 void
2050 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2051                                   const UnicodeString& setName,
2052                                   const UnicodeString& charsIn,
2053                                   const UnicodeString& charsOut) {
2054     UnicodeString bad;
2055     UChar32 c;
2056     int32_t i;
2057
2058     for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
2059         c = charsIn.char32At(i);
2060         if (!set.contains(c)) {
2061             bad.append(c);
2062         }
2063     }
2064     if (bad.length() > 0) {
2065         errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
2066               ", expected containment of " + prettify(charsIn));
2067     } else {
2068         logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
2069     }
2070
2071     bad.truncate(0);
2072     for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
2073         c = charsOut.char32At(i);
2074         if (set.contains(c)) {
2075             bad.append(c);
2076         }
2077     }
2078     if (bad.length() > 0) {
2079         errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
2080               ", expected non-containment of " + prettify(charsOut));
2081     } else {
2082         logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
2083     }
2084 }
2085
2086 void
2087 UnicodeSetTest::expectPattern(UnicodeSet& set,
2088                               const UnicodeString& pattern,
2089                               const UnicodeString& expectedPairs){
2090     UErrorCode status = U_ZERO_ERROR;
2091     set.applyPattern(pattern, status);
2092     if (U_FAILURE(status)) {
2093         errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2094               "\") failed");
2095         return;
2096     } else {
2097         if (getPairs(set) != expectedPairs ) {
2098             errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2099                   "\") => pairs \"" +
2100                   escape(getPairs(set)) + "\", expected \"" +
2101                   escape(expectedPairs) + "\"");
2102         } else {
2103             logln(UnicodeString("Ok:   applyPattern(\"") + pattern +
2104                   "\") => pairs \"" +
2105                   escape(getPairs(set)) + "\"");
2106         }
2107     }
2108     // the result of calling set.toPattern(), which is the string representation of
2109     // this set(set), is passed to a  UnicodeSet constructor, and tested that it
2110     // will produce another set that is equal to this one.
2111     UnicodeString temppattern;
2112     set.toPattern(temppattern);
2113     UnicodeSet *tempset=new UnicodeSet(temppattern, status);
2114     if (U_FAILURE(status)) {
2115         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
2116         return;
2117     }
2118     if(*tempset != set || getPairs(*tempset) != getPairs(set)){
2119         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
2120             escape(getPairs(set)) + "\""));
2121     } else{
2122         logln(UnicodeString("Ok:   applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2123     }
2124
2125     delete tempset;
2126
2127 }
2128
2129 void
2130 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2131     if (getPairs(set) != expectedPairs) {
2132         errln(UnicodeString("FAIL: Expected pair list \"") +
2133               escape(expectedPairs) + "\", got \"" +
2134               escape(getPairs(set)) + "\"");
2135     }
2136 }
2137
2138 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2139                                      const UnicodeString& expPat,
2140                                      const char** expStrings) {
2141     UnicodeString pat;
2142     set.toPattern(pat, TRUE);
2143     if (pat == expPat) {
2144         logln((UnicodeString)"Ok:   toPattern() => \"" + pat + "\"");
2145     } else {
2146         errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2147         return;
2148     }
2149     if (expStrings == NULL) {
2150         return;
2151     }
2152     UBool in = TRUE;
2153     for (int32_t i=0; expStrings[i] != NULL; ++i) {
2154         if (expStrings[i] == NOT) { // sic; pointer comparison
2155             in = FALSE;
2156             continue;
2157         }
2158         UnicodeString s = CharsToUnicodeString(expStrings[i]);
2159         UBool contained = set.contains(s);
2160         if (contained == in) {
2161             logln((UnicodeString)"Ok: " + expPat +
2162                   (contained ? " contains {" : " does not contain {") +
2163                   escape(expStrings[i]) + "}");
2164         } else {
2165             errln((UnicodeString)"FAIL: " + expPat +
2166                   (contained ? " contains {" : " does not contain {") +
2167                   escape(expStrings[i]) + "}");
2168         }
2169     }
2170 }
2171
2172 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
2173
2174 void
2175 UnicodeSetTest::doAssert(UBool condition, const char *message)
2176 {
2177     if (!condition) {
2178         errln(UnicodeString("ERROR : ") + message);
2179     }
2180 }
2181
2182 UnicodeString
2183 UnicodeSetTest::escape(const UnicodeString& s) {
2184     UnicodeString buf;
2185     for (int32_t i=0; i<s.length(); )
2186     {
2187         UChar32 c = s.char32At(i);
2188         if (0x0020 <= c && c <= 0x007F) {
2189             buf += c;
2190         } else {
2191             if (c <= 0xFFFF) {
2192                 buf += (UChar)0x5c; buf += (UChar)0x75;
2193             } else {
2194                 buf += (UChar)0x5c; buf += (UChar)0x55;
2195                 buf += toHexString((c & 0xF0000000) >> 28);
2196                 buf += toHexString((c & 0x0F000000) >> 24);
2197                 buf += toHexString((c & 0x00F00000) >> 20);
2198                 buf += toHexString((c & 0x000F0000) >> 16);
2199             }
2200             buf += toHexString((c & 0xF000) >> 12);
2201             buf += toHexString((c & 0x0F00) >> 8);
2202             buf += toHexString((c & 0x00F0) >> 4);
2203             buf += toHexString(c & 0x000F);
2204         }
2205         i += U16_LENGTH(c);
2206     }
2207     return buf;
2208 }
2209
2210 void UnicodeSetTest::TestFreezable() {
2211     UErrorCode errorCode=U_ZERO_ERROR;
2212     UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
2213     UnicodeSet idSet(idPattern, errorCode);
2214     if(U_FAILURE(errorCode)) {
2215         dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
2216         return;
2217     }
2218
2219     UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
2220     UnicodeSet wsSet(wsPattern, errorCode);
2221     if(U_FAILURE(errorCode)) {
2222         dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
2223         return;
2224     }
2225
2226     idSet.add(idPattern);
2227     UnicodeSet frozen(idSet);
2228     frozen.freeze();
2229
2230     if(idSet.isFrozen() || !frozen.isFrozen()) {
2231         errln("FAIL: isFrozen() is wrong");
2232     }
2233     if(frozen!=idSet || !(frozen==idSet)) {
2234         errln("FAIL: a copy-constructed frozen set differs from its original");
2235     }
2236
2237     frozen=wsSet;
2238     if(frozen!=idSet || !(frozen==idSet)) {
2239         errln("FAIL: a frozen set was modified by operator=");
2240     }
2241
2242     UnicodeSet frozen2(frozen);
2243     if(frozen2!=frozen || frozen2!=idSet) {
2244         errln("FAIL: a copied frozen set differs from its frozen original");
2245     }
2246     if(!frozen2.isFrozen()) {
2247         errln("FAIL: copy-constructing a frozen set results in a thawed one");
2248     }
2249     UnicodeSet frozen3(5, 55);  // Set to some values to really test assignment below, not copy construction.
2250     if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
2251         errln("FAIL: UnicodeSet(5, 55) failed");
2252     }
2253     frozen3=frozen;
2254     if(!frozen3.isFrozen()) {
2255         errln("FAIL: copying a frozen set results in a thawed one");
2256     }
2257
2258     UnicodeSet *cloned=frozen.clone();
2259     if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
2260         errln("FAIL: clone() failed");
2261     }
2262     cloned->add(0xd802, 0xd805);
2263     if(cloned->containsSome(0xd802, 0xd805)) {
2264         errln("FAIL: unable to modify clone");
2265     }
2266     delete cloned;
2267
2268     UnicodeSet *thawed=frozen.cloneAsThawed();
2269     if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
2270         errln("FAIL: cloneAsThawed() failed");
2271     }
2272     thawed->add(0xd802, 0xd805);
2273     if(!thawed->contains(0xd802, 0xd805)) {
2274         errln("FAIL: unable to modify thawed clone");
2275     }
2276     delete thawed;
2277
2278     frozen.set(5, 55);
2279     if(frozen!=idSet || !(frozen==idSet)) {
2280         errln("FAIL: UnicodeSet::set() modified a frozen set");
2281     }
2282
2283     frozen.clear();
2284     if(frozen!=idSet || !(frozen==idSet)) {
2285         errln("FAIL: UnicodeSet::clear() modified a frozen set");
2286     }
2287
2288     frozen.closeOver(USET_CASE_INSENSITIVE);
2289     if(frozen!=idSet || !(frozen==idSet)) {
2290         errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
2291     }
2292
2293     frozen.compact();
2294     if(frozen!=idSet || !(frozen==idSet)) {
2295         errln("FAIL: UnicodeSet::compact() modified a frozen set");
2296     }
2297
2298     ParsePosition pos;
2299     frozen.
2300         applyPattern(wsPattern, errorCode).
2301         applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
2302         applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
2303         applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
2304         applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
2305     if(frozen!=idSet || !(frozen==idSet)) {
2306         errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
2307     }
2308
2309     frozen.
2310         add(0xd800).
2311         add(0xd802, 0xd805).
2312         add(wsPattern).
2313         addAll(idPattern).
2314         addAll(wsSet);
2315     if(frozen!=idSet || !(frozen==idSet)) {
2316         errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
2317     }
2318
2319     frozen.
2320         retain(0x62).
2321         retain(0x64, 0x69).
2322         retainAll(wsPattern).
2323         retainAll(wsSet);
2324     if(frozen!=idSet || !(frozen==idSet)) {
2325         errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
2326     }
2327
2328     frozen.
2329         remove(0x62).
2330         remove(0x64, 0x69).
2331         remove(idPattern).
2332         removeAll(idPattern).
2333         removeAll(idSet);
2334     if(frozen!=idSet || !(frozen==idSet)) {
2335         errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
2336     }
2337
2338     frozen.
2339         complement().
2340         complement(0x62).
2341         complement(0x64, 0x69).
2342         complement(idPattern).
2343         complementAll(idPattern).
2344         complementAll(idSet);
2345     if(frozen!=idSet || !(frozen==idSet)) {
2346         errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
2347     }
2348 }
2349
2350 // Test span() etc. -------------------------------------------------------- ***
2351
2352 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2353 static int32_t
2354 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
2355     UErrorCode errorCode=U_ZERO_ERROR;
2356     int32_t length8=0;
2357     u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2358     if(U_SUCCESS(errorCode)) {
2359         return length8;
2360     } else {
2361         // The string contains an unpaired surrogate.
2362         // Ignore this string.
2363         return 0;
2364     }
2365 }
2366
2367 class UnicodeSetWithStringsIterator;
2368
2369 // Make the strings in a UnicodeSet easily accessible.
2370 class UnicodeSetWithStrings {
2371 public:
2372     UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2373             set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
2374         int32_t size=set.size();
2375         if(size>0 && set.charAt(size-1)<0) {
2376             // If a set's last element is not a code point, then it must contain strings.
2377             // Iterate over the set, skip all code point ranges, and cache the strings.
2378             // Convert them to UTF-8 for spanUTF8().
2379             UnicodeSetIterator iter(set);
2380             const UnicodeString *s;
2381             char *s8=utf8;
2382             int32_t length8, utf8Count=0;
2383             while(iter.nextRange() && stringsLength<UPRV_LENGTHOF(strings)) {
2384                 if(iter.isString()) {
2385                     // Store the pointer to the set's string element
2386                     // which we happen to know is a stable pointer.
2387                     strings[stringsLength]=s=&iter.getString();
2388                     utf8Count+=
2389                         utf8Lengths[stringsLength]=length8=
2390                         appendUTF8(s->getBuffer(), s->length(),
2391                                    s8, (int32_t)(sizeof(utf8)-utf8Count));
2392                     if(length8==0) {
2393                         hasSurrogates=TRUE;  // Contains unpaired surrogates.
2394                     }
2395                     s8+=length8;
2396                     ++stringsLength;
2397                 }
2398             }
2399         }
2400     }
2401
2402     const UnicodeSet &getSet() const {
2403         return set;
2404     }
2405
2406     UBool hasStrings() const {
2407         return (UBool)(stringsLength>0);
2408     }
2409
2410     UBool hasStringsWithSurrogates() const {
2411         return hasSurrogates;
2412     }
2413
2414 private:
2415     friend class UnicodeSetWithStringsIterator;
2416
2417     const UnicodeSet &set;
2418
2419     const UnicodeString *strings[20];
2420     int32_t stringsLength;
2421     UBool hasSurrogates;
2422
2423     char utf8[1024];
2424     int32_t utf8Lengths[20];
2425 };
2426
2427 class UnicodeSetWithStringsIterator {
2428 public:
2429     UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2430             fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2431     }
2432
2433     void reset() {
2434         nextStringIndex=nextUTF8Start=0;
2435     }
2436
2437     const UnicodeString *nextString() {
2438         if(nextStringIndex<fSet.stringsLength) {
2439             return fSet.strings[nextStringIndex++];
2440         } else {
2441             return NULL;
2442         }
2443     }
2444
2445     // Do not mix with calls to nextString().
2446     const char *nextUTF8(int32_t &length) {
2447         if(nextStringIndex<fSet.stringsLength) {
2448             const char *s8=fSet.utf8+nextUTF8Start;
2449             nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2450             return s8;
2451         } else {
2452             length=0;
2453             return NULL;
2454         }
2455     }
2456
2457 private:
2458     const UnicodeSetWithStrings &fSet;
2459     int32_t nextStringIndex;
2460     int32_t nextUTF8Start;
2461 };
2462
2463 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
2464 // at code point boundaries.
2465 // That is, each edge of a match must not be in the middle of a surrogate pair.
2466 static inline UBool
2467 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
2468     s+=start;
2469     limit-=start;
2470     int32_t length=t.length();
2471     return 0==t.compare(s, length) &&
2472            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2473            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2474 }
2475
2476 // Implement span() with contains() for comparison.
2477 static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2478                                  USetSpanCondition spanCondition) {
2479     const UnicodeSet &realSet(set.getSet());
2480     if(!set.hasStrings()) {
2481         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2482             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2483         }
2484
2485         UChar32 c;
2486         int32_t start=0, prev;
2487         while((prev=start)<length) {
2488             U16_NEXT(s, start, length, c);
2489             if(realSet.contains(c)!=spanCondition) {
2490                 break;
2491             }
2492         }
2493         return prev;
2494     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2495         UnicodeSetWithStringsIterator iter(set);
2496         UChar32 c;
2497         int32_t start, next;
2498         for(start=next=0; start<length;) {
2499             U16_NEXT(s, next, length, c);
2500             if(realSet.contains(c)) {
2501                 break;
2502             }
2503             const UnicodeString *str;
2504             iter.reset();
2505             while((str=iter.nextString())!=NULL) {
2506                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2507                     // spanNeedsStrings=TRUE;
2508                     return start;
2509                 }
2510             }
2511             start=next;
2512         }
2513         return start;
2514     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2515         UnicodeSetWithStringsIterator iter(set);
2516         UChar32 c;
2517         int32_t start, next, maxSpanLimit=0;
2518         for(start=next=0; start<length;) {
2519             U16_NEXT(s, next, length, c);
2520             if(!realSet.contains(c)) {
2521                 next=start;  // Do not span this single, not-contained code point.
2522             }
2523             const UnicodeString *str;
2524             iter.reset();
2525             while((str=iter.nextString())!=NULL) {
2526                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2527                     // spanNeedsStrings=TRUE;
2528                     int32_t matchLimit=start+str->length();
2529                     if(matchLimit==length) {
2530                         return length;
2531                     }
2532                     if(spanCondition==USET_SPAN_CONTAINED) {
2533                         // Iterate for the shortest match at each position.
2534                         // Recurse for each but the shortest match.
2535                         if(next==start) {
2536                             next=matchLimit;  // First match from start.
2537                         } else {
2538                             if(matchLimit<next) {
2539                                 // Remember shortest match from start for iteration.
2540                                 int32_t temp=next;
2541                                 next=matchLimit;
2542                                 matchLimit=temp;
2543                             }
2544                             // Recurse for non-shortest match from start.
2545                             int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
2546                                                                  USET_SPAN_CONTAINED);
2547                             if((matchLimit+spanLength)>maxSpanLimit) {
2548                                 maxSpanLimit=matchLimit+spanLength;
2549                                 if(maxSpanLimit==length) {
2550                                     return length;
2551                                 }
2552                             }
2553                         }
2554                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2555                         if(matchLimit>next) {
2556                             // Remember longest match from start.
2557                             next=matchLimit;
2558                         }
2559                     }
2560                 }
2561             }
2562             if(next==start) {
2563                 break;  // No match from start.
2564             }
2565             start=next;
2566         }
2567         if(start>maxSpanLimit) {
2568             return start;
2569         } else {
2570             return maxSpanLimit;
2571         }
2572     }
2573 }
2574
2575 static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2576                                      USetSpanCondition spanCondition) {
2577     if(length==0) {
2578         return 0;
2579     }
2580     const UnicodeSet &realSet(set.getSet());
2581     if(!set.hasStrings()) {
2582         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2583             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2584         }
2585
2586         UChar32 c;
2587         int32_t prev=length;
2588         do {
2589             U16_PREV(s, 0, length, c);
2590             if(realSet.contains(c)!=spanCondition) {
2591                 break;
2592             }
2593         } while((prev=length)>0);
2594         return prev;
2595     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2596         UnicodeSetWithStringsIterator iter(set);
2597         UChar32 c;
2598         int32_t prev=length, length0=length;
2599         do {
2600             U16_PREV(s, 0, length, c);
2601             if(realSet.contains(c)) {
2602                 break;
2603             }
2604             const UnicodeString *str;
2605             iter.reset();
2606             while((str=iter.nextString())!=NULL) {
2607                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2608                     // spanNeedsStrings=TRUE;
2609                     return prev;
2610                 }
2611             }
2612         } while((prev=length)>0);
2613         return prev;
2614     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2615         UnicodeSetWithStringsIterator iter(set);
2616         UChar32 c;
2617         int32_t prev=length, minSpanStart=length, length0=length;
2618         do {
2619             U16_PREV(s, 0, length, c);
2620             if(!realSet.contains(c)) {
2621                 length=prev;  // Do not span this single, not-contained code point.
2622             }
2623             const UnicodeString *str;
2624             iter.reset();
2625             while((str=iter.nextString())!=NULL) {
2626                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2627                     // spanNeedsStrings=TRUE;
2628                     int32_t matchStart=prev-str->length();
2629                     if(matchStart==0) {
2630                         return 0;
2631                     }
2632                     if(spanCondition==USET_SPAN_CONTAINED) {
2633                         // Iterate for the shortest match at each position.
2634                         // Recurse for each but the shortest match.
2635                         if(length==prev) {
2636                             length=matchStart;  // First match from prev.
2637                         } else {
2638                             if(matchStart>length) {
2639                                 // Remember shortest match from prev for iteration.
2640                                 int32_t temp=length;
2641                                 length=matchStart;
2642                                 matchStart=temp;
2643                             }
2644                             // Recurse for non-shortest match from prev.
2645                             int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
2646                                                                     USET_SPAN_CONTAINED);
2647                             if(spanStart<minSpanStart) {
2648                                 minSpanStart=spanStart;
2649                                 if(minSpanStart==0) {
2650                                     return 0;
2651                                 }
2652                             }
2653                         }
2654                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2655                         if(matchStart<length) {
2656                             // Remember longest match from prev.
2657                             length=matchStart;
2658                         }
2659                     }
2660                 }
2661             }
2662             if(length==prev) {
2663                 break;  // No match from prev.
2664             }
2665         } while((prev=length)>0);
2666         if(prev<minSpanStart) {
2667             return prev;
2668         } else {
2669             return minSpanStart;
2670         }
2671     }
2672 }
2673
2674 static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2675                                 USetSpanCondition spanCondition) {
2676     const UnicodeSet &realSet(set.getSet());
2677     if(!set.hasStrings()) {
2678         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2679             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2680         }
2681
2682         UChar32 c;
2683         int32_t start=0, prev;
2684         while((prev=start)<length) {
2685             U8_NEXT_OR_FFFD(s, start, length, c);
2686             if(realSet.contains(c)!=spanCondition) {
2687                 break;
2688             }
2689         }
2690         return prev;
2691     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2692         UnicodeSetWithStringsIterator iter(set);
2693         UChar32 c;
2694         int32_t start, next;
2695         for(start=next=0; start<length;) {
2696             U8_NEXT_OR_FFFD(s, next, length, c);
2697             if(realSet.contains(c)) {
2698                 break;
2699             }
2700             const char *s8;
2701             int32_t length8;
2702             iter.reset();
2703             while((s8=iter.nextUTF8(length8))!=NULL) {
2704                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2705                     // spanNeedsStrings=TRUE;
2706                     return start;
2707                 }
2708             }
2709             start=next;
2710         }
2711         return start;
2712     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2713         UnicodeSetWithStringsIterator iter(set);
2714         UChar32 c;
2715         int32_t start, next, maxSpanLimit=0;
2716         for(start=next=0; start<length;) {
2717             U8_NEXT_OR_FFFD(s, next, length, c);
2718             if(!realSet.contains(c)) {
2719                 next=start;  // Do not span this single, not-contained code point.
2720             }
2721             const char *s8;
2722             int32_t length8;
2723             iter.reset();
2724             while((s8=iter.nextUTF8(length8))!=NULL) {
2725                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2726                     // spanNeedsStrings=TRUE;
2727                     int32_t matchLimit=start+length8;
2728                     if(matchLimit==length) {
2729                         return length;
2730                     }
2731                     if(spanCondition==USET_SPAN_CONTAINED) {
2732                         // Iterate for the shortest match at each position.
2733                         // Recurse for each but the shortest match.
2734                         if(next==start) {
2735                             next=matchLimit;  // First match from start.
2736                         } else {
2737                             if(matchLimit<next) {
2738                                 // Remember shortest match from start for iteration.
2739                                 int32_t temp=next;
2740                                 next=matchLimit;
2741                                 matchLimit=temp;
2742                             }
2743                             // Recurse for non-shortest match from start.
2744                             int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
2745                                                                 USET_SPAN_CONTAINED);
2746                             if((matchLimit+spanLength)>maxSpanLimit) {
2747                                 maxSpanLimit=matchLimit+spanLength;
2748                                 if(maxSpanLimit==length) {
2749                                     return length;
2750                                 }
2751                             }
2752                         }
2753                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2754                         if(matchLimit>next) {
2755                             // Remember longest match from start.
2756                             next=matchLimit;
2757                         }
2758                     }
2759                 }
2760             }
2761             if(next==start) {
2762                 break;  // No match from start.
2763             }
2764             start=next;
2765         }
2766         if(start>maxSpanLimit) {
2767             return start;
2768         } else {
2769             return maxSpanLimit;
2770         }
2771     }
2772 }
2773
2774 static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2775                                     USetSpanCondition spanCondition) {
2776     if(length==0) {
2777         return 0;
2778     }
2779     const UnicodeSet &realSet(set.getSet());
2780     if(!set.hasStrings()) {
2781         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2782             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2783         }
2784
2785         UChar32 c;
2786         int32_t prev=length;
2787         do {
2788             U8_PREV_OR_FFFD(s, 0, length, c);
2789             if(realSet.contains(c)!=spanCondition) {
2790                 break;
2791             }
2792         } while((prev=length)>0);
2793         return prev;
2794     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2795         UnicodeSetWithStringsIterator iter(set);
2796         UChar32 c;
2797         int32_t prev=length;
2798         do {
2799             U8_PREV_OR_FFFD(s, 0, length, c);
2800             if(realSet.contains(c)) {
2801                 break;
2802             }
2803             const char *s8;
2804             int32_t length8;
2805             iter.reset();
2806             while((s8=iter.nextUTF8(length8))!=NULL) {
2807                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2808                     // spanNeedsStrings=TRUE;
2809                     return prev;
2810                 }
2811             }
2812         } while((prev=length)>0);
2813         return prev;
2814     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2815         UnicodeSetWithStringsIterator iter(set);
2816         UChar32 c;
2817         int32_t prev=length, minSpanStart=length;
2818         do {
2819             U8_PREV_OR_FFFD(s, 0, length, c);
2820             if(!realSet.contains(c)) {
2821                 length=prev;  // Do not span this single, not-contained code point.
2822             }
2823             const char *s8;
2824             int32_t length8;
2825             iter.reset();
2826             while((s8=iter.nextUTF8(length8))!=NULL) {
2827                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2828                     // spanNeedsStrings=TRUE;
2829                     int32_t matchStart=prev-length8;
2830                     if(matchStart==0) {
2831                         return 0;
2832                     }
2833                     if(spanCondition==USET_SPAN_CONTAINED) {
2834                         // Iterate for the shortest match at each position.
2835                         // Recurse for each but the shortest match.
2836                         if(length==prev) {
2837                             length=matchStart;  // First match from prev.
2838                         } else {
2839                             if(matchStart>length) {
2840                                 // Remember shortest match from prev for iteration.
2841                                 int32_t temp=length;
2842                                 length=matchStart;
2843                                 matchStart=temp;
2844                             }
2845                             // Recurse for non-shortest match from prev.
2846                             int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
2847                                                                    USET_SPAN_CONTAINED);
2848                             if(spanStart<minSpanStart) {
2849                                 minSpanStart=spanStart;
2850                                 if(minSpanStart==0) {
2851                                     return 0;
2852                                 }
2853                             }
2854                         }
2855                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2856                         if(matchStart<length) {
2857                             // Remember longest match from prev.
2858                             length=matchStart;
2859                         }
2860                     }
2861                 }
2862             }
2863             if(length==prev) {
2864                 break;  // No match from prev.
2865             }
2866         } while((prev=length)>0);
2867         if(prev<minSpanStart) {
2868             return prev;
2869         } else {
2870             return minSpanStart;
2871         }
2872     }
2873 }
2874
// Bit flags selecting which spans are performed and compared.
// Each group has two single-bit options followed by the mask that combines them.
enum {
    // Text encoding.
    SPAN_UTF16          =1<<0,
    SPAN_UTF8           =1<<1,
    SPAN_UTFS           =SPAN_UTF16|SPAN_UTF8,

    // Original set and/or its complement.
    SPAN_SET            =1<<2,
    SPAN_COMPLEMENT     =1<<3,
    SPAN_POLARITY       =SPAN_SET|SPAN_COMPLEMENT,

    // Direction.
    SPAN_FWD            =1<<4,
    SPAN_BACK           =1<<5,
    SPAN_DIRS           =SPAN_FWD|SPAN_BACK,

    // Span condition used for the "contained" spans.
    SPAN_CONTAINED      =1<<8,
    SPAN_SIMPLE         =1<<9,
    SPAN_CONDITION      =SPAN_CONTAINED|SPAN_SIMPLE,

    // Everything.
    SPAN_ALL            =SPAN_UTFS|SPAN_POLARITY|SPAN_DIRS|SPAN_CONDITION
};
2895
2896 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
2897     return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
2898 }
2899
2900 static inline int32_t slen(const void *s, UBool isUTF16) {
2901     return isUTF16 ? u_strlen((const UChar *)s) : static_cast<int32_t>(strlen((const char *)s));
2902 }
2903
2904 /*
2905  * Count spans on a string with the method according to type and set the span limits.
2906  * The set may be the complement of the original.
2907  * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
2908  * according to the expected number of spans.
2909  * Sets typeName to an empty string if there is no such type.
2910  * Returns -1 if the span option is filtered out.
2911  */
2912 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
2913                         const void *s, int32_t length, UBool isUTF16,
2914                         uint32_t whichSpans,
2915                         int type, const char *&typeName,
2916                         int32_t limits[], int32_t limitsCapacity,
2917                         int32_t expectCount) {
2918     const UnicodeSet &realSet(set.getSet());
2919     int32_t start, count;
2920     USetSpanCondition spanCondition, firstSpanCondition, contained;
2921     UBool isForward;
2922
2923     if(type<0 || 7<type) {
2924         typeName="";
2925         return 0;
2926     }
2927
2928     static const char *const typeNames16[]={
2929         "contains", "contains(LM)",
2930         "span", "span(LM)",
2931         "containsBack", "containsBack(LM)",
2932         "spanBack", "spanBack(LM)"
2933     };
2934
2935     static const char *const typeNames8[]={
2936         "containsUTF8", "containsUTF8(LM)",
2937         "spanUTF8", "spanUTF8(LM)",
2938         "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
2939         "spanBackUTF8", "spanBackUTF8(LM)"
2940     };
2941
2942     typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
2943
2944     // filter span options
2945     if(type<=3) {
2946         // span forward
2947         if((whichSpans&SPAN_FWD)==0) {
2948             return -1;
2949         }
2950         isForward=TRUE;
2951     } else {
2952         // span backward
2953         if((whichSpans&SPAN_BACK)==0) {
2954             return -1;
2955         }
2956         isForward=FALSE;
2957     }
2958     if((type&1)==0) {
2959         // use USET_SPAN_CONTAINED
2960         if((whichSpans&SPAN_CONTAINED)==0) {
2961             return -1;
2962         }
2963         contained=USET_SPAN_CONTAINED;
2964     } else {
2965         // use USET_SPAN_SIMPLE
2966         if((whichSpans&SPAN_SIMPLE)==0) {
2967             return -1;
2968         }
2969         contained=USET_SPAN_SIMPLE;
2970     }
2971
2972     // Default first span condition for going forward with an uncomplemented set.
2973     spanCondition=USET_SPAN_NOT_CONTAINED;
2974     if(isComplement) {
2975         spanCondition=invertSpanCondition(spanCondition, contained);
2976     }
2977
2978     // First span condition for span(), used to terminate the spanBack() iteration.
2979     firstSpanCondition=spanCondition;
2980
2981     // spanBack(): Its initial span condition is span()'s last span condition,
2982     // which is the opposite of span()'s first span condition
2983     // if we expect an even number of spans.
2984     // (The loop inverts spanCondition (expectCount-1) times
2985     // before the expectCount'th span() call.)
2986     // If we do not compare forward and backward directions, then we do not have an
2987     // expectCount and just start with firstSpanCondition.
2988     if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
2989         spanCondition=invertSpanCondition(spanCondition, contained);
2990     }
2991
2992     count=0;
2993     switch(type) {
2994     case 0:
2995     case 1:
2996         start=0;
2997         if(length<0) {
2998             length=slen(s, isUTF16);
2999         }
3000         for(;;) {
3001             start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
3002                               containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
3003             if(count<limitsCapacity) {
3004                 limits[count]=start;
3005             }
3006             ++count;
3007             if(start>=length) {
3008                 break;
3009             }
3010             spanCondition=invertSpanCondition(spanCondition, contained);
3011         }
3012         break;
3013     case 2:
3014     case 3:
3015         start=0;
3016         for(;;) {
3017             start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
3018                               realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
3019             if(count<limitsCapacity) {
3020                 limits[count]=start;
3021             }
3022             ++count;
3023             if(length>=0 ? start>=length :
3024                            isUTF16 ? ((const UChar *)s)[start]==0 :
3025                                      ((const char *)s)[start]==0
3026             ) {
3027                 break;
3028             }
3029             spanCondition=invertSpanCondition(spanCondition, contained);
3030         }
3031         break;
3032     case 4:
3033     case 5:
3034         if(length<0) {
3035             length=slen(s, isUTF16);
3036         }
3037         for(;;) {
3038             ++count;
3039             if(count<=limitsCapacity) {
3040                 limits[limitsCapacity-count]=length;
3041             }
3042             length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
3043                               containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
3044             if(length==0 && spanCondition==firstSpanCondition) {
3045                 break;
3046             }
3047             spanCondition=invertSpanCondition(spanCondition, contained);
3048         }
3049         if(count<limitsCapacity) {
3050             memmove(limits, limits+(limitsCapacity-count), count*4);
3051         }
3052         break;
3053     case 6:
3054     case 7:
3055         for(;;) {
3056             ++count;
3057             if(count<=limitsCapacity) {
3058                 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
3059             }
3060             // Note: Length<0 is tested only for the first spanBack().
3061             // If we wanted to keep length<0 for all spanBack()s, we would have to
3062             // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
3063             length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
3064                               realSet.spanBackUTF8((const char *)s, length, spanCondition);
3065             if(length==0 && spanCondition==firstSpanCondition) {
3066                 break;
3067             }
3068             spanCondition=invertSpanCondition(spanCondition, contained);
3069         }
3070         if(count<limitsCapacity) {
3071             memmove(limits, limits+(limitsCapacity-count), count*4);
3072         }
3073         break;
3074     default:
3075         typeName="";
3076         return -1;
3077     }
3078
3079     return count;
3080 }
3081
// Indexes of the sets to be tested.
// Bit 0 of the index is set for the complemented variant (odd index=isComplement).
enum {
    SLOW        =0,
    SLOW_NOT    =1,
    FAST        =2,
    FAST_NOT    =3,
    SET_COUNT   =4
};
3090
3091 static const char *const setNames[SET_COUNT]={
3092     "slow",
3093     "slow.not",
3094     "fast",
3095     "fast.not"
3096 };
3097
3098 /*
3099  * Verify that we get the same results whether we look at text with contains(),
3100  * span() or spanBack(), using unfrozen or frozen versions of the set,
3101  * and using the set or its complement (switching the spanConditions accordingly).
3102  * The latter verifies that
3103  *   set.span(spanCondition) == set.complement().span(!spanCondition).
3104  *
3105  * The expectLimits[] are either provided by the caller (with expectCount>=0)
3106  * or returned to the caller (with an input expectCount<0).
3107  */
3108 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3109                               const void *s, int32_t length, UBool isUTF16,
3110                               uint32_t whichSpans,
3111                               int32_t expectLimits[], int32_t &expectCount,
3112                               const char *testName, int32_t index) {
3113     int32_t limits[500];
3114     int32_t limitsCount;
3115     int i, j;
3116
3117     const char *typeName;
3118     int type;
3119
3120     for(i=0; i<SET_COUNT; ++i) {
3121         if((i&1)==0) {
3122             // Even-numbered sets are original, uncomplemented sets.
3123             if((whichSpans&SPAN_SET)==0) {
3124                 continue;
3125             }
3126         } else {
3127             // Odd-numbered sets are complemented.
3128             if((whichSpans&SPAN_COMPLEMENT)==0) {
3129                 continue;
3130             }
3131         }
3132         for(type=0;; ++type) {
3133             limitsCount=getSpans(*sets[i], (UBool)(i&1),
3134                                  s, length, isUTF16,
3135                                  whichSpans,
3136                                  type, typeName,
3137                                  limits, UPRV_LENGTHOF(limits), expectCount);
3138             if(typeName[0]==0) {
3139                 break; // All types tried.
3140             }
3141             if(limitsCount<0) {
3142                 continue; // Span option filtered out.
3143             }
3144             if(expectCount<0) {
3145                 expectCount=limitsCount;
3146                 if(limitsCount>UPRV_LENGTHOF(limits)) {
3147                     errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3148                           testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)UPRV_LENGTHOF(limits));
3149                     return;
3150                 }
3151                 memcpy(expectLimits, limits, limitsCount*4);
3152             } else if(limitsCount!=expectCount) {
3153                 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3154                       testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
3155             } else {
3156                 for(j=0; j<limitsCount; ++j) {
3157                     if(limits[j]!=expectLimits[j]) {
3158                         errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3159                               testName, (long)index, setNames[i], typeName, (long)limitsCount,
3160                               j, (long)limits[j], (long)expectLimits[j]);
3161                         break;
3162                     }
3163                 }
3164             }
3165         }
3166     }
3167
3168     // Compare span() with containsAll()/containsNone(),
3169     // but only if we have expectLimits[] from the uncomplemented set.
3170     if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
3171         const UChar *s16=(const UChar *)s;
3172         UnicodeString string;
3173         int32_t prev=0, limit, length;
3174         for(i=0; i<expectCount; ++i) {
3175             limit=expectLimits[i];
3176             length=limit-prev;
3177             if(length>0) {
3178                 string.setTo(FALSE, s16+prev, length);  // read-only alias
3179                 if(i&1) {
3180                     if(!sets[SLOW]->getSet().containsAll(string)) {
3181                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3182                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3183                         return;
3184                     }
3185                     if(!sets[FAST]->getSet().containsAll(string)) {
3186                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3187                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3188                         return;
3189                     }
3190                 } else {
3191                     if(!sets[SLOW]->getSet().containsNone(string)) {
3192                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3193                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3194                         return;
3195                     }
3196                     if(!sets[FAST]->getSet().containsNone(string)) {
3197                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3198                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3199                         return;
3200                     }
3201                 }
3202             }
3203             prev=limit;
3204         }
3205     }
3206 }
3207
3208 // Specifically test either UTF-16 or UTF-8.
3209 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3210                               const void *s, int32_t length, UBool isUTF16,
3211                               uint32_t whichSpans,
3212                               const char *testName, int32_t index) {
3213     int32_t expectLimits[500];
3214     int32_t expectCount=-1;
3215     testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3216 }
3217
3218 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
3219     UChar c, c2;
3220
3221     if(length>=0) {
3222         while(length>0) {
3223             c=*s++;
3224             --length;
3225             if(0xd800<=c && c<0xe000) {
3226                 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3227                     return TRUE;
3228                 }
3229                 --length;
3230             }
3231         }
3232     } else {
3233         while((c=*s++)!=0) {
3234             if(0xd800<=c && c<0xe000) {
3235                 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3236                     return TRUE;
3237                 }
3238             }
3239         }
3240     }
3241     return FALSE;
3242 }
3243
3244 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3245 // unless either UTF is turned off in whichSpans.
3246 // Testing UTF-16 and UTF-8 together requires that surrogate code points
3247 // have the same contains(c) value as U+FFFD.
3248 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3249                                       const UChar *s16, int32_t length16,
3250                                       uint32_t whichSpans,
3251                                       const char *testName, int32_t index) {
3252     int32_t expectLimits[500];
3253     int32_t expectCount;
3254
3255     expectCount=-1;  // Get expectLimits[] from testSpan().
3256
3257     if((whichSpans&SPAN_UTF16)!=0) {
3258         testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
3259     }
3260     if((whichSpans&SPAN_UTF8)==0) {
3261         return;
3262     }
3263
3264     // Convert s16[] and expectLimits[] to UTF-8.
3265     uint8_t s8[3000];
3266     int32_t offsets[3000];
3267
3268     const UChar *s16Limit=s16+length16;
3269     char *t=(char *)s8;
3270     char *tLimit=t+sizeof(s8);
3271     int32_t *o=offsets;
3272     UErrorCode errorCode=U_ZERO_ERROR;
3273
3274     // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3275     ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
3276     if(U_FAILURE(errorCode)) {
3277         errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3278               testName, (long)index, u_errorName(errorCode));
3279         ucnv_resetFromUnicode(utf8Cnv);
3280         return;
3281     }
3282     int32_t length8=(int32_t)(t-(char *)s8);
3283
3284     // Convert expectLimits[].
3285     int32_t i, j, expect;
3286     for(i=j=0; i<expectCount; ++i) {
3287         expect=expectLimits[i];
3288         if(expect==length16) {
3289             expectLimits[i]=length8;
3290         } else {
3291             while(offsets[j]<expect) {
3292                 ++j;
3293             }
3294             expectLimits[i]=j;
3295         }
3296     }
3297
3298     testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
3299 }
3300
3301 static UChar32 nextCodePoint(UChar32 c) {
3302     // Skip some large and boring ranges.
3303     switch(c) {
3304     case 0x3441:
3305         return 0x4d7f;
3306     case 0x5100:
3307         return 0x9f00;
3308     case 0xb040:
3309         return 0xd780;
3310     case 0xe041:
3311         return 0xf8fe;
3312     case 0x10100:
3313         return 0x20000;
3314     case 0x20041:
3315         return 0xe0000;
3316     case 0xe0101:
3317         return 0x10fffd;
3318     default:
3319         return c+1;
3320     }
3321 }
3322
3323 // Verify that all implementations represent the same set.
3324 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3325     // contains(U+FFFD) is inconsistent with contains(some surrogates),
3326     // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3327     // Skip the UTF-8 part of the test - if the string contains surrogates -
3328     // because it is likely to produce a different result.
3329     UBool inconsistentSurrogates=
3330             (!(sets[0]->getSet().contains(0xfffd) ?
3331                sets[0]->getSet().contains(0xd800, 0xdfff) :
3332                sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3333              sets[0]->hasStringsWithSurrogates());
3334
3335     UChar s[1000];
3336     int32_t length=0;
3337     uint32_t localWhichSpans;
3338
3339     UChar32 c, first;
3340     for(first=c=0;; c=nextCodePoint(c)) {
3341         if(c>0x10ffff || length>(UPRV_LENGTHOF(s)-U16_MAX_LENGTH)) {
3342             localWhichSpans=whichSpans;
3343             if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3344                 localWhichSpans&=~SPAN_UTF8;
3345             }
3346             testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3347             if(c>0x10ffff) {
3348                 break;
3349             }
3350             length=0;
3351             first=c;
3352         }
3353         U16_APPEND_UNSAFE(s, length, c);
3354     }
3355 }
3356
3357 // Test with a particular, interesting string.
3358 // Specify length and try NUL-termination.
3359 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3360     static const UChar s[]={
3361         0x61, 0x62, 0x20,                       // Latin, space
3362         0x3b1, 0x3b2, 0x3b3,                    // Greek
3363         0xd900,                                 // lead surrogate
3364         0x3000, 0x30ab, 0x30ad,                 // wide space, Katakana
3365         0xdc05,                                 // trail surrogate
3366         0xa0, 0xac00, 0xd7a3,                   // nbsp, Hangul
3367         0xd900, 0xdc05,                         // unassigned supplementary
3368         0xd840, 0xdfff, 0xd860, 0xdffe,         // Han supplementary
3369         0xd7a4, 0xdc05, 0xd900, 0x2028,         // unassigned, surrogates in wrong order, LS
3370         0                                       // NUL
3371     };
3372
3373     if((whichSpans&SPAN_UTF16)==0) {
3374         return;
3375     }
3376     testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
3377     testSpan(sets, s, UPRV_LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
3378 }
3379
3380 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3381     static const char s[]={
3382         "abc"                                   // Latin
3383
3384         /* trail byte in lead position */
3385         "\x80"
3386
3387         " "                                     // space
3388
3389         /* truncated multi-byte sequences */
3390         "\xd0"
3391         "\xe0"
3392         "\xe1"
3393         "\xed"
3394         "\xee"
3395         "\xf0"
3396         "\xf1"
3397         "\xf4"
3398         "\xf8"
3399         "\xfc"
3400
3401         "\xCE\xB1\xCE\xB2\xCE\xB3"              // Greek
3402
3403         /* trail byte in lead position */
3404         "\x80"
3405
3406         "\xe0\x80"
3407         "\xe0\xa0"
3408         "\xe1\x80"
3409         "\xed\x80"
3410         "\xed\xa0"
3411         "\xee\x80"
3412         "\xf0\x80"
3413         "\xf0\x90"
3414         "\xf1\x80"
3415         "\xf4\x80"
3416         "\xf4\x90"
3417         "\xf8\x80"
3418         "\xfc\x80"
3419
3420         "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD"  // wide space, Katakana
3421
3422         /* trail byte in lead position */
3423         "\x80"
3424
3425         "\xf0\x80\x80"
3426         "\xf0\x90\x80"
3427         "\xf1\x80\x80"
3428         "\xf4\x80\x80"
3429         "\xf4\x90\x80"
3430         "\xf8\x80\x80"
3431         "\xfc\x80\x80"
3432
3433         "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3"      // nbsp, Hangul
3434
3435         /* trail byte in lead position */
3436         "\x80"
3437
3438         "\xf8\x80\x80\x80"
3439         "\xfc\x80\x80\x80"
3440
3441         "\xF1\x90\x80\x85"                      // unassigned supplementary
3442
3443         /* trail byte in lead position */
3444         "\x80"
3445
3446         "\xfc\x80\x80\x80\x80"
3447
3448         "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE"      // Han supplementary
3449
3450         /* trail byte in lead position */
3451         "\x80"
3452
3453         /* complete sequences but non-shortest forms or out of range etc. */
3454         "\xc0\x80"
3455         "\xe0\x80\x80"
3456         "\xed\xa0\x80"
3457         "\xf0\x80\x80\x80"
3458         "\xf4\x90\x80\x80"
3459         "\xf8\x80\x80\x80\x80"
3460         "\xfc\x80\x80\x80\x80\x80"
3461         "\xfe"
3462         "\xff"
3463
3464         /* trail byte in lead position */
3465         "\x80"
3466
3467         "\xED\x9E\xA4\xE2\x80\xA8"              // unassigned, LS, NUL-terminated
3468     };
3469
3470     if((whichSpans&SPAN_UTF8)==0) {
3471         return;
3472     }
3473     testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
3474     testSpan(sets, s, UPRV_LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
3475 }
3476
// Multiplies a list of span option sets so that each resulting portion
// has exactly one of the alternatives a, b and c.
// Every entry is first reduced with mask; the first portion (in place) gets a,
// the second portion (appended after the first) gets b,
// and the third portion (appended after the second) gets c.
// If b==0, then only the first portion is written (c is ignored).
// If b!=0 and c==0, then only the first two portions are written.
// Returns the new number of entries.
// The caller must provide room in whichSpans[] for all portions.
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    int32_t portions=1;
    if(b!=0) {
        portions= (c!=0) ? 3 : 2;
    }

    for(int32_t k=0; k<whichSpansCount; ++k) {
        const uint32_t base=whichSpans[k]&mask;
        whichSpans[k]=base|a;
        if(portions>=2) {
            whichSpans[whichSpansCount+k]=base|b;
        }
        if(portions==3) {
            whichSpans[2*whichSpansCount+k]=base|c;
        }
    }
    return portions*whichSpansCount;
}
3499
3500 #define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
3501 #define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
3502 #define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3503 #define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3504
3505 void UnicodeSetTest::TestSpan() {
3506     // "[...]" is a UnicodeSet pattern.
3507     // "*" performs tests on all Unicode code points and on a selection of
3508     //   malformed UTF-8/16 strings.
3509     // "-options" limits the scope of testing for the current set.
3510     //   By default, the test verifies that equivalent boundaries are found
3511     //   for UTF-16 and UTF-8, going forward and backward,
3512     //   alternating USET_SPAN_NOT_CONTAINED with
3513     //   either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3514     //   Single-character options:
3515     //     8 -- UTF-16 and UTF-8 boundaries may differ.
3516     //          Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3517     //          or the set contains strings with unpaired surrogates
3518     //          which do not translate to valid UTF-8.
3519     //     c -- set.span() and set.complement().span() boundaries may differ.
3520     //          Cause: Set strings are not complemented.
3521     //     b -- span() and spanBack() boundaries may differ.
3522     //          Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3523     //          and spanBack(USET_SPAN_SIMPLE) are defined to
3524     //          match with non-overlapping substrings.
3525     //          For example, with a set containing "ab" and "ba",
3526     //          span() of "aba" yields boundaries { 0, 2, 3 }
3527     //          because the initial "ab" matches from 0 to 2,
3528     //          while spanBack() yields boundaries { 0, 1, 3 }
3529     //          because the final "ba" matches from 1 to 3.
3530     //     l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3531     //          Cause: Strings in the set overlap, and a longer match may
3532     //          require a sequence including non-longest substrings.
3533     //          For example, with a set containing "ab", "abc" and "cd",
3534     //          span(contained) of "abcd" spans the entire string
3535     //          but span(longest match) only spans the first 3 characters.
3536     //   Each "-options" first resets all options and then applies the specified options.
3537     //   A "-" without options resets the options.
3538     //   The options are also reset for each new set.
3539     // Other strings will be spanned.
3540     static const char *const testdata[]={
3541         "[:ID_Continue:]",
3542         "*",
3543         "[:White_Space:]",
3544         "*",
3545         "[]",
3546         "*",
3547         "[\\u0000-\\U0010FFFF]",
3548         "*",
3549         "[\\u0000\\u0080\\u0800\\U00010000]",
3550         "*",
3551         "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3552         "*",
3553         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3554         "-c",
3555         "*",
3556         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3557         "-c",
3558         "*",
3559
3560         // Overlapping strings cause overlapping attempts to match.
3561         "[x{xy}{xya}{axy}{ax}]",
3562         "-cl",
3563
3564         // More repetitions of "xya" would take too long with the recursive
3565         // reference implementation.
3566         // containsAll()=FALSE
3567         // test_string 0x14
3568         "xx"
3569         "xyaxyaxyaxya"  // set.complement().span(longest match) will stop here.
3570         "xx"            // set.complement().span(contained) will stop between the two 'x'es.
3571         "xyaxyaxyaxya"
3572         "xx"
3573         "xyaxyaxyaxya"  // span() ends here.
3574         "aaa",
3575
3576         // containsAll()=TRUE
3577         // test_string 0x15
3578         "xx"
3579         "xyaxyaxyaxya"
3580         "xx"
3581         "xyaxyaxyaxya"
3582         "xx"
3583         "xyaxyaxyaxy",
3584
3585         "-bc",
3586         // test_string 0x17
3587         "byayaxya",  // span() -> { 4, 7, 8 }  spanBack() -> { 5, 8 }
3588         "-c",
3589         "byayaxy",   // span() -> { 4, 7 }     complement.span() -> { 7 }
3590         "byayax",    // span() -> { 4, 6 }     complement.span() -> { 6 }
3591         "-",
3592         "byaya",     // span() -> { 5 }
3593         "byay",      // span() -> { 4 }
3594         "bya",       // span() -> { 3 }
3595
3596         // span(longest match) will not span the whole string.
3597         "[a{ab}{bc}]",
3598         "-cl",
3599         // test_string 0x21
3600         "abc",
3601
3602         "[a{ab}{abc}{cd}]",
3603         "-cl",
3604         "acdabcdabccd",
3605
3606         // spanBack(longest match) will not span the whole string.
3607         "[c{ab}{bc}]",
3608         "-cl",
3609         "abc",
3610
3611         "[d{cd}{bcd}{ab}]",
3612         "-cl",
3613         "abbcdabcdabd",
3614
3615         // Test with non-ASCII set strings - test proper handling of surrogate pairs
3616         // and UTF-8 trail bytes.
3617         // Copies of above test sets and strings, but transliterated to have
3618         // different code points with similar trail units.
3619         // Previous: a      b         c            d
3620         // Unicode:  042B   30AB      200AB        204AB
3621         // UTF-16:   042B   30AB      D840 DCAB    D841 DCAB
3622         // UTF-8:    D0 AB  E3 82 AB  F0 A0 82 AB  F0 A0 92 AB
3623         "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3624         "-cl",
3625         "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3626
3627         "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3628         "-cl",
3629         "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3630
3631         // Stress bookkeeping and recursion.
3632         // The following strings are barely doable with the recursive
3633         // reference implementation.
3634         // The not-contained character at the end prevents an early exit from the span().
3635         "[b{bb}]",
3636         "-c",
3637         // test_string 0x33
3638         "bbbbbbbbbbbbbbbbbbbbbbbb-",
3639         // On complement sets, span() and spanBack() get different results
3640         // because b is not in the complement set and there is an odd number of b's
3641         // in the test string.
3642         "-bc",
3643         "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3644
3645         // Test with set strings with an initial or final code point span
3646         // longer than 254.
3647         "[a{" _64_a _64_a _64_a _64_a "b}"
3648           "{a" _64_b _64_b _64_b _64_b "}]",
3649         "-c",
3650         _64_a _64_a _64_a _63_a "b",
3651         _64_a _64_a _64_a _64_a "b",
3652         _64_a _64_a _64_a _64_a "aaaabbbb",
3653         "a" _64_b _64_b _64_b _63_b,
3654         "a" _64_b _64_b _64_b _64_b,
3655         "aaaabbbb" _64_b _64_b _64_b _64_b,
3656
3657         // Test with strings containing unpaired surrogates.
3658         // They are not representable in UTF-8, and a leading trail surrogate
3659         // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3660         // U+20001 == \\uD840\\uDC01
3661         // U+20400 == \\uD841\\uDC00
3662         "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3663         "-8cl",
3664         "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3665     };
3666     uint32_t whichSpans[96]={ SPAN_ALL };
3667     int32_t whichSpansCount=1;
3668
3669     UnicodeSet *sets[SET_COUNT]={ NULL };
3670     const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
3671
3672     char testName[1024];
3673     char *testNameLimit=testName;
3674
3675     int32_t i, j;
3676     for(i=0; i<UPRV_LENGTHOF(testdata); ++i) {
3677         const char *s=testdata[i];
3678         if(s[0]=='[') {
3679             // Create new test sets from this pattern.
3680             for(j=0; j<SET_COUNT; ++j) {
3681                 delete sets_with_str[j];
3682                 delete sets[j];
3683             }
3684             UErrorCode errorCode=U_ZERO_ERROR;
3685             sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3686             if(U_FAILURE(errorCode)) {
3687                 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
3688                 break;
3689             }
3690             sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3691             sets[SLOW_NOT]->complement();
3692             // Intermediate set: Test cloning of a frozen set.
3693             UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3694             fast->freeze();
3695             sets[FAST]=fast->clone();
3696             delete fast;
3697             UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3698             fastNot->freeze();
3699             sets[FAST_NOT]=fastNot->clone();
3700             delete fastNot;
3701
3702             for(j=0; j<SET_COUNT; ++j) {
3703                 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3704             }
3705
3706             strcpy(testName, s);
3707             testNameLimit=strchr(testName, 0);
3708             *testNameLimit++=':';
3709             *testNameLimit=0;
3710
3711             whichSpans[0]=SPAN_ALL;
3712             whichSpansCount=1;
3713         } else if(s[0]=='-') {
3714             whichSpans[0]=SPAN_ALL;
3715             whichSpansCount=1;
3716
3717             while(*++s!=0) {
3718                 switch(*s) {
3719                 case 'c':
3720                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3721                                                    ~SPAN_POLARITY,
3722                                                    SPAN_SET,
3723                                                    SPAN_COMPLEMENT,
3724                                                    0);
3725                     break;
3726                 case 'b':
3727                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3728                                                    ~SPAN_DIRS,
3729                                                    SPAN_FWD,
3730                                                    SPAN_BACK,
3731                                                    0);
3732                     break;
3733                 case 'l':
3734                     // test USET_SPAN_CONTAINED FWD & BACK, and separately
3735                     // USET_SPAN_SIMPLE only FWD, and separately
3736                     // USET_SPAN_SIMPLE only BACK
3737                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3738                                                    ~(SPAN_DIRS|SPAN_CONDITION),
3739                                                    SPAN_DIRS|SPAN_CONTAINED,
3740                                                    SPAN_FWD|SPAN_SIMPLE,
3741                                                    SPAN_BACK|SPAN_SIMPLE);
3742                     break;
3743                 case '8':
3744                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3745                                                    ~SPAN_UTFS,
3746                                                    SPAN_UTF16,
3747                                                    SPAN_UTF8,
3748                                                    0);
3749                     break;
3750                 default:
3751                     errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3752                     break;
3753                 }
3754             }
3755         } else if(0==strcmp(s, "*")) {
3756             strcpy(testNameLimit, "bad_string");
3757             for(j=0; j<whichSpansCount; ++j) {
3758                 if(whichSpansCount>1) {
3759                     sprintf(testNameLimit+10 /* strlen("bad_string") */,
3760                             "%%0x%3x",
3761                             whichSpans[j]);
3762                 }
3763                 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3764                 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3765             }
3766
3767             strcpy(testNameLimit, "contents");
3768             for(j=0; j<whichSpansCount; ++j) {
3769                 if(whichSpansCount>1) {
3770                     sprintf(testNameLimit+8 /* strlen("contents") */,
3771                             "%%0x%3x",
3772                             whichSpans[j]);
3773                 }
3774                 testSpanContents(sets_with_str, whichSpans[j], testName);
3775             }
3776         } else {
3777             UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3778             strcpy(testNameLimit, "test_string");
3779             for(j=0; j<whichSpansCount; ++j) {
3780                 if(whichSpansCount>1) {
3781                     sprintf(testNameLimit+11 /* strlen("test_string") */,
3782                             "%%0x%3x",
3783                             whichSpans[j]);
3784                 }
3785                 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3786             }
3787         }
3788     }
3789     for(j=0; j<SET_COUNT; ++j) {
3790         delete sets_with_str[j];
3791         delete sets[j];
3792     }
3793 }
3794
3795 // Test select patterns and strings, and test USET_SPAN_SIMPLE.
3796 void UnicodeSetTest::TestStringSpan() {
3797     static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3798     static const char *const string=
3799         "xx"
3800         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3801         "xx"
3802         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3803         "xx"
3804         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3805         "aaaa";
3806
3807     UErrorCode errorCode=U_ZERO_ERROR;
3808     UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3809     UnicodeSet set(pattern16, errorCode);
3810     if(U_FAILURE(errorCode)) {
3811         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3812         return;
3813     }
3814
3815     UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3816
3817     if(set.containsAll(string16)) {
3818         errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
3819     }
3820
3821     // Remove trailing "aaaa".
3822     string16.truncate(string16.length()-4);
3823     if(!set.containsAll(string16)) {
3824         errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
3825     }
3826
3827     string16=UNICODE_STRING_SIMPLE("byayaxya");
3828     const UChar *s16=string16.getBuffer();
3829     int32_t length16=string16.length();
3830     (void)length16;   // Suppress set but not used warning.
3831     if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3832         set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3833         set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3834         set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3835         set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3836         set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3837     ) {
3838         errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3839     }
3840
3841     pattern="[a{ab}{abc}{cd}]";
3842     pattern16=UnicodeString(pattern, -1, US_INV);
3843     set.applyPattern(pattern16, errorCode);
3844     if(U_FAILURE(errorCode)) {
3845         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3846         return;
3847     }
3848     string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
3849     s16=string16.getBuffer();
3850     length16=string16.length();
3851     if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3852         set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3853         set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3854     ) {
3855         errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3856     }
3857
3858     pattern="[d{cd}{bcd}{ab}]";
3859     pattern16=UnicodeString(pattern, -1, US_INV);
3860     set.applyPattern(pattern16, errorCode).freeze();
3861     if(U_FAILURE(errorCode)) {
3862         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3863         return;
3864     }
3865     string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
3866     s16=string16.getBuffer();
3867     length16=string16.length();
3868     if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3869         set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3870         set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3871     ) {
3872         errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3873     }
3874 }
3875
/**
 * Including collationroot.h fails on Windows with
 *   1>c:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\include\driverspecs.h(142): error C2008: '$' : unexpected in macro definition
 * so we skip this test on Windows.
 *
 * The cause is that intltest builds with /Za, which disables language extensions,
 * which means that Windows header files cannot be used.
 */
3884 #if !UCONFIG_NO_COLLATION && !U_PLATFORM_HAS_WIN32_API
3885 #include "collationroot.h"
3886 #include "collationtailoring.h"
3887 #endif
3888
3889 void UnicodeSetTest::TestUCAUnsafeBackwards() {
3890 #if U_PLATFORM_HAS_WIN32_API
3891     infoln("Skipping TestUCAUnsafeBackwards() - can't include collationroot.h on Windows without language extensions!");
3892 #elif !UCONFIG_NO_COLLATION
3893     UErrorCode errorCode = U_ZERO_ERROR;
3894
3895     // Get the unsafeBackwardsSet
3896     const CollationCacheEntry *rootEntry = CollationRoot::getRootCacheEntry(errorCode);
3897     if(U_FAILURE(errorCode)) {
3898       dataerrln("FAIL: %s getting root cache entry", u_errorName(errorCode));
3899       return;
3900     }
3901     //const UVersionInfo &version = rootEntry->tailoring->version;
3902     const UnicodeSet *unsafeBackwardSet = rootEntry->tailoring->unsafeBackwardSet;
3903
3904     checkSerializeRoundTrip(*unsafeBackwardSet, errorCode);
3905
3906     if(!logKnownIssue("11891","UnicodeSet fails to round trip on CollationRoot...unsafeBackwards set")) {
3907         // simple test case
3908         // TODO(ticket #11891): Simplify this test function to this simple case. Rename it appropriately.
3909         // TODO(ticket #11891): Port test to Java. Is this a bug there, too?
3910         UnicodeSet surrogates;
3911         surrogates.add(0xd83a);  // a lead surrogate
3912         surrogates.add(0xdc00, 0xdfff);  // a range of trail surrogates
3913         UnicodeString pat;
3914         surrogates.toPattern(pat, FALSE);  // bad: [ 0xd83a, 0xdc00, 0x2d, 0xdfff ]
3915         // TODO: Probably fix either UnicodeSet::_generatePattern() or _appendToPat()
3916         // so that at least one type of surrogate code points are escaped,
3917         // or (minimally) so that adjacent lead+trail surrogate code points are escaped.
3918         errorCode = U_ZERO_ERROR;
3919         UnicodeSet s2;
3920         s2.applyPattern(pat, errorCode);  // looks like invalid range [ 0x1e800, 0x2d, 0xdfff ]
3921         if(U_FAILURE(errorCode)) {
3922             errln("FAIL: surrogates to/from pattern - %s", u_errorName(errorCode));
3923         } else {
3924             checkEqual(surrogates, s2, "surrogates to/from pattern");
3925         }
3926         // This occurs in the UCA unsafe-backwards set.
3927         checkRoundTrip(*unsafeBackwardSet);
3928     }
3929 #endif
3930 }
3931
3932 void UnicodeSetTest::TestIntOverflow() {
3933     // This test triggers undefined double->int conversion behavior
3934     // if the implementation is not careful.
3935     IcuTestErrorCode errorCode(*this, "TestIntOverflow");
3936     UnicodeSet set(u"[:ccc=2222222222222222222:]", errorCode);
3937     assertTrue("[:ccc=int_overflow:] -> empty set", set.isEmpty());
3938     assertEquals("[:ccc=int_overflow:] -> illegal argument",
3939                  U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
3940 }
3941
3942 void UnicodeSetTest::TestUnusedCcc() {
3943 #if !UCONFIG_NO_NORMALIZATION
3944     // All numeric ccc values 0..255 are valid, but many are unused.
3945     IcuTestErrorCode errorCode(*this, "TestUnusedCcc");
3946     UnicodeSet ccc2(u"[:ccc=2:]", errorCode);
3947     assertSuccess("[:ccc=2:]", errorCode);
3948     assertTrue("[:ccc=2:] -> empty set", ccc2.isEmpty());
3949
3950     UnicodeSet ccc255(u"[:ccc=255:]", errorCode);
3951     assertSuccess("[:ccc=255:]", errorCode);
3952     assertTrue("[:ccc=255:] -> empty set", ccc255.isEmpty());
3953
3954     // Non-integer values and values outside 0..255 are invalid.
3955     UnicodeSet ccc_1(u"[:ccc=-1:]", errorCode);
3956     assertEquals("[:ccc=-1:] -> illegal argument",
3957                  U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
3958     assertTrue("[:ccc=-1:] -> empty set", ccc_1.isEmpty());
3959
3960     UnicodeSet ccc256(u"[:ccc=256:]", errorCode);
3961     assertEquals("[:ccc=256:] -> illegal argument",
3962                  U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
3963     assertTrue("[:ccc=256:] -> empty set", ccc256.isEmpty());
3964
3965     UnicodeSet ccc1_1(u"[:ccc=1.1:]", errorCode);
3966     assertEquals("[:ccc=1.1:] -> illegal argument",
3967                  U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
3968     assertTrue("[:ccc=1.1:] -> empty set", ccc1_1.isEmpty());
3969 #endif
3970 }
3971
3972 void UnicodeSetTest::TestDeepPattern() {
3973     IcuTestErrorCode errorCode(*this, "TestDeepPattern");
3974     // Nested ranges are parsed via recursion which can use a lot of stack space.
3975     // After a reasonable limit, we should get an error.
3976     constexpr int32_t DEPTH = 20000;
3977     UnicodeString pattern, suffix;
3978     for (int32_t i = 0; i < DEPTH; ++i) {
3979         pattern.append(u"[a", 2);
3980         suffix.append(']');
3981     }
3982     pattern.append(suffix);
3983     UnicodeSet set(pattern, errorCode);
3984     assertTrue("[a[a[a...1000s...]]] -> error", errorCode.isFailure());
3985     errorCode.reset();
3986 }