icuSources/test/intltest/usettest.cpp

   1 /*
   2 **************************************************************************************
   3 *   Copyright (C) 1999-2006 International Business Machines Corporation and
   4 *   others. All Rights Reserved.
   5 **************************************************************************************
   6 *   Date        Name        Description
   7 *   10/20/99    alan        Creation.
   8 *   03/22/2000  Madhu       Added additional tests
   9 **************************************************************************************
  10 */
  11
  12 #include "unicode/utypes.h"
  13 #include "usettest.h"
  14 #include "unicode/uniset.h"
  15 #include "unicode/uchar.h"
  16 #include "unicode/usetiter.h"
  17 #include "unicode/ustring.h"
  18 #include "unicode/parsepos.h"
  19 #include "unicode/symtable.h"
  20 #include "unicode/uversion.h"
  21 #include "hash.h"
  22
  23
  24 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
  25     errln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \
  26     u_errorName(status));}}
  27
  28 #define TEST_ASSERT(expr) {if (!(expr)) { \
  29     errln("fail in file \"%s\", line %d", __FILE__, __LINE__); }}
  30
  31 UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
  32     UnicodeString pat;
  33     set.toPattern(pat);
  34     return left + UnicodeSetTest::escape(pat);
  35 }
  36
  37 #define CASE(id,test) case id:                          \
  38                           name = #test;                 \
  39                           if (exec) {                   \
  40                               logln(#test "---");       \
  41                               logln((UnicodeString)""); \
  42                               test();                   \
  43                           }                             \
  44                           break
  45
  46 void
  47 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
  48                                const char* &name, char* /*par*/) {
  49     // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
  50     switch (index) {
  51         CASE(0,TestPatterns);
  52         CASE(1,TestAddRemove);
  53         CASE(2,TestCategories);
  54         CASE(3,TestCloneEqualHash);
  55         CASE(4,TestMinimalRep);
  56         CASE(5,TestAPI);
  57         CASE(6,TestScriptSet);
  58         CASE(7,TestPropertySet);
  59         CASE(8,TestClone);
  60         CASE(9,TestExhaustive);
  61         CASE(10,TestToPattern);
  62         CASE(11,TestIndexOf);
  63         CASE(12,TestStrings);
  64         CASE(13,Testj2268);
  65         CASE(14,TestCloseOver);
  66         CASE(15,TestEscapePattern);
  67         CASE(16,TestInvalidCodePoint);
  68         CASE(17,TestSymbolTable);
  69         CASE(18,TestSurrogate);
  70         CASE(19,TestPosixClasses);
  71         CASE(20,TestIteration);
  72         default: name = ""; break;
  73     }
  74 }
  75
// Sentinel entry placed inside the expected-string arrays handed to
// expectToPattern(). NOTE(review): presumably it separates strings that must
// be in the set from strings that must not be -- confirm in expectToPattern().
static const char NOT[] = "%%%%";
  77
  78 /**
  79  * UVector was improperly copying contents
  80  * This code will crash this is still true
  81  */
  82 void UnicodeSetTest::Testj2268() {
  83   UnicodeSet t;
  84   t.add(UnicodeString("abc"));
  85   UnicodeSet test(t);
  86   UnicodeString ustrPat;
  87   test.toPattern(ustrPat, TRUE);
  88 }
  89
  90 /**
  91  * Test toPattern().
  92  */
  93 void UnicodeSetTest::TestToPattern() {
  94     UErrorCode ec = U_ZERO_ERROR;
  95
  96     // Test that toPattern() round trips with syntax characters and
  97     // whitespace.
  98     {
  99         static const char* OTHER_TOPATTERN_TESTS[] = {
 100             "[[:latin:]&[:greek:]]",
 101             "[[:latin:]-[:greek:]]",
 102             "[:nonspacing mark:]",
 103             NULL
 104         };
 105
 106         for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
 107             ec = U_ZERO_ERROR;
 108             UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
 109             if (U_FAILURE(ec)) {
 110                 errln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j]);
 111                 continue;
 112             }
 113             checkPat(OTHER_TOPATTERN_TESTS[j], s);
 114         }
 115
 116         for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
 117             if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
 118
 119                 // check various combinations to make sure they all work.
 120                 if (i != 0 && !toPatternAux(i, i)){
 121                     continue;
 122                 }
 123                 if (!toPatternAux(0, i)){
 124                     continue;
 125                 }
 126                 if (!toPatternAux(i, 0xFFFF)){
 127                     continue;
 128                 }
 129             }
 130         }
 131     }
 132
 133     // Test pattern behavior of multicharacter strings.
 134     {
 135         ec = U_ZERO_ERROR;
 136         UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
 137
 138         // This loop isn't a loop.  It's here to make the compiler happy.
 139         // If you're curious, try removing it and changing the 'break'
 140         // statements (except for the last) to goto's.
 141         for (;;) {
 142             if (U_FAILURE(ec)) break;
 143             const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
 144             expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
 145
 146             s->add("ac");
 147             const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
 148             expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
 149
 150             s->applyPattern("[a-z {\\{l} {r\\}}]", ec);
 151             if (U_FAILURE(ec)) break;
 152             const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
 153             expectToPattern(*s, "[a-z{r\\}}{\\{l}]", exp3);
 154
 155             s->add("[]");
 156             const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
 157             expectToPattern(*s, "[a-z{\\[\\]}{r\\}}{\\{l}]", exp4);
 158
 159             s->applyPattern("[a-z {\\u4E01\\u4E02}{\\n\\r}]", ec);
 160             if (U_FAILURE(ec)) break;
 161             const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
 162             expectToPattern(*s, "[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]", exp5);
 163
 164             // j2189
 165             s->clear();
 166             s->add(UnicodeString("abc", ""));
 167             s->add(UnicodeString("abc", ""));
 168             const char* exp6[] = {"abc", NOT, "ab", NULL};
 169             expectToPattern(*s, "[{abc}]", exp6);
 170
 171             break;
 172         }
 173
 174         if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
 175         delete s;
 176     }
 177
 178     // JB#3400: For 2 character ranges prefer [ab] to [a-b]
 179     UnicodeSet s;
 180     s.add((UChar)97, (UChar)98); // 'a', 'b'
 181     expectToPattern(s, "[ab]", NULL);
 182 }
 183
 184 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
 185
 186     // use Integer.toString because Utility.hex doesn't handle ints
 187     UnicodeString pat = "";
 188     // TODO do these in hex
 189     //String source = "0x" + Integer.toString(start,16).toUpperCase();
 190     //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
 191     UnicodeString source;
 192     source = source + (uint32_t)start;
 193     if (start != end)
 194         source = source + ".." + (uint32_t)end;
 195     UnicodeSet testSet;
 196     testSet.add(start, end);
 197     return checkPat(source, testSet);
 198 }
 199
 200 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
 201                                const UnicodeSet& testSet) {
 202     // What we want to make sure of is that a pattern generated
 203     // by toPattern(), with or without escaped unprintables, can
 204     // be passed back into the UnicodeSet constructor.
 205     UnicodeString pat0;
 206
 207     testSet.toPattern(pat0, TRUE);
 208
 209     if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
 210
 211     //String pat1 = unescapeLeniently(pat0);
 212     //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
 213
 214     UnicodeString pat2;
 215     testSet.toPattern(pat2, FALSE);
 216     if (!checkPat(source, testSet, pat2)) return FALSE;
 217
 218     //String pat3 = unescapeLeniently(pat2);
 219     // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
 220
 221     //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
 222     logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
 223     return TRUE;
 224 }
 225
 226 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
 227                                const UnicodeSet& testSet,
 228                                const UnicodeString& pat) {
 229     UErrorCode ec = U_ZERO_ERROR;
 230     UnicodeSet testSet2(pat, ec);
 231     if (testSet2 != testSet) {
 232         errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
 233         return FALSE;
 234     }
 235     return TRUE;
 236 }
 237
 238 void
 239 UnicodeSetTest::TestPatterns(void) {
 240     UnicodeSet set;
 241     expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""),  "km");
 242     expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""),  "aczz");
 243     expectPattern(set, UnicodeString("[a\\-z]", ""),  "--aazz");
 244     expectPattern(set, UnicodeString("[-az]", ""),  "--aazz");
 245     expectPattern(set, UnicodeString("[az-]", ""),  "--aazz");
 246     expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
 247
 248     // Throw in a test of complement
 249     set.complement();
 250     UnicodeString exp;
 251     exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
 252     expectPairs(set, exp);
 253 }
 254
 255 void
 256 UnicodeSetTest::TestCategories(void) {
 257     UErrorCode status = U_ZERO_ERROR;
 258     const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
 259     UnicodeSet set(pat, status);
 260     if (U_FAILURE(status)) {
 261         errln((UnicodeString)"Fail: Can't construct set with " + pat);
 262     } else {
 263         expectContainment(set, pat, "ABC", "abc");
 264     }
 265
 266     UChar32 i;
 267     int32_t failures = 0;
 268     // Make sure generation of L doesn't pollute cached Lu set
 269     // First generate L, then Lu
 270     set.applyPattern("[:L:]", status);
 271     if (U_FAILURE(status)) { errln("FAIL"); return; }
 272     for (i=0; i<0x200; ++i) {
 273         UBool l = u_isalpha((UChar)i);
 274         if (l != set.contains(i)) {
 275             errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
 276                   set.contains(i));
 277             if (++failures == 10) break;
 278         }
 279     }
 280
 281     set.applyPattern("[:Lu:]", status);
 282     if (U_FAILURE(status)) { errln("FAIL"); return; }
 283     for (i=0; i<0x200; ++i) {
 284         UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
 285         if (lu != set.contains(i)) {
 286             errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
 287                   set.contains(i));
 288             if (++failures == 20) break;
 289         }
 290     }
 291 }
 292 void
 293 UnicodeSetTest::TestCloneEqualHash(void) {
 294     UErrorCode status = U_ZERO_ERROR;
 295     // set1 and set2 used to be built with the obsolete constructor taking
 296     // UCharCategory values; replaced with pattern constructors
 297     // markus 20030502
 298     UnicodeSet *set1=new UnicodeSet("\\p{Lowercase Letter}", status); //  :Ll: Letter, lowercase
 299     UnicodeSet *set1a=new UnicodeSet("[:Ll:]", status); //  Letter, lowercase
 300     if (U_FAILURE(status)){
 301         errln((UnicodeString)"FAIL: Can't construst set with category->Ll");
 302         return;
 303     }
 304     UnicodeSet *set2=new UnicodeSet("\\p{Decimal Number}", status);   //Number, Decimal digit
 305     UnicodeSet *set2a=new UnicodeSet("[:Nd:]", status);   //Number, Decimal digit
 306     if (U_FAILURE(status)){
 307         errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
 308         return;
 309     }
 310
 311     if (*set1 != *set1a) {
 312         errln("FAIL: category constructor for Ll broken");
 313     }
 314     if (*set2 != *set2a) {
 315         errln("FAIL: category constructor for Nd broken");
 316     }
 317     delete set1a;
 318     delete set2a;
 319
 320     logln("Testing copy construction");
 321     UnicodeSet *set1copy=new UnicodeSet(*set1);
 322     if(*set1 != *set1copy || *set1 == *set2 ||
 323         getPairs(*set1) != getPairs(*set1copy) ||
 324         set1->hashCode() != set1copy->hashCode()){
 325         errln("FAIL : Error in copy construction");
 326         return;
 327     }
 328
 329     logln("Testing =operator");
 330     UnicodeSet set1equal=*set1;
 331     UnicodeSet set2equal=*set2;
 332     if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
 333         set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
 334         errln("FAIL: Error in =operator");
 335     }
 336
 337     logln("Testing clone()");
 338     UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
 339     UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
 340     if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
 341         *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
 342         *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
 343         errln("FAIL: Error in clone");
 344     }
 345
 346     logln("Testing hashcode");
 347     if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
 348         set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
 349         set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
 350         set1->hashCode() == set2->hashCode()  || set1copy->hashCode() == set2->hashCode() ||
 351         set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
 352         errln("FAIL: Error in hashCode()");
 353     }
 354
 355     delete set1;
 356     delete set1copy;
 357     delete set2;
 358     delete set1clone;
 359     delete set2clone;
 360
 361
 362 }
 363 void
 364 UnicodeSetTest::TestAddRemove(void) {
 365     UnicodeSet set; // Construct empty set
 366     doAssert(set.isEmpty() == TRUE, "set should be empty");
 367     doAssert(set.size() == 0, "size should be 0");
 368     set.complement();
 369     doAssert(set.size() == 0x110000, "size should be 0x110000");
 370     set.clear();
 371     set.add(0x0061, 0x007a);
 372     expectPairs(set, "az");
 373     doAssert(set.isEmpty() == FALSE, "set should not be empty");
 374     doAssert(set.size() != 0, "size should not be equal to 0");
 375     doAssert(set.size() == 26, "size should be equal to 26");
 376     set.remove(0x006d, 0x0070);
 377     expectPairs(set, "alqz");
 378     doAssert(set.size() == 22, "size should be equal to 22");
 379     set.remove(0x0065, 0x0067);
 380     expectPairs(set, "adhlqz");
 381     doAssert(set.size() == 19, "size should be equal to 19");
 382     set.remove(0x0064, 0x0069);
 383     expectPairs(set, "acjlqz");
 384     doAssert(set.size() == 16, "size should be equal to 16");
 385     set.remove(0x0063, 0x0072);
 386     expectPairs(set, "absz");
 387     doAssert(set.size() == 10, "size should be equal to 10");
 388     set.add(0x0066, 0x0071);
 389     expectPairs(set, "abfqsz");
 390     doAssert(set.size() == 22, "size should be equal to 22");
 391     set.remove(0x0061, 0x0067);
 392     expectPairs(set, "hqsz");
 393     set.remove(0x0061, 0x007a);
 394     expectPairs(set, "");
 395     doAssert(set.isEmpty() == TRUE, "set should be empty");
 396     doAssert(set.size() == 0, "size should be 0");
 397     set.add(0x0061);
 398     doAssert(set.isEmpty() == FALSE, "set should not be empty");
 399     doAssert(set.size() == 1, "size should not be equal to 1");
 400     set.add(0x0062);
 401     set.add(0x0063);
 402     expectPairs(set, "ac");
 403     doAssert(set.size() == 3, "size should not be equal to 3");
 404     set.add(0x0070);
 405     set.add(0x0071);
 406     expectPairs(set, "acpq");
 407     doAssert(set.size() == 5, "size should not be equal to 5");
 408     set.clear();
 409     expectPairs(set, "");
 410     doAssert(set.isEmpty() == TRUE, "set should be empty");
 411     doAssert(set.size() == 0, "size should be 0");
 412
 413     // Try removing an entire set from another set
 414     expectPattern(set, "[c-x]", "cx");
 415     UnicodeSet set2;
 416     expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
 417     set.removeAll(set2);
 418     expectPairs(set, "deluxx");
 419
 420     // Try adding an entire set to another set
 421     expectPattern(set, "[jackiemclean]", "aacceein");
 422     expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
 423     set.addAll(set2);
 424     expectPairs(set, "aacehort");
 425     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
 426
 427     // Try retaining an set of elements contained in another set (intersection)
 428     UnicodeSet set3;
 429     expectPattern(set3, "[a-c]", "ac");
 430     doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
 431     set3.remove(0x0062);
 432     expectPairs(set3, "aacc");
 433     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
 434     set.retainAll(set3);
 435     expectPairs(set, "aacc");
 436     doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
 437     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
 438     set.clear();
 439     doAssert(set.size() != set3.size(), "set.size() != set3.size()");
 440
 441     // Test commutativity
 442     expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
 443     expectPattern(set2, "[jackiemclean]", "aacceein");
 444     set.addAll(set2);
 445     expectPairs(set, "aacehort");
 446     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
 447
 448
 449
 450
 451 }
 452
 453 /**
 454  * Make sure minimal representation is maintained.
 455  */
 456 void UnicodeSetTest::TestMinimalRep() {
 457     UErrorCode status = U_ZERO_ERROR;
 458     // This is pretty thoroughly tested by checkCanonicalRep()
 459     // run against the exhaustive operation results.  Use the code
 460     // here for debugging specific spot problems.
 461
 462     // 1 overlap against 2
 463     UnicodeSet set("[h-km-q]", status);
 464     if (U_FAILURE(status)) { errln("FAIL"); return; }
 465     UnicodeSet set2("[i-o]", status);
 466     if (U_FAILURE(status)) { errln("FAIL"); return; }
 467     set.addAll(set2);
 468     expectPairs(set, "hq");
 469     // right
 470     set.applyPattern("[a-m]", status);
 471     if (U_FAILURE(status)) { errln("FAIL"); return; }
 472     set2.applyPattern("[e-o]", status);
 473     if (U_FAILURE(status)) { errln("FAIL"); return; }
 474     set.addAll(set2);
 475     expectPairs(set, "ao");
 476     // left
 477     set.applyPattern("[e-o]", status);
 478     if (U_FAILURE(status)) { errln("FAIL"); return; }
 479     set2.applyPattern("[a-m]", status);
 480     if (U_FAILURE(status)) { errln("FAIL"); return; }
 481     set.addAll(set2);
 482     expectPairs(set, "ao");
 483     // 1 overlap against 3
 484     set.applyPattern("[a-eg-mo-w]", status);
 485     if (U_FAILURE(status)) { errln("FAIL"); return; }
 486     set2.applyPattern("[d-q]", status);
 487     if (U_FAILURE(status)) { errln("FAIL"); return; }
 488     set.addAll(set2);
 489     expectPairs(set, "aw");
 490 }
 491
 492 void UnicodeSetTest::TestAPI() {
 493     UErrorCode status = U_ZERO_ERROR;
 494     // default ct
 495     UnicodeSet set;
 496     if (!set.isEmpty() || set.getRangeCount() != 0) {
 497         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
 498               set);
 499     }
 500
 501     // clear(), isEmpty()
 502     set.add(0x0061);
 503     if (set.isEmpty()) {
 504         errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
 505               set);
 506     }
 507     set.clear();
 508     if (!set.isEmpty()) {
 509         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
 510               set);
 511     }
 512
 513     // size()
 514     set.clear();
 515     if (set.size() != 0) {
 516         errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
 517               ": " + set);
 518     }
 519     set.add(0x0061);
 520     if (set.size() != 1) {
 521         errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
 522               ": " + set);
 523     }
 524     set.add(0x0031, 0x0039);
 525     if (set.size() != 10) {
 526         errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
 527               ": " + set);
 528     }
 529
 530     // contains(first, last)
 531     set.clear();
 532     set.applyPattern("[A-Y 1-8 b-d l-y]", status);
 533     if (U_FAILURE(status)) { errln("FAIL"); return; }
 534     for (int32_t i = 0; i<set.getRangeCount(); ++i) {
 535         UChar32 a = set.getRangeStart(i);
 536         UChar32 b = set.getRangeEnd(i);
 537         if (!set.contains(a, b)) {
 538             errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
 539                   " but doesn't: " + set);
 540         }
 541         if (set.contains((UChar32)(a-1), b)) {
 542             errln((UnicodeString)"FAIL, shouldn't contain " +
 543                   (unsigned short)(a-1) + '-' + (unsigned short)b +
 544                   " but does: " + set);
 545         }
 546         if (set.contains(a, (UChar32)(b+1))) {
 547             errln((UnicodeString)"FAIL, shouldn't contain " +
 548                   (unsigned short)a + '-' + (unsigned short)(b+1) +
 549                   " but does: " + set);
 550         }
 551     }
 552
 553     // Ported InversionList test.
 554     UnicodeSet a((UChar32)3,(UChar32)10);
 555     UnicodeSet b((UChar32)7,(UChar32)15);
 556     UnicodeSet c;
 557
 558     logln((UnicodeString)"a [3-10]: " + a);
 559     logln((UnicodeString)"b [7-15]: " + b);
 560     c = a;
 561     c.addAll(b);
 562     UnicodeSet exp((UChar32)3,(UChar32)15);
 563     if (c == exp) {
 564         logln((UnicodeString)"c.set(a).add(b): " + c);
 565     } else {
 566         errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
 567     }
 568     c.complement();
 569     exp.set((UChar32)0, (UChar32)2);
 570     exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
 571     if (c == exp) {
 572         logln((UnicodeString)"c.complement(): " + c);
 573     } else {
 574         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
 575     }
 576     c.complement();
 577     exp.set((UChar32)3, (UChar32)15);
 578     if (c == exp) {
 579         logln((UnicodeString)"c.complement(): " + c);
 580     } else {
 581         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
 582     }
 583     c = a;
 584     c.complementAll(b);
 585     exp.set((UChar32)3,(UChar32)6);
 586     exp.add((UChar32)11,(UChar32) 15);
 587     if (c == exp) {
 588         logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
 589     } else {
 590         errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
 591     }
 592
 593     exp = c;
 594     bitsToSet(setToBits(c), c);
 595     if (c == exp) {
 596         logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
 597     } else {
 598         errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
 599     }
 600
 601     // Additional tests for coverage JB#2118
 602     //UnicodeSet::complement(class UnicodeString const &)
 603     //UnicodeSet::complementAll(class UnicodeString const &)
 604     //UnicodeSet::containsNone(class UnicodeSet const &)
 605     //UnicodeSet::containsNone(long,long)
 606     //UnicodeSet::containsSome(class UnicodeSet const &)
 607     //UnicodeSet::containsSome(long,long)
 608     //UnicodeSet::removeAll(class UnicodeString const &)
 609     //UnicodeSet::retain(long)
 610     //UnicodeSet::retainAll(class UnicodeString const &)
 611     //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
 612     //UnicodeSetIterator::getString(void)
 613     set.clear();
 614     set.complement("ab");
 615     exp.applyPattern("[{ab}]", status);
 616     if (U_FAILURE(status)) { errln("FAIL"); return; }
 617     if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
 618
 619     UnicodeSetIterator iset(set);
 620     if (!iset.next() || !iset.isString()) {
 621         errln("FAIL: UnicodeSetIterator::next/isString");
 622     } else if (iset.getString() != "ab") {
 623         errln("FAIL: UnicodeSetIterator::getString");
 624     }
 625
 626     set.add((UChar32)0x61, (UChar32)0x7A);
 627     set.complementAll("alan");
 628     exp.applyPattern("[{ab}b-kmo-z]", status);
 629     if (U_FAILURE(status)) { errln("FAIL"); return; }
 630     if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
 631
 632     exp.applyPattern("[a-z]", status);
 633     if (U_FAILURE(status)) { errln("FAIL"); return; }
 634     if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
 635     if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
 636     exp.applyPattern("[aln]", status);
 637     if (U_FAILURE(status)) { errln("FAIL"); return; }
 638     if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
 639     if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
 640
 641     if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
 642         errln("FAIL: containsNone(UChar32, UChar32)");
 643     }
 644     if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
 645         errln("FAIL: containsSome(UChar32, UChar32)");
 646     }
 647     if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
 648         errln("FAIL: containsNone(UChar32, UChar32)");
 649     }
 650     if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
 651         errln("FAIL: containsSome(UChar32, UChar32)");
 652     }
 653
 654     set.removeAll("liu");
 655     exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
 656     if (U_FAILURE(status)) { errln("FAIL"); return; }
 657     if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
 658
 659     set.retainAll("star");
 660     exp.applyPattern("[rst]", status);
 661     if (U_FAILURE(status)) { errln("FAIL"); return; }
 662     if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
 663
 664     set.retain((UChar32)0x73);
 665     exp.applyPattern("[s]", status);
 666     if (U_FAILURE(status)) { errln("FAIL"); return; }
 667     if (set != exp) { errln("FAIL: retain('s')"); return; }
 668
 669     uint16_t buf[32];
 670     int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status);
 671     if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
 672     if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
 673         errln("FAIL: serialize");
 674         return;
 675     }
 676 }
 677
 678 void UnicodeSetTest::TestIteration() {
 679     UErrorCode ec = U_ZERO_ERROR;
 680     int i = 0;
 681     int outerLoop;
 682
 683     // 6 code points, 3 ranges, 2 strings, 8 total elements
 684     //   Iteration will access them in sorted order -  a, b, c, y, z, U0001abcd, "str1", "str2"
 685     UnicodeSet set("[zabyc\\U0001abcd{str1}{str2}]", ec);
 686     TEST_ASSERT_SUCCESS(ec);
 687     UnicodeSetIterator it(set);
 688
 689     for (outerLoop=0; outerLoop<3; outerLoop++) {
 690         // Run the test multiple times, to check that iterator.reset() is working.
 691         for (i=0; i<10; i++) {
 692             UBool         nextv        = it.next();
 693             UBool         isString     = it.isString();
 694             int32_t       codePoint    = it.getCodepoint();
 695             //int32_t       codePointEnd = it.getCodepointEnd();
 696             UnicodeString s   = it.getString();
 697             switch (i) {
 698             case 0:
 699                 TEST_ASSERT(nextv == TRUE);
 700                 TEST_ASSERT(isString == FALSE);
 701                 TEST_ASSERT(codePoint==0x61);
 702                 TEST_ASSERT(s == "a");
 703                 break;
 704             case 1:
 705                 TEST_ASSERT(nextv == TRUE);
 706                 TEST_ASSERT(isString == FALSE);
 707                 TEST_ASSERT(codePoint==0x62);
 708                 TEST_ASSERT(s == "b");
 709                 break;
 710             case 2:
 711                 TEST_ASSERT(nextv == TRUE);
 712                 TEST_ASSERT(isString == FALSE);
 713                 TEST_ASSERT(codePoint==0x63);
 714                 TEST_ASSERT(s == "c");
 715                 break;
 716             case 3:
 717                 TEST_ASSERT(nextv == TRUE);
 718                 TEST_ASSERT(isString == FALSE);
 719                 TEST_ASSERT(codePoint==0x79);
 720                 TEST_ASSERT(s == "y");
 721                 break;
 722             case 4:
 723                 TEST_ASSERT(nextv == TRUE);
 724                 TEST_ASSERT(isString == FALSE);
 725                 TEST_ASSERT(codePoint==0x7a);
 726                 TEST_ASSERT(s == "z");
 727                 break;
 728             case 5:
 729                 TEST_ASSERT(nextv == TRUE);
 730                 TEST_ASSERT(isString == FALSE);
 731                 TEST_ASSERT(codePoint==0x1abcd);
 732                 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
 733                 break;
 734             case 6:
 735                 TEST_ASSERT(nextv == TRUE);
 736                 TEST_ASSERT(isString == TRUE);
 737                 TEST_ASSERT(s == "str1");
 738                 break;
 739             case 7:
 740                 TEST_ASSERT(nextv == TRUE);
 741                 TEST_ASSERT(isString == TRUE);
 742                 TEST_ASSERT(s == "str2");
 743                 break;
 744             case 8:
 745                 TEST_ASSERT(nextv == FALSE);
 746                 break;
 747             case 9:
 748                 TEST_ASSERT(nextv == FALSE);
 749                 break;
 750             }
 751         }
 752         it.reset();  // prepare to run the iteration again.
 753     }
 754 }
 755
 756
 757
 758
 759 void UnicodeSetTest::TestStrings() {
 760     UErrorCode ec = U_ZERO_ERROR;
 761
 762     UnicodeSet* testList[] = {
 763         UnicodeSet::createFromAll("abc"),
 764         new UnicodeSet("[a-c]", ec),
 765
 766         &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
 767         new UnicodeSet("[{ll}{ch}a-z]", ec),
 768
 769         UnicodeSet::createFrom("ab}c"),
 770         new UnicodeSet("[{ab\\}c}]", ec),
 771
 772         &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
 773         new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
 774
 775         NULL
 776     };
 777
 778     if (U_FAILURE(ec)) {
 779         errln("FAIL: couldn't construct test sets");
 780     }
 781
 782     for (int32_t i = 0; testList[i] != NULL; i+=2) {
 783         if (U_SUCCESS(ec)) {
 784             UnicodeString pat0, pat1;
 785             testList[i]->toPattern(pat0, TRUE);
 786             testList[i+1]->toPattern(pat1, TRUE);
 787             if (*testList[i] == *testList[i+1]) {
 788                 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
 789             } else {
 790                 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
 791             }
 792         }
 793         delete testList[i];
 794         delete testList[i+1];
 795     }
 796 }
 797
 798 /**
 799  * Test the [:Latin:] syntax.
 800  */
 801 void UnicodeSetTest::TestScriptSet() {
 802     expectContainment("[:Latin:]", "aA", CharsToUnicodeString("\\u0391\\u03B1"));
 803
 804     expectContainment("[:Greek:]", CharsToUnicodeString("\\u0391\\u03B1"), "aA");
 805
 806     /* Jitterbug 1423 */
 807     expectContainment("[[:Common:][:Inherited:]]", CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
 808
 809 }
 810
/**
 * Test property-based patterns ([:prop:], \p{...}, \P{...}, \N{...})
 * together with a number of pattern-syntax edge cases (literal '-',
 * isolated surrogates, lead/trail canonical combining class).
 *
 * DATA is a flat table of triples: the pattern, the characters that must
 * be contained in the resulting set, and the characters that must not be.
 * The two character lists are run through CharsToUnicodeString() before
 * being handed to expectContainment(); the pattern is passed as is.
 */
void UnicodeSetTest::TestPropertySet() {
    static const char* DATA[] = {
        // Pattern, Chars IN, Chars NOT in

        "[:Latin:]",
        "aA",
        "\\u0391\\u03B1",

        "[\\p{Greek}]",
        "\\u0391\\u03B1",
        "aA",

        "\\P{ GENERAL Category = upper case letter }",
        "abc",
        "ABC",

        // Combining class: @since ICU 2.2
        // Check both symbolic and numeric
        "\\p{ccc=Nukta}",
        "\\u0ABC",
        "abc",

        "\\p{Canonical Combining Class = 11}",
        "\\u05B1",
        "\\u05B2",

        "[:c c c = iota subscript :]",
        "\\u0345",
        "xyz",

        // Bidi class: @since ICU 2.2
        "\\p{bidiclass=lefttoright}",
        "abc",
        "\\u0671\\u0672",

        // Binary properties: @since ICU 2.2
        "\\p{ideographic}",
        "\\u4E0A",
        "x",

        "[:math=false:]",
        "q)*(",
        // weiv: )(and * were removed from math in Unicode 4.0.1
        //"(*+)",
        "+<>^",

        // JB#1767 \N{}, \p{ASCII}
        "[:Ascii:]",
        "abc\\u0000\\u007F",
        "\\u0080\\u4E00",

        "[\\N{ latin small letter  a  }[:name= latin small letter z:]]",
        "az",
        "qrs",

        // JB#2015
        "[:any:]",
        "a\\U0010FFFF",
        "",

        "[:nv=0.5:]",
        "\\u00BD\\u0F2A",
        "\\u00BC",

        // JB#2653: Age
        "[:Age=1.1:]",
        "\\u03D6", // 1.1
        "\\u03D8\\u03D9", // 3.2

        "[:Age=3.1:]",
        "\\u1800\\u3400\\U0002f800",
        "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",

        // JB#2350: Case_Sensitive
        "[:Case Sensitive:]",
        "A\\u1FFC\\U00010410",
        ";\\u00B4\\U00010500",

        // JB#2832: C99-compatibility props
        "[:blank:]",
        " \\u0009",
        "1-9A-Z",

        "[:graph:]",
        "19AZ",
        " \\u0003\\u0007\\u0009\\u000A\\u000D",

        "[:punct:]",
        "!@#%&*()[]{}-_\\/;:,.?'\"",
        "09azAZ",

        "[:xdigit:]",
        "09afAF",
        "gG!",

        // Regex compatibility test
        "[-b]", // leading '-' is literal
        "-b",
        "ac",

        "[^-b]", // leading '-' is literal
        "ac",
        "-b",

        "[b-]", // trailing '-' is literal
        "-b",
        "ac",

        "[^b-]", // trailing '-' is literal
        "ac",
        "-b",

        "[a-b-]", // trailing '-' is literal
        "ab-",
        "c=",

        "[[a-q]&[p-z]-]", // trailing '-' is literal
        "pq-",
        "or=",

        "[\\s|\\)|:|$|\\>]", // from regex tests
        "s|):$>",
        "abc",

        "[\\uDC00cd]", // JB#2906: isolated trail at start
        "cd\\uDC00",
        "ab\\uD800\\U00010000",

        "[ab\\uD800]", // JB#2906: isolated lead at end
        "ab\\uD800",
        "cd\\uDC00\\U00010000",

        "[ab\\uD800cd]", // JB#2906: isolated lead in middle
        "abcd\\uD800",
        "ef\\uDC00\\U00010000",

        "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
        "abcd\\uDC00",
        "ef\\uD800\\U00010000",

        "[:^lccc=0:]", // Lead canonical class
        "\\u0300\\u0301",
        "abcd\\u00c0\\u00c5",

        "[:^tccc=0:]", // Trail canonical class
        "\\u0300\\u0301\\u00c0\\u00c5",
        "abcd",

        "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
        "\\u0300\\u0301\\u00c0\\u00c5",
        "abcd",

        "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
        "",
        "abcd\\u0300\\u0301\\u00c0\\u00c5",

        "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
        "\\u0F73\\u0F75\\u0F81",
        "abcd\\u0300\\u0301\\u00c0\\u00c5",

        "[:Assigned:]",
        "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
        "\\u0888\\uFDD3\\uFFFE\\U00050005"
    };

    static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);

    // Walk the table one triple at a time; DATA_LEN is expected to be a
    // multiple of 3 because DATA[i+1] and DATA[i+2] are read unchecked.
    for (int32_t i=0; i<DATA_LEN; i+=3) {
        expectContainment(DATA[i], CharsToUnicodeString(DATA[i+1]),
                          CharsToUnicodeString(DATA[i+2]));
    }
}
 986
 987 /**
 988   * Test that Posix style character classes [:digit:], etc.
 989   *   have the Unicode definitions from TR 18.
 990   */
 991 void UnicodeSetTest::TestPosixClasses() {
 992     {
 993         UErrorCode status = U_ZERO_ERROR;
 994         UnicodeSet s1("[:alpha:]", status);
 995         UnicodeSet s2("\\p{Alphabetic}", status);
 996         TEST_ASSERT_SUCCESS(status);
 997         TEST_ASSERT(s1==s2);
 998     }
 999     {
1000         UErrorCode status = U_ZERO_ERROR;
1001         UnicodeSet s1("[:lower:]", status);
1002         UnicodeSet s2("\\p{lowercase}", status);
1003         TEST_ASSERT_SUCCESS(status);
1004         TEST_ASSERT(s1==s2);
1005     }
1006     {
1007         UErrorCode status = U_ZERO_ERROR;
1008         UnicodeSet s1("[:upper:]", status);
1009         UnicodeSet s2("\\p{Uppercase}", status);
1010         TEST_ASSERT_SUCCESS(status);
1011         TEST_ASSERT(s1==s2);
1012     }
1013     {
1014         UErrorCode status = U_ZERO_ERROR;
1015         UnicodeSet s1("[:punct:]", status);
1016         UnicodeSet s2("\\p{gc=Punctuation}", status);
1017         TEST_ASSERT_SUCCESS(status);
1018         TEST_ASSERT(s1==s2);
1019     }
1020     {
1021         UErrorCode status = U_ZERO_ERROR;
1022         UnicodeSet s1("[:digit:]", status);
1023         UnicodeSet s2("\\p{gc=DecimalNumber}", status);
1024         TEST_ASSERT_SUCCESS(status);
1025         TEST_ASSERT(s1==s2);
1026     }
1027     {
1028         UErrorCode status = U_ZERO_ERROR;
1029         UnicodeSet s1("[:xdigit:]", status);
1030         UnicodeSet s2("[\\p{DecimalNumber}\\p{HexDigit}]", status);
1031         TEST_ASSERT_SUCCESS(status);
1032         TEST_ASSERT(s1==s2);
1033     }
1034     {
1035         UErrorCode status = U_ZERO_ERROR;
1036         UnicodeSet s1("[:alnum:]", status);
1037         UnicodeSet s2("[\\p{Alphabetic}\\p{DecimalNumber}]", status);
1038         TEST_ASSERT_SUCCESS(status);
1039         TEST_ASSERT(s1==s2);
1040     }
1041     {
1042         UErrorCode status = U_ZERO_ERROR;
1043         UnicodeSet s1("[:space:]", status);
1044         UnicodeSet s2("\\p{Whitespace}", status);
1045         TEST_ASSERT_SUCCESS(status);
1046         TEST_ASSERT(s1==s2);
1047     }
1048     {
1049         UErrorCode status = U_ZERO_ERROR;
1050         UnicodeSet s1("[:blank:]", status);
1051         TEST_ASSERT_SUCCESS(status);
1052         UnicodeSet s2("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]",
1053             status);
1054         TEST_ASSERT_SUCCESS(status);
1055         TEST_ASSERT(s1==s2);
1056     }
1057     {
1058         UErrorCode status = U_ZERO_ERROR;
1059         UnicodeSet s1("[:cntrl:]", status);
1060         TEST_ASSERT_SUCCESS(status);
1061         UnicodeSet s2("\\p{Control}", status);
1062         TEST_ASSERT_SUCCESS(status);
1063         TEST_ASSERT(s1==s2);
1064     }
1065     {
1066         UErrorCode status = U_ZERO_ERROR;
1067         UnicodeSet s1("[:graph:]", status);
1068         TEST_ASSERT_SUCCESS(status);
1069         UnicodeSet s2("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]", status);
1070         TEST_ASSERT_SUCCESS(status);
1071         TEST_ASSERT(s1==s2);
1072     }
1073     {
1074         UErrorCode status = U_ZERO_ERROR;
1075         UnicodeSet s1("[:print:]", status);
1076         TEST_ASSERT_SUCCESS(status);
1077         UnicodeSet s2("[[:graph:][:blank:]-[\\p{Control}]]" ,status);
1078         TEST_ASSERT_SUCCESS(status);
1079         TEST_ASSERT(s1==s2);
1080     }
1081 }
1082 /**
1083  * Test cloning of UnicodeSet.  For C++, we test the copy constructor.
1084  */
1085 void UnicodeSetTest::TestClone() {
1086     UErrorCode ec = U_ZERO_ERROR;
1087     UnicodeSet s("[abcxyz]", ec);
1088     UnicodeSet t(s);
1089     expectContainment(t, "abc", "def");
1090 }
1091
1092 /**
1093  * Test the indexOf() and charAt() methods.
1094  */
1095 void UnicodeSetTest::TestIndexOf() {
1096     UErrorCode ec = U_ZERO_ERROR;
1097     UnicodeSet set("[a-cx-y3578]", ec);
1098     if (U_FAILURE(ec)) {
1099         errln("FAIL: UnicodeSet constructor");
1100         return;
1101     }
1102     for (int32_t i=0; i<set.size(); ++i) {
1103         UChar32 c = set.charAt(i);
1104         if (set.indexOf(c) != i) {
1105             errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1106                 i, c, set.indexOf(c));
1107         }
1108     }
1109     UChar32 c = set.charAt(set.size());
1110     if (c != -1) {
1111         errln("FAIL: charAt(<out of range>) = %X", c);
1112     }
1113     int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
1114     if (j != -1) {
1115         errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1116     }
1117 }
1118
1119 /**
1120  * Test closure API.
1121  */
1122 void UnicodeSetTest::TestCloseOver() {
1123     UErrorCode ec = U_ZERO_ERROR;
1124
1125     char CASE[] = {(char)USET_CASE_INSENSITIVE};
1126     char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1127     const char* DATA[] = {
1128         // selector, input, output
1129         CASE,
1130         "[aq\\u00DF{Bc}{bC}{Fi}]",
1131         "[aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]",
1132
1133         CASE,
1134         "[\\u01F1]", // 'DZ'
1135         "[\\u01F1\\u01F2\\u01F3]",
1136
1137         CASE,
1138         "[\\u1FB4]",
1139         "[\\u1FB4{\\u03AC\\u03B9}]",
1140
1141         CASE,
1142         "[{F\\uFB01}]",
1143         "[\\uFB03{ffi}]",
1144
1145         CASE, // make sure binary search finds limits
1146         "[a\\uFF3A]",
1147         "[aA\\uFF3A\\uFF5A]",
1148
1149         CASE,
1150         "[a-z]","[A-Za-z\\u017F\\u212A]",
1151         CASE,
1152         "[abc]","[A-Ca-c]",
1153         CASE,
1154         "[ABC]","[A-Ca-c]",
1155
1156         CASE, "[i]", "[iI]",
1157
1158         CASE, "[\\u0130]",          "[\\u0130{i\\u0307}]", // dotted I
1159         CASE, "[{i\\u0307}]",       "[\\u0130{i\\u0307}]", // i with dot
1160
1161         CASE, "[\\u0131]",          "[\\u0131]", // dotless i
1162
1163         CASE, "[\\u0390]",          "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1164
1165         CASE, "[\\u03c2]",          "[\\u03a3\\u03c2\\u03c3]", // sigmas
1166
1167         CASE, "[\\u03f2]",          "[\\u03f2\\u03f9]", // lunate sigmas
1168
1169         CASE, "[\\u03f7]",          "[\\u03f7\\u03f8]",
1170
1171         CASE, "[\\u1fe3]",          "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1172
1173         CASE, "[\\ufb05]",          "[\\ufb05\\ufb06{st}]",
1174         CASE, "[{st}]",             "[\\ufb05\\ufb06{st}]",
1175
1176         CASE, "[\\U0001044F]",      "[\\U00010427\\U0001044F]",
1177
1178         CASE, "[{a\\u02BE}]",       "[\\u1E9A{a\\u02BE}]", // first in sorted table
1179
1180         CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1181
1182         CASE_MAPPINGS,
1183         "[aq\\u00DF{Bc}{bC}{Fi}]",
1184         "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1185
1186         CASE_MAPPINGS,
1187         "[\\u01F1]", // 'DZ'
1188         "[\\u01F1\\u01F2\\u01F3]",
1189
1190         CASE_MAPPINGS,
1191         "[a-z]",
1192         "[A-Za-z]",
1193
1194         NULL
1195     };
1196
1197     UnicodeSet s;
1198     UnicodeSet t;
1199     UnicodeString buf;
1200     for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1201         int32_t selector = DATA[i][0];
1202         UnicodeString pat(DATA[i+1]);
1203         UnicodeString exp(DATA[i+2]);
1204         s.applyPattern(pat, ec);
1205         s.closeOver(selector);
1206         t.applyPattern(exp, ec);
1207         if (U_FAILURE(ec)) {
1208             errln("FAIL: applyPattern failed");
1209             continue;
1210         }
1211         if (s == t) {
1212             logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1213         } else {
1214             errln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1215                   s.toPattern(buf, TRUE) + ", expected " + exp);
1216         }
1217     }
1218
1219 #if 0
1220     /*
1221      * Unused test code.
1222      * This was used to compare the old implementation (using USET_CASE)
1223      * with the new one (using 0x100 temporarily)
1224      * while transitioning from hardcoded case closure tables in uniset.cpp
1225      * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1226      * and using ucase.c functions for closure.
1227      * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1228      *
1229      * Note: The old and new implementation never fully matched because
1230      * the old implementation turned out to not map U+0130 and U+0131 correctly
1231      * (dotted I and dotless i) and because the old implementation's data tables
1232      * were outdated compared to Unicode 4.0.1 at the time of the change to the
1233      * new implementation. (So sigmas and some other characters were not handled
1234      * according to the newer Unicode version.)
1235      */
1236     UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1237     UnicodeSetIterator si(sens);
1238     UnicodeString str, buf2;
1239     const UnicodeString *pStr;
1240     UChar32 c;
1241     while(si.next()) {
1242         if(!si.isString()) {
1243             c=si.getCodepoint();
1244             s.clear();
1245             s.add(c);
1246
1247             str.setTo(c);
1248             str.foldCase();
1249             sens2.add(str);
1250
1251             t=s;
1252             s.closeOver(USET_CASE);
1253             t.closeOver(0x100);
1254             if(s!=t) {
1255                 errln("FAIL: closeOver(U+%04x) differs: ", c);
1256                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1257             }
1258         }
1259     }
1260     // remove all code points
1261     // should contain all full case folding mapping strings
1262     sens2.remove(0, 0x10ffff);
1263     si.reset(sens2);
1264     while(si.next()) {
1265         if(si.isString()) {
1266             pStr=&si.getString();
1267             s.clear();
1268             s.add(*pStr);
1269             t=s2=s;
1270             s.closeOver(USET_CASE);
1271             t.closeOver(0x100);
1272             if(s!=t) {
1273                 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1274                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1275             }
1276         }
1277     }
1278 #endif
1279
1280     // Test the pattern API
1281     s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1282     if (U_FAILURE(ec)) {
1283         errln("FAIL: applyPattern failed");
1284     } else {
1285         expectContainment(s, "abcABC", "defDEF");
1286     }
1287     UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1288     if (U_FAILURE(ec)) {
1289         errln("FAIL: constructor failed");
1290     } else {
1291         expectContainment(v, "defDEF", "abcABC");
1292     }
1293     UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1294     if (U_FAILURE(ec)) {
1295         errln("FAIL: construct w/case mappings failed");
1296     } else {
1297         expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1298     }
1299 }
1300
1301 void UnicodeSetTest::TestEscapePattern() {
1302     const char pattern[] =
1303         "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1304     const char exp[] =
1305         "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1306     // We test this with two passes; in the second pass we
1307     // pre-unescape the pattern.  Since U+200E is rule whitespace,
1308     // this fails -- which is what we expect.
1309     for (int32_t pass=1; pass<=2; ++pass) {
1310         UErrorCode ec = U_ZERO_ERROR;
1311         UnicodeString pat(pattern);
1312         if (pass==2) {
1313             pat = pat.unescape();
1314         }
1315         // Pattern is only good for pass 1
1316         UBool isPatternValid = (pass==1);
1317
1318         UnicodeSet set(pat, ec);
1319         if (U_SUCCESS(ec) != isPatternValid){
1320             errln((UnicodeString)"FAIL: applyPattern(" +
1321                   escape(pat) + ") => " +
1322                   u_errorName(ec));
1323             continue;
1324         }
1325         if (U_FAILURE(ec)) {
1326             continue;
1327         }
1328         if (set.contains((UChar)0x0644)){
1329             errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1330         }
1331
1332         UnicodeString newpat;
1333         set.toPattern(newpat, TRUE);
1334         if (newpat == exp) {
1335             logln(escape(pat) + " => " + newpat);
1336         } else {
1337             errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1338         }
1339
1340         for (int32_t i=0; i<set.getRangeCount(); ++i) {
1341             UnicodeString str("Range ");
1342             str.append((UChar)(0x30 + i))
1343                 .append(": ")
1344                 .append((UChar32)set.getRangeStart(i))
1345                 .append(" - ")
1346                 .append((UChar32)set.getRangeEnd(i));
1347             str = str + " (" + set.getRangeStart(i) + " - " +
1348                 set.getRangeEnd(i) + ")";
1349             if (set.getRangeStart(i) < 0) {
1350                 errln((UnicodeString)"FAIL: " + escape(str));
1351             } else {
1352                 logln(escape(str));
1353             }
1354         }
1355     }
1356 }
1357
1358 void UnicodeSetTest::expectRange(const UnicodeString& label,
1359                                  const UnicodeSet& set,
1360                                  UChar32 start, UChar32 end) {
1361     UnicodeSet exp(start, end);
1362     UnicodeString pat;
1363     if (set == exp) {
1364         logln(label + " => " + set.toPattern(pat, TRUE));
1365     } else {
1366         UnicodeString xpat;
1367         errln((UnicodeString)"FAIL: " + label + " => " +
1368               set.toPattern(pat, TRUE) +
1369               ", expected " + exp.toPattern(xpat, TRUE));
1370     }
1371 }
1372
/**
 * Test how UnicodeSet API behaves when given code points outside
 * 0..0x10FFFF.
 *
 * The first table (DATA) holds quadruples: a requested range and the
 * range the set is expected to end up with.  The same requested range is
 * pushed through the constructor, set(), add(), retain(), remove() and
 * complement(), and the outcome is checked with expectRange().
 *
 * The second table (DATA2) holds single code points, valid and invalid,
 * that are checked against the full set with contains() and indexOf().
 */
void UnicodeSetTest::TestInvalidCodePoint() {

    const UChar32 DATA[] = {
        // Test range             Expected range
        0, 0x10FFFF,              0, 0x10FFFF,
        (UChar32)-1, 8,           0, 8,
        8, 0x110000,              8, 0x10FFFF
    };
    const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]);

    UnicodeString pat;
    int32_t i;

    for (i=0; i<DATA_LENGTH; i+=4) {
        UChar32 start  = DATA[i];
        UChar32 end    = DATA[i+1];
        UChar32 xstart = DATA[i+2];
        UChar32 xend   = DATA[i+3];

        // Try various API using the test code points

        // Range constructor
        UnicodeSet set(start, end);
        expectRange((UnicodeString)"ct(" + start + "," + end + ")",
                    set, xstart, xend);

        // set() on an emptied set
        set.clear();
        set.set(start, end);
        expectRange((UnicodeString)"set(" + start + "," + end + ")",
                    set, xstart, xend);

        // The results of these queries are never examined; the calls are
        // made only to exercise the API with the test code points.
        UBool b = set.contains(start);
        b = set.contains(start, end);
        b = set.containsNone(start, end);
        b = set.containsSome(start, end);

        /*int32_t index = set.indexOf(start);*/

        // add() of a single code point and of the range
        set.clear();
        set.add(start);
        set.add(start, end);
        expectRange((UnicodeString)"add(" + start + "," + end + ")",
                    set, xstart, xend);

        // retain() applied to the full set
        set.set(0, 0x10FFFF);
        set.retain(start, end);
        expectRange((UnicodeString)"retain(" + start + "," + end + ")",
                    set, xstart, xend);
        set.retain(start);

        // remove() from the full set, then complement(): what was removed
        // is expected to be exactly the expected range
        set.set(0, 0x10FFFF);
        set.remove(start);
        set.remove(start, end);
        set.complement();
        expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
                    set, xstart, xend);

        // complement(range) on the full set, then complement() of the
        // whole set
        set.set(0, 0x10FFFF);
        set.complement(start, end);
        set.complement();
        expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
                    set, xstart, xend);
        set.complement(start);
    }

    const UChar32 DATA2[] = {
        0,
        0x10FFFF,
        (UChar32)-1,
        0x110000
    };
    const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]);

    for (i=0; i<DATA2_LENGTH; ++i) {
        UChar32 c = DATA2[i], end = 0x10FFFF;
        // A code point counts as valid when it lies in 0..0x10FFFF.
        UBool valid = (c >= 0 && c <= 0x10FFFF);

        UnicodeSet set(0, 0x10FFFF);

        // For single-codepoint contains, invalid codepoints are NOT contained
        UBool b = set.contains(c);
        if (b == valid) {
            logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
                  ") = " + b);
        } else {
            errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
                  ") = " + b);
        }

        // For codepoint range contains, containsNone, and containsSome,
        // invalid or empty (start > end) ranges have UNDEFINED behavior.
        // The results are therefore only logged, never checked.
        b = set.contains(c, end);
        logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
              "," + end + ") = " + b);

        b = set.containsNone(c, end);
        logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
              "," + end + ") = " + b);

        b = set.containsSome(c, end);
        logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
              "," + end + ") = " + b);

        // indexOf() must return a non-negative index exactly for the
        // valid code points.
        int32_t index = set.indexOf(c);
        if ((index >= 0) == valid) {
            logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
                  ") = " + index);
        } else {
            errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
                  ") = " + index);
        }
    }
}
1485
1486 // Used by TestSymbolTable
1487 class TokenSymbolTable : public SymbolTable {
1488 public:
1489     Hashtable contents;
1490
1491     TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1492         contents.setValueDeleter(uhash_deleteUnicodeString);
1493     }
1494
1495     ~TokenSymbolTable() {}
1496
1497     /**
1498      * (Non-SymbolTable API) Add the given variable and value to
1499      * the table.  Variable should NOT contain leading '$'.
1500      */
1501     void add(const UnicodeString& var, const UnicodeString& value,
1502              UErrorCode& ec) {
1503         if (U_SUCCESS(ec)) {
1504             contents.put(var, new UnicodeString(value), ec);
1505         }
1506     }
1507
1508     /**
1509      * SymbolTable API
1510      */
1511     virtual const UnicodeString* lookup(const UnicodeString& s) const {
1512         return (const UnicodeString*) contents.get(s);
1513     }
1514
1515     /**
1516      * SymbolTable API
1517      */
1518     virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1519         return NULL;
1520     }
1521
1522     /**
1523      * SymbolTable API
1524      */
1525     virtual UnicodeString parseReference(const UnicodeString& text,
1526                                          ParsePosition& pos, int32_t limit) const {
1527         int32_t start = pos.getIndex();
1528         int32_t i = start;
1529         UnicodeString result;
1530         while (i < limit) {
1531             UChar c = text.charAt(i);
1532             if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1533                 break;
1534             }
1535             ++i;
1536         }
1537         if (i == start) { // No valid name chars
1538             return result; // Indicate failure with empty string
1539         }
1540         pos.setIndex(i);
1541         text.extractBetween(start, i, result);
1542         return result;
1543     }
1544 };
1545
1546 void UnicodeSetTest::TestSymbolTable() {
1547     // Multiple test cases can be set up here.  Each test case
1548     // is terminated by null:
1549     // var, value, var, value,..., input pat., exp. output pat., null
1550     const char* DATA[] = {
1551         "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1552         "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1553         "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1554         NULL
1555     };
1556
1557     for (int32_t i=0; DATA[i]!=NULL; ++i) {
1558         UErrorCode ec = U_ZERO_ERROR;
1559         TokenSymbolTable sym(ec);
1560         if (U_FAILURE(ec)) {
1561             errln("FAIL: couldn't construct TokenSymbolTable");
1562             continue;
1563         }
1564
1565         // Set up variables
1566         while (DATA[i+2] != NULL) {
1567             sym.add(DATA[i], DATA[i+1], ec);
1568             if (U_FAILURE(ec)) {
1569                 errln("FAIL: couldn't add to TokenSymbolTable");
1570                 continue;
1571             }
1572             i += 2;
1573         }
1574
1575         // Input pattern and expected output pattern
1576         UnicodeString inpat = DATA[i], exppat = DATA[i+1];
1577         i += 2;
1578
1579         ParsePosition pos(0);
1580         UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1581         if (U_FAILURE(ec)) {
1582             errln("FAIL: couldn't construct UnicodeSet");
1583             continue;
1584         }
1585
1586         // results
1587         if (pos.getIndex() != inpat.length()) {
1588             errln((UnicodeString)"Failed to read to end of string \""
1589                   + inpat + "\": read to "
1590                   + pos.getIndex() + ", length is "
1591                   + inpat.length());
1592         }
1593
1594         UnicodeSet us2(exppat, ec);
1595         if (U_FAILURE(ec)) {
1596             errln("FAIL: couldn't construct expected UnicodeSet");
1597             continue;
1598         }
1599
1600         UnicodeString a, b;
1601         if (us != us2) {
1602             errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1603                   ", expected " + us2.toPattern(b, TRUE));
1604         } else {
1605             logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1606         }
1607     }
1608 }
1609
1610 void UnicodeSetTest::TestSurrogate() {
1611     const char* DATA[] = {
1612         // These should all behave identically
1613         "[abc\\uD800\\uDC00]",
1614         // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1615         "[abc\\U00010000]",
1616         0
1617     };
1618     for (int i=0; DATA[i] != 0; ++i) {
1619         UErrorCode ec = U_ZERO_ERROR;
1620         logln((UnicodeString)"Test pattern " + i + " :" + DATA[i]);
1621         UnicodeSet set(DATA[i], ec);
1622         if (U_FAILURE(ec)) {
1623             errln("FAIL: UnicodeSet constructor");
1624             continue;
1625         }
1626         expectContainment(set,
1627                           CharsToUnicodeString("abc\\U00010000"),
1628                           CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1629         if (set.size() != 4) {
1630             errln((UnicodeString)"FAIL: " + DATA[i] + ".size() == " +
1631                   set.size() + ", expected 4");
1632         }
1633     }
1634 }
1635
1636 void UnicodeSetTest::TestExhaustive() {
1637     // exhaustive tests. Simulate UnicodeSets with integers.
1638     // That gives us very solid tests (except for large memory tests).
1639
1640     int32_t limit = 128;
1641
1642     UnicodeSet x, y, z, aa;
1643
1644     for (int32_t i = 0; i < limit; ++i) {
1645         bitsToSet(i, x);
1646         logln((UnicodeString)"Testing " + i + ", " + x);
1647         _testComplement(i, x, y);
1648
1649         // AS LONG AS WE ARE HERE, check roundtrip
1650         checkRoundTrip(bitsToSet(i, aa));
1651
1652         for (int32_t j = 0; j < limit; ++j) {
1653             _testAdd(i,j,  x,y,z);
1654             _testXor(i,j,  x,y,z);
1655             _testRetain(i,j,  x,y,z);
1656             _testRemove(i,j,  x,y,z);
1657         }
1658     }
1659 }
1660
1661 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1662     bitsToSet(a, x);
1663     z = x;
1664     z.complement();
1665     int32_t c = setToBits(z);
1666     if (c != (~a)) {
1667         errln((UnicodeString)"FAILED: add: ~" + x +  " != " + z);
1668         errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1669     }
1670     checkCanonicalRep(z, (UnicodeString)"complement " + a);
1671 }
1672
1673 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1674     bitsToSet(a, x);
1675     bitsToSet(b, y);
1676     z = x;
1677     z.addAll(y);
1678     int32_t c = setToBits(z);
1679     if (c != (a | b)) {
1680         errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1681         errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1682     }
1683     checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1684 }
1685
1686 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1687     bitsToSet(a, x);
1688     bitsToSet(b, y);
1689     z = x;
1690     z.retainAll(y);
1691     int32_t c = setToBits(z);
1692     if (c != (a & b)) {
1693         errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1694         errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1695     }
1696     checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1697 }
1698
1699 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1700     bitsToSet(a, x);
1701     bitsToSet(b, y);
1702     z = x;
1703     z.removeAll(y);
1704     int32_t c = setToBits(z);
1705     if (c != (a &~ b)) {
1706         errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1707         errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1708     }
1709     checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1710 }
1711
1712 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1713     bitsToSet(a, x);
1714     bitsToSet(b, y);
1715     z = x;
1716     z.complementAll(y);
1717     int32_t c = setToBits(z);
1718     if (c != (a ^ b)) {
1719         errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1720         errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1721     }
1722     checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1723 }
1724
1725 /**
1726  * Check that ranges are monotonically increasing and non-
1727  * overlapping.
1728  */
1729 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1730     int32_t n = set.getRangeCount();
1731     if (n < 0) {
1732         errln((UnicodeString)"FAIL result of " + msg +
1733               ": range count should be >= 0 but is " +
1734               n /*+ " for " + set.toPattern())*/);
1735         return;
1736     }
1737     UChar32 last = 0;
1738     for (int32_t i=0; i<n; ++i) {
1739         UChar32 start = set.getRangeStart(i);
1740         UChar32 end = set.getRangeEnd(i);
1741         if (start > end) {
1742             errln((UnicodeString)"FAIL result of " + msg +
1743                   ": range " + (i+1) +
1744                   " start > end: " + (int)start + ", " + (int)end +
1745                   " for " + set);
1746         }
1747         if (i > 0 && start <= last) {
1748             errln((UnicodeString)"FAIL result of " + msg +
1749                   ": range " + (i+1) +
1750                   " overlaps previous range: " + (int)start + ", " + (int)end +
1751                   " for " + set);
1752         }
1753         last = end;
1754     }
1755 }
1756
1757 /**
1758  * Convert a bitmask to a UnicodeSet.
1759  */
1760 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1761     result.clear();
1762     for (UChar32 i = 0; i < 32; ++i) {
1763         if ((a & (1<<i)) != 0) {
1764             result.add(i);
1765         }
1766     }
1767     return result;
1768 }
1769
1770 /**
1771  * Convert a UnicodeSet to a bitmask.  Only the characters
1772  * U+0000 to U+0020 are represented in the bitmask.
1773  */
1774 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1775     int32_t result = 0;
1776     for (int32_t i = 0; i < 32; ++i) {
1777         if (x.contains((UChar32)i)) {
1778             result |= (1<<i);
1779         }
1780     }
1781     return result;
1782 }
1783
1784 /**
1785  * Return the representation of an inversion list based UnicodeSet
1786  * as a pairs list.  Ranges are listed in ascending Unicode order.
1787  * For example, the set [a-zA-M3] is represented as "33AMaz".
1788  */
1789 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1790     UnicodeString pairs;
1791     for (int32_t i=0; i<set.getRangeCount(); ++i) {
1792         UChar32 start = set.getRangeStart(i);
1793         UChar32 end = set.getRangeEnd(i);
1794         if (end > 0xFFFF) {
1795             end = 0xFFFF;
1796             i = set.getRangeCount(); // Should be unnecessary
1797         }
1798         pairs.append((UChar)start).append((UChar)end);
1799     }
1800     return pairs;
1801 }
1802
1803 /**
1804  * Basic consistency check for a few items.
1805  * That the iterator works, and that we can create a pattern and
1806  * get the same thing back
1807  */
1808 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1809     UErrorCode ec = U_ZERO_ERROR;
1810
1811     UnicodeSet t(s);
1812     checkEqual(s, t, "copy ct");
1813
1814     t = s;
1815     checkEqual(s, t, "operator=");
1816
1817     copyWithIterator(t, s, FALSE);
1818     checkEqual(s, t, "iterator roundtrip");
1819
1820     copyWithIterator(t, s, TRUE); // try range
1821     checkEqual(s, t, "iterator roundtrip");
1822
1823     UnicodeString pat; s.toPattern(pat, FALSE);
1824     t.applyPattern(pat, ec);
1825     if (U_FAILURE(ec)) {
1826         errln("FAIL: applyPattern");
1827         return;
1828     } else {
1829         checkEqual(s, t, "toPattern(false)");
1830     }
1831
1832     s.toPattern(pat, TRUE);
1833     t.applyPattern(pat, ec);
1834     if (U_FAILURE(ec)) {
1835         errln("FAIL: applyPattern");
1836         return;
1837     } else {
1838         checkEqual(s, t, "toPattern(true)");
1839     }
1840 }
1841
1842 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
1843     t.clear();
1844     UnicodeSetIterator it(s);
1845     if (withRange) {
1846         while (it.nextRange()) {
1847             if (it.isString()) {
1848                 t.add(it.getString());
1849             } else {
1850                 t.add(it.getCodepoint(), it.getCodepointEnd());
1851             }
1852         }
1853     } else {
1854         while (it.next()) {
1855             if (it.isString()) {
1856                 t.add(it.getString());
1857             } else {
1858                 t.add(it.getCodepoint());
1859             }
1860         }
1861     }
1862 }
1863
1864 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
1865     UnicodeString source; s.toPattern(source, TRUE);
1866     UnicodeString result; t.toPattern(result, TRUE);
1867     if (s != t) {
1868         errln((UnicodeString)"FAIL: " + message
1869               + "; source = " + source
1870               + "; result = " + result
1871               );
1872         return FALSE;
1873     } else {
1874         logln((UnicodeString)"Ok: " + message
1875               + "; source = " + source
1876               + "; result = " + result
1877               );
1878     }
1879     return TRUE;
1880 }
1881
1882 void
1883 UnicodeSetTest::expectContainment(const UnicodeString& pat,
1884                                   const UnicodeString& charsIn,
1885                                   const UnicodeString& charsOut) {
1886     UErrorCode ec = U_ZERO_ERROR;
1887     UnicodeSet set(pat, ec);
1888     if (U_FAILURE(ec)) {
1889         errln((UnicodeString)"FAIL: pattern \"" +
1890               pat + "\" => " + u_errorName(ec));
1891         return;
1892     }
1893     expectContainment(set, pat, charsIn, charsOut);
1894 }
1895
1896 void
1897 UnicodeSetTest::expectContainment(const UnicodeSet& set,
1898                                   const UnicodeString& charsIn,
1899                                   const UnicodeString& charsOut) {
1900     UnicodeString pat;
1901     set.toPattern(pat);
1902     expectContainment(set, pat, charsIn, charsOut);
1903 }
1904
1905 void
1906 UnicodeSetTest::expectContainment(const UnicodeSet& set,
1907                                   const UnicodeString& setName,
1908                                   const UnicodeString& charsIn,
1909                                   const UnicodeString& charsOut) {
1910     UnicodeString bad;
1911     UChar32 c;
1912     int32_t i;
1913
1914     for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
1915         c = charsIn.char32At(i);
1916         if (!set.contains(c)) {
1917             bad.append(c);
1918         }
1919     }
1920     if (bad.length() > 0) {
1921         errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
1922               ", expected containment of " + prettify(charsIn));
1923     } else {
1924         logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
1925     }
1926
1927     bad.truncate(0);
1928     for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
1929         c = charsOut.char32At(i);
1930         if (set.contains(c)) {
1931             bad.append(c);
1932         }
1933     }
1934     if (bad.length() > 0) {
1935         errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
1936               ", expected non-containment of " + prettify(charsOut));
1937     } else {
1938         logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
1939     }
1940 }
1941
1942 void
1943 UnicodeSetTest::expectPattern(UnicodeSet& set,
1944                               const UnicodeString& pattern,
1945                               const UnicodeString& expectedPairs){
1946     UErrorCode status = U_ZERO_ERROR;
1947     set.applyPattern(pattern, status);
1948     if (U_FAILURE(status)) {
1949         errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
1950               "\") failed");
1951         return;
1952     } else {
1953         if (getPairs(set) != expectedPairs ) {
1954             errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
1955                   "\") => pairs \"" +
1956                   escape(getPairs(set)) + "\", expected \"" +
1957                   escape(expectedPairs) + "\"");
1958         } else {
1959             logln(UnicodeString("Ok:   applyPattern(\"") + pattern +
1960                   "\") => pairs \"" +
1961                   escape(getPairs(set)) + "\"");
1962         }
1963     }
1964     // the result of calling set.toPattern(), which is the string representation of
1965     // this set(set), is passed to a  UnicodeSet constructor, and tested that it
1966     // will produce another set that is equal to this one.
1967     UnicodeString temppattern;
1968     set.toPattern(temppattern);
1969     UnicodeSet *tempset=new UnicodeSet(temppattern, status);
1970     if (U_FAILURE(status)) {
1971         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
1972         return;
1973     }
1974     if(*tempset != set || getPairs(*tempset) != getPairs(set)){
1975         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
1976             escape(getPairs(set)) + "\""));
1977     } else{
1978         logln(UnicodeString("Ok:   applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
1979     }
1980
1981     delete tempset;
1982
1983 }
1984
1985 void
1986 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
1987     if (getPairs(set) != expectedPairs) {
1988         errln(UnicodeString("FAIL: Expected pair list \"") +
1989               escape(expectedPairs) + "\", got \"" +
1990               escape(getPairs(set)) + "\"");
1991     }
1992 }
1993
1994 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
1995                                      const UnicodeString& expPat,
1996                                      const char** expStrings) {
1997     UnicodeString pat;
1998     set.toPattern(pat, TRUE);
1999     if (pat == expPat) {
2000         logln((UnicodeString)"Ok:   toPattern() => \"" + pat + "\"");
2001     } else {
2002         errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2003         return;
2004     }
2005     if (expStrings == NULL) {
2006         return;
2007     }
2008     UBool in = TRUE;
2009     for (int32_t i=0; expStrings[i] != NULL; ++i) {
2010         if (expStrings[i] == NOT) { // sic; pointer comparison
2011             in = FALSE;
2012             continue;
2013         }
2014         UnicodeString s = CharsToUnicodeString(expStrings[i]);
2015         UBool contained = set.contains(s);
2016         if (contained == in) {
2017             logln((UnicodeString)"Ok: " + expPat +
2018                   (contained ? " contains {" : " does not contain {") +
2019                   escape(expStrings[i]) + "}");
2020         } else {
2021             errln((UnicodeString)"FAIL: " + expPat +
2022                   (contained ? " contains {" : " does not contain {") +
2023                   escape(expStrings[i]) + "}");
2024         }
2025     }
2026 }
2027
2028 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
2029
2030 void
2031 UnicodeSetTest::doAssert(UBool condition, const char *message)
2032 {
2033     if (!condition) {
2034         errln(UnicodeString("ERROR : ") + message);
2035     }
2036 }
2037
2038 UnicodeString
2039 UnicodeSetTest::escape(const UnicodeString& s) {
2040     UnicodeString buf;
2041     for (int32_t i=0; i<s.length(); )
2042     {
2043         UChar32 c = s.char32At(i);
2044         if (0x0020 <= c && c <= 0x007F) {
2045             buf += c;
2046         } else {
2047             if (c <= 0xFFFF) {
2048                 buf += (UChar)0x5c; buf += (UChar)0x75;
2049             } else {
2050                 buf += (UChar)0x5c; buf += (UChar)0x55;
2051                 buf += toHexString((c & 0xF0000000) >> 28);
2052                 buf += toHexString((c & 0x0F000000) >> 24);
2053                 buf += toHexString((c & 0x00F00000) >> 20);
2054                 buf += toHexString((c & 0x000F0000) >> 16);
2055             }
2056             buf += toHexString((c & 0xF000) >> 12);
2057             buf += toHexString((c & 0x0F00) >> 8);
2058             buf += toHexString((c & 0x00F0) >> 4);
2059             buf += toHexString(c & 0x000F);
2060         }
2061         i += U16_LENGTH(c);
2062     }
2063     return buf;
2064 }