icuSources/test/intltest/usettest.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 1999-2004 Alan Liu ,International Business Machines Corporation and
   4 *   others. All Rights Reserved.
   5 **********************************************************************
   6 *   Date        Name        Description
   7 *   10/20/99    alan        Creation.
   8 *   03/22/2000  Madhu       Added additional tests
   9 **********************************************************************
  10 */
  11
  12 #include "unicode/utypes.h"
  13 #include "usettest.h"
  14 #include "unicode/uniset.h"
  15 #include "unicode/uchar.h"
  16 #include "unicode/usetiter.h"
  17 #include "unicode/ustring.h"
  18 #include "unicode/parsepos.h"
  19 #include "unicode/symtable.h"
  20 #include "hash.h"
  21
  22 UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
  23     UnicodeString pat;
  24     set.toPattern(pat);
  25     return left + UnicodeSetTest::escape(pat);
  26 }
  27
  28 #define CASE(id,test) case id:                          \
  29                           name = #test;                 \
  30                           if (exec) {                   \
  31                               logln(#test "---");       \
  32                               logln((UnicodeString)""); \
  33                               test();                   \
  34                           }                             \
  35                           break
  36
  37 void
  38 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
  39                                const char* &name, char* /*par*/) {
  40     // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
  41     switch (index) {
  42         CASE(0,TestPatterns);
  43         CASE(1,TestAddRemove);
  44         CASE(2,TestCategories);
  45         CASE(3,TestCloneEqualHash);
  46         CASE(4,TestMinimalRep);
  47         CASE(5,TestAPI);
  48         CASE(6,TestScriptSet);
  49         CASE(7,TestPropertySet);
  50         CASE(8,TestClone);
  51         CASE(9,TestExhaustive);
  52         CASE(10,TestToPattern);
  53         CASE(11,TestIndexOf);
  54         CASE(12,TestStrings);
  55         CASE(13,Testj2268);
  56         CASE(14,TestCloseOver);
  57         CASE(15,TestEscapePattern);
  58         CASE(16,TestInvalidCodePoint);
  59         CASE(17,TestSymbolTable);
  60         CASE(18,TestSurrogate);
  61         default: name = ""; break;
  62     }
  63 }
  64
// Marker string placed inside the expected-string arrays handed to
// expectToPattern() (see TestToPattern).  NOTE(review): presumably it
// separates strings expected to be in the set from strings expected to
// be absent -- confirm against expectToPattern().
static const char NOT[] = "%%%%";
  66
  67 /**
  68  * UVector was improperly copying contents
  69  * This code will crash this is still true
  70  */
  71 void UnicodeSetTest::Testj2268() {
  72   UnicodeSet t;
  73   t.add(UnicodeString("abc"));
  74   UnicodeSet test(t);
  75   UnicodeString ustrPat;
  76   test.toPattern(ustrPat, TRUE);
  77 }
  78
  79 /**
  80  * Test toPattern().
  81  */
  82 void UnicodeSetTest::TestToPattern() {
  83     UErrorCode ec = U_ZERO_ERROR;
  84
  85     // Test that toPattern() round trips with syntax characters and
  86     // whitespace.
  87     {
  88         static const char* OTHER_TOPATTERN_TESTS[] = {
  89             "[[:latin:]&[:greek:]]",
  90             "[[:latin:]-[:greek:]]",
  91             "[:nonspacing mark:]",
  92             NULL
  93         };
  94
  95         for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
  96             ec = U_ZERO_ERROR;
  97             UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
  98             if (U_FAILURE(ec)) {
  99                 errln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j]);
 100                 continue;
 101             }
 102             checkPat(OTHER_TOPATTERN_TESTS[j], s);
 103         }
 104
 105         for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
 106             if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
 107
 108                 // check various combinations to make sure they all work.
 109                 if (i != 0 && !toPatternAux(i, i)){
 110                     continue;
 111                 }
 112                 if (!toPatternAux(0, i)){
 113                     continue;
 114                 }
 115                 if (!toPatternAux(i, 0xFFFF)){
 116                     continue;
 117                 }
 118             }
 119         }
 120     }
 121
 122     // Test pattern behavior of multicharacter strings.
 123     {
 124         ec = U_ZERO_ERROR;
 125         UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
 126
 127         // This loop isn't a loop.  It's here to make the compiler happy.
 128         // If you're curious, try removing it and changing the 'break'
 129         // statements (except for the last) to goto's.
 130         for (;;) {
 131             if (U_FAILURE(ec)) break;
 132             const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
 133             expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
 134
 135             s->add("ac");
 136             const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
 137             expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
 138
 139             s->applyPattern("[a-z {\\{l} {r\\}}]", ec);
 140             if (U_FAILURE(ec)) break;
 141             const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
 142             expectToPattern(*s, "[a-z{r\\}}{\\{l}]", exp3);
 143
 144             s->add("[]");
 145             const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
 146             expectToPattern(*s, "[a-z{\\[\\]}{r\\}}{\\{l}]", exp4);
 147
 148             s->applyPattern("[a-z {\\u4E01\\u4E02}{\\n\\r}]", ec);
 149             if (U_FAILURE(ec)) break;
 150             const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
 151             expectToPattern(*s, "[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]", exp5);
 152
 153             // j2189
 154             s->clear();
 155             s->add(UnicodeString("abc", ""));
 156             s->add(UnicodeString("abc", ""));
 157             const char* exp6[] = {"abc", NOT, "ab", NULL};
 158             expectToPattern(*s, "[{abc}]", exp6);
 159
 160             break;
 161         }
 162
 163         if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
 164         delete s;
 165     }
 166
 167     // JB#3400: For 2 character ranges prefer [ab] to [a-b]
 168     UnicodeSet s;
 169     s.add((UChar)97, (UChar)98); // 'a', 'b'
 170     expectToPattern(s, "[ab]", NULL);
 171 }
 172
 173 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
 174
 175     // use Integer.toString because Utility.hex doesn't handle ints
 176     UnicodeString pat = "";
 177     // TODO do these in hex
 178     //String source = "0x" + Integer.toString(start,16).toUpperCase();
 179     //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
 180     UnicodeString source;
 181     source = source + (uint32_t)start;
 182     if (start != end)
 183         source = source + ".." + (uint32_t)end;
 184     UnicodeSet testSet;
 185     testSet.add(start, end);
 186     return checkPat(source, testSet);
 187 }
 188
 189 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
 190                                const UnicodeSet& testSet) {
 191     // What we want to make sure of is that a pattern generated
 192     // by toPattern(), with or without escaped unprintables, can
 193     // be passed back into the UnicodeSet constructor.
 194     UnicodeString pat0;
 195
 196     testSet.toPattern(pat0, TRUE);
 197
 198     if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
 199
 200     //String pat1 = unescapeLeniently(pat0);
 201     //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
 202
 203     UnicodeString pat2;
 204     testSet.toPattern(pat2, FALSE);
 205     if (!checkPat(source, testSet, pat2)) return FALSE;
 206
 207     //String pat3 = unescapeLeniently(pat2);
 208     // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
 209
 210     //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
 211     logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
 212     return TRUE;
 213 }
 214
 215 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
 216                                const UnicodeSet& testSet,
 217                                const UnicodeString& pat) {
 218     UErrorCode ec = U_ZERO_ERROR;
 219     UnicodeSet testSet2(pat, ec);
 220     if (testSet2 != testSet) {
 221         errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
 222         return FALSE;
 223     }
 224     return TRUE;
 225 }
 226
 227 void
 228 UnicodeSetTest::TestPatterns(void) {
 229     UnicodeSet set;
 230     expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""),  "km");
 231     expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""),  "aczz");
 232     expectPattern(set, UnicodeString("[a\\-z]", ""),  "--aazz");
 233     expectPattern(set, UnicodeString("[-az]", ""),  "--aazz");
 234     expectPattern(set, UnicodeString("[az-]", ""),  "--aazz");
 235     expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
 236
 237     // Throw in a test of complement
 238     set.complement();
 239     UnicodeString exp;
 240     exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
 241     expectPairs(set, exp);
 242 }
 243
 244 void
 245 UnicodeSetTest::TestCategories(void) {
 246     UErrorCode status = U_ZERO_ERROR;
 247     const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
 248     UnicodeSet set(pat, status);
 249     if (U_FAILURE(status)) {
 250         errln((UnicodeString)"Fail: Can't construct set with " + pat);
 251     } else {
 252         expectContainment(set, pat, "ABC", "abc");
 253     }
 254
 255     UChar32 i;
 256     int32_t failures = 0;
 257     // Make sure generation of L doesn't pollute cached Lu set
 258     // First generate L, then Lu
 259     set.applyPattern("[:L:]", status);
 260     if (U_FAILURE(status)) { errln("FAIL"); return; }
 261     for (i=0; i<0x200; ++i) {
 262         UBool l = u_isalpha((UChar)i);
 263         if (l != set.contains(i)) {
 264             errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
 265                   set.contains(i));
 266             if (++failures == 10) break;
 267         }
 268     }
 269
 270     set.applyPattern("[:Lu:]", status);
 271     if (U_FAILURE(status)) { errln("FAIL"); return; }
 272     for (i=0; i<0x200; ++i) {
 273         UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
 274         if (lu != set.contains(i)) {
 275             errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
 276                   set.contains(i));
 277             if (++failures == 20) break;
 278         }
 279     }
 280 }
 281 void
 282 UnicodeSetTest::TestCloneEqualHash(void) {
 283     UErrorCode status = U_ZERO_ERROR;
 284     // set1 and set2 used to be built with the obsolete constructor taking
 285     // UCharCategory values; replaced with pattern constructors
 286     // markus 20030502
 287     UnicodeSet *set1=new UnicodeSet("\\p{Lowercase Letter}", status); //  :Ll: Letter, lowercase
 288     UnicodeSet *set1a=new UnicodeSet("[:Ll:]", status); //  Letter, lowercase
 289     if (U_FAILURE(status)){
 290         errln((UnicodeString)"FAIL: Can't construst set with category->Ll");
 291         return;
 292     }
 293     UnicodeSet *set2=new UnicodeSet("\\p{Decimal Number}", status);   //Number, Decimal digit
 294     UnicodeSet *set2a=new UnicodeSet("[:Nd:]", status);   //Number, Decimal digit
 295     if (U_FAILURE(status)){
 296         errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
 297         return;
 298     }
 299
 300     if (*set1 != *set1a) {
 301         errln("FAIL: category constructor for Ll broken");
 302     }
 303     if (*set2 != *set2a) {
 304         errln("FAIL: category constructor for Nd broken");
 305     }
 306     delete set1a;
 307     delete set2a;
 308
 309     logln("Testing copy construction");
 310     UnicodeSet *set1copy=new UnicodeSet(*set1);
 311     if(*set1 != *set1copy || *set1 == *set2 ||
 312         getPairs(*set1) != getPairs(*set1copy) ||
 313         set1->hashCode() != set1copy->hashCode()){
 314         errln("FAIL : Error in copy construction");
 315         return;
 316     }
 317
 318     logln("Testing =operator");
 319     UnicodeSet set1equal=*set1;
 320     UnicodeSet set2equal=*set2;
 321     if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
 322         set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
 323         errln("FAIL: Error in =operator");
 324     }
 325
 326     logln("Testing clone()");
 327     UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
 328     UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
 329     if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
 330         *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
 331         *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
 332         errln("FAIL: Error in clone");
 333     }
 334
 335     logln("Testing hashcode");
 336     if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
 337         set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
 338         set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
 339         set1->hashCode() == set2->hashCode()  || set1copy->hashCode() == set2->hashCode() ||
 340         set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
 341         errln("FAIL: Error in hashCode()");
 342     }
 343
 344     delete set1;
 345     delete set1copy;
 346     delete set2;
 347     delete set1clone;
 348     delete set2clone;
 349
 350
 351 }
 352 void
 353 UnicodeSetTest::TestAddRemove(void) {
 354     UnicodeSet set; // Construct empty set
 355     doAssert(set.isEmpty() == TRUE, "set should be empty");
 356     doAssert(set.size() == 0, "size should be 0");
 357     set.complement();
 358     doAssert(set.size() == 0x110000, "size should be 0x110000");
 359     set.clear();
 360     set.add(0x0061, 0x007a);
 361     expectPairs(set, "az");
 362     doAssert(set.isEmpty() == FALSE, "set should not be empty");
 363     doAssert(set.size() != 0, "size should not be equal to 0");
 364     doAssert(set.size() == 26, "size should be equal to 26");
 365     set.remove(0x006d, 0x0070);
 366     expectPairs(set, "alqz");
 367     doAssert(set.size() == 22, "size should be equal to 22");
 368     set.remove(0x0065, 0x0067);
 369     expectPairs(set, "adhlqz");
 370     doAssert(set.size() == 19, "size should be equal to 19");
 371     set.remove(0x0064, 0x0069);
 372     expectPairs(set, "acjlqz");
 373     doAssert(set.size() == 16, "size should be equal to 16");
 374     set.remove(0x0063, 0x0072);
 375     expectPairs(set, "absz");
 376     doAssert(set.size() == 10, "size should be equal to 10");
 377     set.add(0x0066, 0x0071);
 378     expectPairs(set, "abfqsz");
 379     doAssert(set.size() == 22, "size should be equal to 22");
 380     set.remove(0x0061, 0x0067);
 381     expectPairs(set, "hqsz");
 382     set.remove(0x0061, 0x007a);
 383     expectPairs(set, "");
 384     doAssert(set.isEmpty() == TRUE, "set should be empty");
 385     doAssert(set.size() == 0, "size should be 0");
 386     set.add(0x0061);
 387     doAssert(set.isEmpty() == FALSE, "set should not be empty");
 388     doAssert(set.size() == 1, "size should not be equal to 1");
 389     set.add(0x0062);
 390     set.add(0x0063);
 391     expectPairs(set, "ac");
 392     doAssert(set.size() == 3, "size should not be equal to 3");
 393     set.add(0x0070);
 394     set.add(0x0071);
 395     expectPairs(set, "acpq");
 396     doAssert(set.size() == 5, "size should not be equal to 5");
 397     set.clear();
 398     expectPairs(set, "");
 399     doAssert(set.isEmpty() == TRUE, "set should be empty");
 400     doAssert(set.size() == 0, "size should be 0");
 401
 402     // Try removing an entire set from another set
 403     expectPattern(set, "[c-x]", "cx");
 404     UnicodeSet set2;
 405     expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
 406     set.removeAll(set2);
 407     expectPairs(set, "deluxx");
 408
 409     // Try adding an entire set to another set
 410     expectPattern(set, "[jackiemclean]", "aacceein");
 411     expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
 412     set.addAll(set2);
 413     expectPairs(set, "aacehort");
 414     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
 415
 416     // Try retaining an set of elements contained in another set (intersection)
 417     UnicodeSet set3;
 418     expectPattern(set3, "[a-c]", "ac");
 419     doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
 420     set3.remove(0x0062);
 421     expectPairs(set3, "aacc");
 422     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
 423     set.retainAll(set3);
 424     expectPairs(set, "aacc");
 425     doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
 426     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
 427     set.clear();
 428     doAssert(set.size() != set3.size(), "set.size() != set3.size()");
 429
 430     // Test commutativity
 431     expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
 432     expectPattern(set2, "[jackiemclean]", "aacceein");
 433     set.addAll(set2);
 434     expectPairs(set, "aacehort");
 435     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
 436
 437
 438
 439
 440 }
 441
 442 /**
 443  * Make sure minimal representation is maintained.
 444  */
 445 void UnicodeSetTest::TestMinimalRep() {
 446     UErrorCode status = U_ZERO_ERROR;
 447     // This is pretty thoroughly tested by checkCanonicalRep()
 448     // run against the exhaustive operation results.  Use the code
 449     // here for debugging specific spot problems.
 450
 451     // 1 overlap against 2
 452     UnicodeSet set("[h-km-q]", status);
 453     if (U_FAILURE(status)) { errln("FAIL"); return; }
 454     UnicodeSet set2("[i-o]", status);
 455     if (U_FAILURE(status)) { errln("FAIL"); return; }
 456     set.addAll(set2);
 457     expectPairs(set, "hq");
 458     // right
 459     set.applyPattern("[a-m]", status);
 460     if (U_FAILURE(status)) { errln("FAIL"); return; }
 461     set2.applyPattern("[e-o]", status);
 462     if (U_FAILURE(status)) { errln("FAIL"); return; }
 463     set.addAll(set2);
 464     expectPairs(set, "ao");
 465     // left
 466     set.applyPattern("[e-o]", status);
 467     if (U_FAILURE(status)) { errln("FAIL"); return; }
 468     set2.applyPattern("[a-m]", status);
 469     if (U_FAILURE(status)) { errln("FAIL"); return; }
 470     set.addAll(set2);
 471     expectPairs(set, "ao");
 472     // 1 overlap against 3
 473     set.applyPattern("[a-eg-mo-w]", status);
 474     if (U_FAILURE(status)) { errln("FAIL"); return; }
 475     set2.applyPattern("[d-q]", status);
 476     if (U_FAILURE(status)) { errln("FAIL"); return; }
 477     set.addAll(set2);
 478     expectPairs(set, "aw");
 479 }
 480
 481 void UnicodeSetTest::TestAPI() {
 482     UErrorCode status = U_ZERO_ERROR;
 483     // default ct
 484     UnicodeSet set;
 485     if (!set.isEmpty() || set.getRangeCount() != 0) {
 486         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
 487               set);
 488     }
 489
 490     // clear(), isEmpty()
 491     set.add(0x0061);
 492     if (set.isEmpty()) {
 493         errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
 494               set);
 495     }
 496     set.clear();
 497     if (!set.isEmpty()) {
 498         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
 499               set);
 500     }
 501
 502     // size()
 503     set.clear();
 504     if (set.size() != 0) {
 505         errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
 506               ": " + set);
 507     }
 508     set.add(0x0061);
 509     if (set.size() != 1) {
 510         errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
 511               ": " + set);
 512     }
 513     set.add(0x0031, 0x0039);
 514     if (set.size() != 10) {
 515         errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
 516               ": " + set);
 517     }
 518
 519     // contains(first, last)
 520     set.clear();
 521     set.applyPattern("[A-Y 1-8 b-d l-y]", status);
 522     if (U_FAILURE(status)) { errln("FAIL"); return; }
 523     for (int32_t i = 0; i<set.getRangeCount(); ++i) {
 524         UChar32 a = set.getRangeStart(i);
 525         UChar32 b = set.getRangeEnd(i);
 526         if (!set.contains(a, b)) {
 527             errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
 528                   " but doesn't: " + set);
 529         }
 530         if (set.contains((UChar32)(a-1), b)) {
 531             errln((UnicodeString)"FAIL, shouldn't contain " +
 532                   (unsigned short)(a-1) + '-' + (unsigned short)b +
 533                   " but does: " + set);
 534         }
 535         if (set.contains(a, (UChar32)(b+1))) {
 536             errln((UnicodeString)"FAIL, shouldn't contain " +
 537                   (unsigned short)a + '-' + (unsigned short)(b+1) +
 538                   " but does: " + set);
 539         }
 540     }
 541
 542     // Ported InversionList test.
 543     UnicodeSet a((UChar32)3,(UChar32)10);
 544     UnicodeSet b((UChar32)7,(UChar32)15);
 545     UnicodeSet c;
 546
 547     logln((UnicodeString)"a [3-10]: " + a);
 548     logln((UnicodeString)"b [7-15]: " + b);
 549     c = a;
 550     c.addAll(b);
 551     UnicodeSet exp((UChar32)3,(UChar32)15);
 552     if (c == exp) {
 553         logln((UnicodeString)"c.set(a).add(b): " + c);
 554     } else {
 555         errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
 556     }
 557     c.complement();
 558     exp.set((UChar32)0, (UChar32)2);
 559     exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
 560     if (c == exp) {
 561         logln((UnicodeString)"c.complement(): " + c);
 562     } else {
 563         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
 564     }
 565     c.complement();
 566     exp.set((UChar32)3, (UChar32)15);
 567     if (c == exp) {
 568         logln((UnicodeString)"c.complement(): " + c);
 569     } else {
 570         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
 571     }
 572     c = a;
 573     c.complementAll(b);
 574     exp.set((UChar32)3,(UChar32)6);
 575     exp.add((UChar32)11,(UChar32) 15);
 576     if (c == exp) {
 577         logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
 578     } else {
 579         errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
 580     }
 581
 582     exp = c;
 583     bitsToSet(setToBits(c), c);
 584     if (c == exp) {
 585         logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
 586     } else {
 587         errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
 588     }
 589
 590     // Additional tests for coverage JB#2118
 591     //UnicodeSet::complement(class UnicodeString const &)
 592     //UnicodeSet::complementAll(class UnicodeString const &)
 593     //UnicodeSet::containsNone(class UnicodeSet const &)
 594     //UnicodeSet::containsNone(long,long)
 595     //UnicodeSet::containsSome(class UnicodeSet const &)
 596     //UnicodeSet::containsSome(long,long)
 597     //UnicodeSet::removeAll(class UnicodeString const &)
 598     //UnicodeSet::retain(long)
 599     //UnicodeSet::retainAll(class UnicodeString const &)
 600     //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
 601     //UnicodeSetIterator::getString(void)
 602     set.clear();
 603     set.complement("ab");
 604     exp.applyPattern("[{ab}]", status);
 605     if (U_FAILURE(status)) { errln("FAIL"); return; }
 606     if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
 607
 608     UnicodeSetIterator iset(set);
 609     if (!iset.next() || !iset.isString()) {
 610         errln("FAIL: UnicodeSetIterator::next/isString");
 611     } else if (iset.getString() != "ab") {
 612         errln("FAIL: UnicodeSetIterator::getString");
 613     }
 614
 615     set.add((UChar32)0x61, (UChar32)0x7A);
 616     set.complementAll("alan");
 617     exp.applyPattern("[{ab}b-kmo-z]", status);
 618     if (U_FAILURE(status)) { errln("FAIL"); return; }
 619     if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
 620
 621     exp.applyPattern("[a-z]", status);
 622     if (U_FAILURE(status)) { errln("FAIL"); return; }
 623     if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
 624     if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
 625     exp.applyPattern("[aln]", status);
 626     if (U_FAILURE(status)) { errln("FAIL"); return; }
 627     if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
 628     if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
 629
 630     if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
 631         errln("FAIL: containsNone(UChar32, UChar32)");
 632     }
 633     if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
 634         errln("FAIL: containsSome(UChar32, UChar32)");
 635     }
 636     if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
 637         errln("FAIL: containsNone(UChar32, UChar32)");
 638     }
 639     if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
 640         errln("FAIL: containsSome(UChar32, UChar32)");
 641     }
 642
 643     set.removeAll("liu");
 644     exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
 645     if (U_FAILURE(status)) { errln("FAIL"); return; }
 646     if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
 647
 648     set.retainAll("star");
 649     exp.applyPattern("[rst]", status);
 650     if (U_FAILURE(status)) { errln("FAIL"); return; }
 651     if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
 652
 653     set.retain((UChar32)0x73);
 654     exp.applyPattern("[s]", status);
 655     if (U_FAILURE(status)) { errln("FAIL"); return; }
 656     if (set != exp) { errln("FAIL: retain('s')"); return; }
 657
 658     uint16_t buf[32];
 659     int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status);
 660     if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
 661     if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
 662         errln("FAIL: serialize");
 663         return;
 664     }
 665 }
 666
 667 void UnicodeSetTest::TestStrings() {
 668     UErrorCode ec = U_ZERO_ERROR;
 669
 670     UnicodeSet* testList[] = {
 671         UnicodeSet::createFromAll("abc"),
 672         new UnicodeSet("[a-c]", ec),
 673
 674         &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
 675         new UnicodeSet("[{ll}{ch}a-z]", ec),
 676
 677         UnicodeSet::createFrom("ab}c"),
 678         new UnicodeSet("[{ab\\}c}]", ec),
 679
 680         &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
 681         new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
 682
 683         NULL
 684     };
 685
 686     if (U_FAILURE(ec)) {
 687         errln("FAIL: couldn't construct test sets");
 688     }
 689
 690     for (int32_t i = 0; testList[i] != NULL; i+=2) {
 691         if (U_SUCCESS(ec)) {
 692             UnicodeString pat0, pat1;
 693             testList[i]->toPattern(pat0, TRUE);
 694             testList[i+1]->toPattern(pat1, TRUE);
 695             if (*testList[i] == *testList[i+1]) {
 696                 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
 697             } else {
 698                 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
 699             }
 700         }
 701         delete testList[i];
 702         delete testList[i+1];
 703     }
 704 }
 705
 706 /**
 707  * Test the [:Latin:] syntax.
 708  */
 709 void UnicodeSetTest::TestScriptSet() {
 710     expectContainment("[:Latin:]", "aA", CharsToUnicodeString("\\u0391\\u03B1"));
 711
 712     expectContainment("[:Greek:]", CharsToUnicodeString("\\u0391\\u03B1"), "aA");
 713
 714     /* Jitterbug 1423 */
 715     expectContainment("[[:Common:][:Inherited:]]", CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
 716
 717 }
 718
 719 /**
 720  * Test the [:Latin:] syntax.
 721  */
 722 void UnicodeSetTest::TestPropertySet() {
 723     static const char* DATA[] = {
 724         // Pattern, Chars IN, Chars NOT in
 725
 726         "[:Latin:]",
 727         "aA",
 728         "\\u0391\\u03B1",
 729
 730         "[\\p{Greek}]",
 731         "\\u0391\\u03B1",
 732         "aA",
 733
 734         "\\P{ GENERAL Category = upper case letter }",
 735         "abc",
 736         "ABC",
 737
 738         // Combining class: @since ICU 2.2
 739         // Check both symbolic and numeric
 740         "\\p{ccc=Nukta}",
 741         "\\u0ABC",
 742         "abc",
 743
 744         "\\p{Canonical Combining Class = 11}",
 745         "\\u05B1",
 746         "\\u05B2",
 747
 748         "[:c c c = iota subscript :]",
 749         "\\u0345",
 750         "xyz",
 751
 752         // Bidi class: @since ICU 2.2
 753         "\\p{bidiclass=lefttoright}",
 754         "abc",
 755         "\\u0671\\u0672",
 756
 757         // Binary properties: @since ICU 2.2
 758         "\\p{ideographic}",
 759         "\\u4E0A",
 760         "x",
 761
 762         "[:math=false:]",
 763         "q)*(",
 764         // weiv: )(and * were removed from math in Unicode 4.0.1
 765         //"(*+)",
 766         "+<>^",
 767
 768         // JB#1767 \N{}, \p{ASCII}
 769         "[:Ascii:]",
 770         "abc\\u0000\\u007F",
 771         "\\u0080\\u4E00",
 772
 773         "[\\N{ latin small letter  a  }[:name= latin small letter z:]]",
 774         "az",
 775         "qrs",
 776
 777         // JB#2015
 778         "[:any:]",
 779         "a\\U0010FFFF",
 780         "",
 781
 782         "[:nv=0.5:]",
 783         "\\u00BD\\u0F2A",
 784         "\\u00BC",
 785
 786         // JB#2653: Age
 787         "[:Age=1.1:]",
 788         "\\u03D6", // 1.1
 789         "\\u03D8\\u03D9", // 3.2
 790
 791         "[:Age=3.1:]",
 792         "\\u1800\\u3400\\U0002f800",
 793         "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
 794
 795         // JB#2350: Case_Sensitive
 796         "[:Case Sensitive:]",
 797         "A\\u1FFC\\U00010410",
 798         ";\\u00B4\\U00010500",
 799
 800         // JB#2832: C99-compatibility props
 801         "[:blank:]",
 802         " \\u0009",
 803         "1-9A-Z",
 804
 805         "[:graph:]",
 806         "19AZ",
 807         " \\u0003\\u0007\\u0009\\u000A\\u000D",
 808
 809         "[:punct:]",
 810         "!@#%&*()[]{}-_\\/;:,.?'\"",
 811         "09azAZ",
 812
 813         "[:xdigit:]",
 814         "09afAF",
 815         "gG!",
 816
 817         // Regex compatibility test
 818         "[-b]", // leading '-' is literal
 819         "-b",
 820         "ac",
 821
 822         "[^-b]", // leading '-' is literal
 823         "ac",
 824         "-b",
 825
 826         "[b-]", // trailing '-' is literal
 827         "-b",
 828         "ac",
 829
 830         "[^b-]", // trailing '-' is literal
 831         "ac",
 832         "-b",
 833
 834         "[a-b-]", // trailing '-' is literal
 835         "ab-",
 836         "c=",
 837
 838         "[[a-q]&[p-z]-]", // trailing '-' is literal
 839         "pq-",
 840         "or=",
 841
 842         "[\\s|\\)|:|$|\\>]", // from regex tests
 843         "s|):$>",
 844         "abc",
 845
 846         "[\\uDC00cd]", // JB#2906: isolated trail at start
 847         "cd\\uDC00",
 848         "ab\\uD800\\U00010000",
 849
 850         "[ab\\uD800]", // JB#2906: isolated trail at start
 851         "ab\\uD800",
 852         "cd\\uDC00\\U00010000",
 853
 854         "[ab\\uD800cd]", // JB#2906: isolated lead in middle
 855         "abcd\\uD800",
 856         "ef\\uDC00\\U00010000",
 857
 858         "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
 859         "abcd\\uDC00",
 860         "ef\\uD800\\U00010000",
 861
 862         "[:^lccc=0:]", // Lead canonical class
 863         "\\u0300\\u0301",
 864         "abcd\\u00c0\\u00c5",
 865
 866         "[:^tccc=0:]", // Trail canonical class
 867         "\\u0300\\u0301\\u00c0\\u00c5",
 868         "abcd",
 869
 870         "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
 871         "\\u0300\\u0301\\u00c0\\u00c5",
 872         "abcd",
 873
 874         "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
 875         "",
 876         "abcd\\u0300\\u0301\\u00c0\\u00c5",
 877
 878         "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
 879         "\\u0F73\\u0F75\\u0F81",
 880         "abcd\\u0300\\u0301\\u00c0\\u00c5",
 881
 882     };
 883
 884     static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);
 885
 886     for (int32_t i=0; i<DATA_LEN; i+=3) {
 887         expectContainment(DATA[i], CharsToUnicodeString(DATA[i+1]),
 888                           CharsToUnicodeString(DATA[i+2]));
 889     }
 890 }
 891
 892 /**
 893  * Test cloning of UnicodeSet.  For C++, we test the copy constructor.
 894  */
 895 void UnicodeSetTest::TestClone() {
 896     UErrorCode ec = U_ZERO_ERROR;
 897     UnicodeSet s("[abcxyz]", ec);
 898     UnicodeSet t(s);
 899     expectContainment(t, "abc", "def");
 900 }
 901
 902 /**
 903  * Test the indexOf() and charAt() methods.
 904  */
 905 void UnicodeSetTest::TestIndexOf() {
 906     UErrorCode ec = U_ZERO_ERROR;
 907     UnicodeSet set("[a-cx-y3578]", ec);
 908     if (U_FAILURE(ec)) {
 909         errln("FAIL: UnicodeSet constructor");
 910         return;
 911     }
 912     for (int32_t i=0; i<set.size(); ++i) {
 913         UChar32 c = set.charAt(i);
 914         if (set.indexOf(c) != i) {
 915             errln("FAIL: charAt(%d) = %X => indexOf() => %d",
 916                 i, c, set.indexOf(c));
 917         }
 918     }
 919     UChar32 c = set.charAt(set.size());
 920     if (c != -1) {
 921         errln("FAIL: charAt(<out of range>) = %X", c);
 922     }
 923     int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
 924     if (j != -1) {
 925         errln((UnicodeString)"FAIL: indexOf('q') = " + j);
 926     }
 927 }
 928
 929 /**
 930  * Test closure API.
 931  */
 932 void UnicodeSetTest::TestCloseOver() {
 933     UErrorCode ec = U_ZERO_ERROR;
 934
 935     char CASE[] = {(char)USET_CASE};
 936     char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
 937     const char* DATA[] = {
 938         // selector, input, output
 939         CASE,
 940         "[aq\\u00DF{Bc}{bC}{Fi}]",
 941         "[aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]",
 942
 943         CASE,
 944         "[\\u01F1]", // 'DZ'
 945         "[\\u01F1\\u01F2\\u01F3]",
 946
 947         CASE,
 948         "[\\u1FB4]",
 949         "[\\u1FB4{\\u03AC\\u03B9}]",
 950
 951         CASE,
 952         "[{F\\uFB01}]",
 953         "[\\uFB03{ffi}]",
 954
 955         CASE, // make sure binary search finds limits
 956         "[a\\uFF3A]",
 957         "[aA\\uFF3A\\uFF5A]",
 958
 959         CASE,
 960         "[a-z]","[A-Za-z\\u017F\\u212A]",
 961         CASE,
 962         "[abc]","[A-Ca-c]",
 963         CASE,
 964         "[ABC]","[A-Ca-c]",
 965
 966         CASE_MAPPINGS,
 967         "[aq\\u00DF{Bc}{bC}{Fi}]",
 968         "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
 969
 970         CASE_MAPPINGS,
 971         "[\\u01F1]", // 'DZ'
 972         "[\\u01F1\\u01F2\\u01F3]",
 973
 974         CASE_MAPPINGS,
 975         "[a-z]",
 976         "[A-Za-z]",
 977
 978         NULL
 979     };
 980
 981     UnicodeSet s;
 982     UnicodeSet t;
 983     for (int32_t i=0; DATA[i]!=NULL; i+=3) {
 984         int32_t selector = DATA[i][0];
 985         UnicodeString pat(DATA[i+1]);
 986         UnicodeString exp(DATA[i+2]);
 987         s.applyPattern(pat, ec);
 988         s.closeOver(selector);
 989         t.applyPattern(exp, ec);
 990         if (U_FAILURE(ec)) {
 991             errln("FAIL: applyPattern failed");
 992             continue;
 993         }
 994         if (s == t) {
 995             logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
 996         } else {
 997             UnicodeString buf;
 998             errln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
 999                   s.toPattern(buf, TRUE) + ", expected " + exp);
1000         }
1001     }
1002
1003     // Test the pattern API
1004     s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1005     if (U_FAILURE(ec)) {
1006         errln("FAIL: applyPattern failed");
1007     } else {
1008         expectContainment(s, "abcABC", "defDEF");
1009     }
1010     UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1011     if (U_FAILURE(ec)) {
1012         errln("FAIL: constructor failed");
1013     } else {
1014         expectContainment(v, "defDEF", "abcABC");
1015     }
1016     UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1017     if (U_FAILURE(ec)) {
1018         errln("FAIL: construct w/case mappings failed");
1019     } else {
1020         expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1021     }
1022 }
1023
1024 void UnicodeSetTest::TestEscapePattern() {
1025     const char pattern[] =
1026         "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1027     const char exp[] =
1028         "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1029     // We test this with two passes; in the second pass we
1030     // pre-unescape the pattern.  Since U+200E is rule whitespace,
1031     // this fails -- which is what we expect.
1032     for (int32_t pass=1; pass<=2; ++pass) {
1033         UErrorCode ec = U_ZERO_ERROR;
1034         UnicodeString pat(pattern);
1035         if (pass==2) {
1036             pat = pat.unescape();
1037         }
1038         // Pattern is only good for pass 1
1039         UBool isPatternValid = (pass==1);
1040
1041         UnicodeSet set(pat, ec);
1042         if (U_SUCCESS(ec) != isPatternValid){
1043             errln((UnicodeString)"FAIL: applyPattern(" +
1044                   escape(pat) + ") => " +
1045                   u_errorName(ec));
1046             continue;
1047         }
1048         if (U_FAILURE(ec)) {
1049             continue;
1050         }
1051         if (set.contains((UChar)0x0644)){
1052             errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1053         }
1054
1055         UnicodeString newpat;
1056         set.toPattern(newpat, TRUE);
1057         if (newpat == exp) {
1058             logln(escape(pat) + " => " + newpat);
1059         } else {
1060             errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1061         }
1062
1063         for (int32_t i=0; i<set.getRangeCount(); ++i) {
1064             UnicodeString str("Range ");
1065             str.append((UChar)(0x30 + i))
1066                 .append(": ")
1067                 .append((UChar32)set.getRangeStart(i))
1068                 .append(" - ")
1069                 .append((UChar32)set.getRangeEnd(i));
1070             str = str + " (" + set.getRangeStart(i) + " - " +
1071                 set.getRangeEnd(i) + ")";
1072             if (set.getRangeStart(i) < 0) {
1073                 errln((UnicodeString)"FAIL: " + escape(str));
1074             } else {
1075                 logln(escape(str));
1076             }
1077         }
1078     }
1079 }
1080
1081 void UnicodeSetTest::expectRange(const UnicodeString& label,
1082                                  const UnicodeSet& set,
1083                                  UChar32 start, UChar32 end) {
1084     UnicodeSet exp(start, end);
1085     UnicodeString pat;
1086     if (set == exp) {
1087         logln(label + " => " + set.toPattern(pat, TRUE));
1088     } else {
1089         UnicodeString xpat;
1090         errln((UnicodeString)"FAIL: " + label + " => " +
1091               set.toPattern(pat, TRUE) +
1092               ", expected " + exp.toPattern(xpat, TRUE));
1093     }
1094 }
1095
1096 void UnicodeSetTest::TestInvalidCodePoint() {
1097
1098     const UChar32 DATA[] = {
1099         // Test range             Expected range
1100         0, 0x10FFFF,              0, 0x10FFFF,
1101         (UChar32)-1, 8,           0, 8,
1102         8, 0x110000,              8, 0x10FFFF
1103     };
1104     const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]);
1105
1106     UnicodeString pat;
1107     int32_t i;
1108
1109     for (i=0; i<DATA_LENGTH; i+=4) {
1110         UChar32 start  = DATA[i];
1111         UChar32 end    = DATA[i+1];
1112         UChar32 xstart = DATA[i+2];
1113         UChar32 xend   = DATA[i+3];
1114
1115         // Try various API using the test code points
1116
1117         UnicodeSet set(start, end);
1118         expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1119                     set, xstart, xend);
1120
1121         set.clear();
1122         set.set(start, end);
1123         expectRange((UnicodeString)"set(" + start + "," + end + ")",
1124                     set, xstart, xend);
1125
1126         UBool b = set.contains(start);
1127         b = set.contains(start, end);
1128         b = set.containsNone(start, end);
1129         b = set.containsSome(start, end);
1130
1131         /*int32_t index = set.indexOf(start);*/
1132
1133         set.clear();
1134         set.add(start);
1135         set.add(start, end);
1136         expectRange((UnicodeString)"add(" + start + "," + end + ")",
1137                     set, xstart, xend);
1138
1139         set.set(0, 0x10FFFF);
1140         set.retain(start, end);
1141         expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1142                     set, xstart, xend);
1143         set.retain(start);
1144
1145         set.set(0, 0x10FFFF);
1146         set.remove(start);
1147         set.remove(start, end);
1148         set.complement();
1149         expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1150                     set, xstart, xend);
1151
1152         set.set(0, 0x10FFFF);
1153         set.complement(start, end);
1154         set.complement();
1155         expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1156                     set, xstart, xend);
1157         set.complement(start);
1158     }
1159
1160     const UChar32 DATA2[] = {
1161         0,
1162         0x10FFFF,
1163         (UChar32)-1,
1164         0x110000
1165     };
1166     const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]);
1167
1168     for (i=0; i<DATA2_LENGTH; ++i) {
1169         UChar32 c = DATA2[i], end = 0x10FFFF;
1170         UBool valid = (c >= 0 && c <= 0x10FFFF);
1171
1172         UnicodeSet set(0, 0x10FFFF);
1173
1174         // For single-codepoint contains, invalid codepoints are NOT contained
1175         UBool b = set.contains(c);
1176         if (b == valid) {
1177             logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1178                   ") = " + b);
1179         } else {
1180             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1181                   ") = " + b);
1182         }
1183
1184         // For codepoint range contains, containsNone, and containsSome,
1185         // invalid or empty (start > end) ranges have UNDEFINED behavior.
1186         b = set.contains(c, end);
1187         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1188               "," + end + ") = " + b);
1189
1190         b = set.containsNone(c, end);
1191         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1192               "," + end + ") = " + b);
1193
1194         b = set.containsSome(c, end);
1195         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1196               "," + end + ") = " + b);
1197
1198         int32_t index = set.indexOf(c);
1199         if ((index >= 0) == valid) {
1200             logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1201                   ") = " + index);
1202         } else {
1203             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1204                   ") = " + index);
1205         }
1206     }
1207 }
1208
1209 // Used by TestSymbolTable
1210 class TokenSymbolTable : public SymbolTable {
1211 public:
1212     Hashtable contents;
1213
1214     TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1215         contents.setValueDeleter(uhash_deleteUnicodeString);
1216     }
1217
1218     ~TokenSymbolTable() {}
1219
1220     /**
1221      * (Non-SymbolTable API) Add the given variable and value to
1222      * the table.  Variable should NOT contain leading '$'.
1223      */
1224     void add(const UnicodeString& var, const UnicodeString& value,
1225              UErrorCode& ec) {
1226         if (U_SUCCESS(ec)) {
1227             contents.put(var, new UnicodeString(value), ec);
1228         }
1229     }
1230
1231     /**
1232      * SymbolTable API
1233      */
1234     virtual const UnicodeString* lookup(const UnicodeString& s) const {
1235         return (const UnicodeString*) contents.get(s);
1236     }
1237
1238     /**
1239      * SymbolTable API
1240      */
1241     virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1242         return NULL;
1243     }
1244
1245     /**
1246      * SymbolTable API
1247      */
1248     virtual UnicodeString parseReference(const UnicodeString& text,
1249                                          ParsePosition& pos, int32_t limit) const {
1250         int32_t start = pos.getIndex();
1251         int32_t i = start;
1252         UnicodeString result;
1253         while (i < limit) {
1254             UChar c = text.charAt(i);
1255             if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1256                 break;
1257             }
1258             ++i;
1259         }
1260         if (i == start) { // No valid name chars
1261             return result; // Indicate failure with empty string
1262         }
1263         pos.setIndex(i);
1264         text.extractBetween(start, i, result);
1265         return result;
1266     }
1267 };
1268
1269 void UnicodeSetTest::TestSymbolTable() {
1270     // Multiple test cases can be set up here.  Each test case
1271     // is terminated by null:
1272     // var, value, var, value,..., input pat., exp. output pat., null
1273     const char* DATA[] = {
1274         "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1275         "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1276         "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1277         NULL
1278     };
1279
1280     for (int32_t i=0; DATA[i]!=NULL; ++i) {
1281         UErrorCode ec = U_ZERO_ERROR;
1282         TokenSymbolTable sym(ec);
1283         if (U_FAILURE(ec)) {
1284             errln("FAIL: couldn't construct TokenSymbolTable");
1285             continue;
1286         }
1287
1288         // Set up variables
1289         while (DATA[i+2] != NULL) {
1290             sym.add(DATA[i], DATA[i+1], ec);
1291             if (U_FAILURE(ec)) {
1292                 errln("FAIL: couldn't add to TokenSymbolTable");
1293                 continue;
1294             }
1295             i += 2;
1296         }
1297
1298         // Input pattern and expected output pattern
1299         UnicodeString inpat = DATA[i], exppat = DATA[i+1];
1300         i += 2;
1301
1302         ParsePosition pos(0);
1303         UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1304         if (U_FAILURE(ec)) {
1305             errln("FAIL: couldn't construct UnicodeSet");
1306             continue;
1307         }
1308
1309         // results
1310         if (pos.getIndex() != inpat.length()) {
1311             errln((UnicodeString)"Failed to read to end of string \""
1312                   + inpat + "\": read to "
1313                   + pos.getIndex() + ", length is "
1314                   + inpat.length());
1315         }
1316
1317         UnicodeSet us2(exppat, ec);
1318         if (U_FAILURE(ec)) {
1319             errln("FAIL: couldn't construct expected UnicodeSet");
1320             continue;
1321         }
1322
1323         UnicodeString a, b;
1324         if (us != us2) {
1325             errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1326                   ", expected " + us2.toPattern(b, TRUE));
1327         } else {
1328             logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1329         }
1330     }
1331 }
1332
1333 void UnicodeSetTest::TestSurrogate() {
1334     const char* DATA[] = {
1335         // These should all behave identically
1336         "[abc\\uD800\\uDC00]",
1337         // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1338         "[abc\\U00010000]",
1339         0
1340     };
1341     for (int i=0; DATA[i] != 0; ++i) {
1342         UErrorCode ec = U_ZERO_ERROR;
1343         logln((UnicodeString)"Test pattern " + i + " :" + DATA[i]);
1344         UnicodeSet set(DATA[i], ec);
1345         if (U_FAILURE(ec)) {
1346             errln("FAIL: UnicodeSet constructor");
1347             continue;
1348         }
1349         expectContainment(set,
1350                           CharsToUnicodeString("abc\\U00010000"),
1351                           CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1352         if (set.size() != 4) {
1353             errln((UnicodeString)"FAIL: " + DATA[i] + ".size() == " +
1354                   set.size() + ", expected 4");
1355         }
1356     }
1357 }
1358
1359 void UnicodeSetTest::TestExhaustive() {
1360     // exhaustive tests. Simulate UnicodeSets with integers.
1361     // That gives us very solid tests (except for large memory tests).
1362
1363     int32_t limit = 128;
1364
1365     UnicodeSet x, y, z, aa;
1366
1367     for (int32_t i = 0; i < limit; ++i) {
1368         bitsToSet(i, x);
1369         logln((UnicodeString)"Testing " + i + ", " + x);
1370         _testComplement(i, x, y);
1371
1372         // AS LONG AS WE ARE HERE, check roundtrip
1373         checkRoundTrip(bitsToSet(i, aa));
1374
1375         for (int32_t j = 0; j < limit; ++j) {
1376             _testAdd(i,j,  x,y,z);
1377             _testXor(i,j,  x,y,z);
1378             _testRetain(i,j,  x,y,z);
1379             _testRemove(i,j,  x,y,z);
1380         }
1381     }
1382 }
1383
1384 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1385     bitsToSet(a, x);
1386     z = x;
1387     z.complement();
1388     int32_t c = setToBits(z);
1389     if (c != (~a)) {
1390         errln((UnicodeString)"FAILED: add: ~" + x +  " != " + z);
1391         errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1392     }
1393     checkCanonicalRep(z, (UnicodeString)"complement " + a);
1394 }
1395
1396 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1397     bitsToSet(a, x);
1398     bitsToSet(b, y);
1399     z = x;
1400     z.addAll(y);
1401     int32_t c = setToBits(z);
1402     if (c != (a | b)) {
1403         errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1404         errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1405     }
1406     checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1407 }
1408
1409 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1410     bitsToSet(a, x);
1411     bitsToSet(b, y);
1412     z = x;
1413     z.retainAll(y);
1414     int32_t c = setToBits(z);
1415     if (c != (a & b)) {
1416         errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1417         errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1418     }
1419     checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1420 }
1421
1422 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1423     bitsToSet(a, x);
1424     bitsToSet(b, y);
1425     z = x;
1426     z.removeAll(y);
1427     int32_t c = setToBits(z);
1428     if (c != (a &~ b)) {
1429         errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1430         errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1431     }
1432     checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1433 }
1434
1435 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1436     bitsToSet(a, x);
1437     bitsToSet(b, y);
1438     z = x;
1439     z.complementAll(y);
1440     int32_t c = setToBits(z);
1441     if (c != (a ^ b)) {
1442         errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1443         errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1444     }
1445     checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1446 }
1447
1448 /**
1449  * Check that ranges are monotonically increasing and non-
1450  * overlapping.
1451  */
1452 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1453     int32_t n = set.getRangeCount();
1454     if (n < 0) {
1455         errln((UnicodeString)"FAIL result of " + msg +
1456               ": range count should be >= 0 but is " +
1457               n /*+ " for " + set.toPattern())*/);
1458         return;
1459     }
1460     UChar32 last = 0;
1461     for (int32_t i=0; i<n; ++i) {
1462         UChar32 start = set.getRangeStart(i);
1463         UChar32 end = set.getRangeEnd(i);
1464         if (start > end) {
1465             errln((UnicodeString)"FAIL result of " + msg +
1466                   ": range " + (i+1) +
1467                   " start > end: " + (int)start + ", " + (int)end +
1468                   " for " + set);
1469         }
1470         if (i > 0 && start <= last) {
1471             errln((UnicodeString)"FAIL result of " + msg +
1472                   ": range " + (i+1) +
1473                   " overlaps previous range: " + (int)start + ", " + (int)end +
1474                   " for " + set);
1475         }
1476         last = end;
1477     }
1478 }
1479
1480 /**
1481  * Convert a bitmask to a UnicodeSet.
1482  */
1483 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1484     result.clear();
1485     for (UChar32 i = 0; i < 32; ++i) {
1486         if ((a & (1<<i)) != 0) {
1487             result.add(i);
1488         }
1489     }
1490     return result;
1491 }
1492
1493 /**
1494  * Convert a UnicodeSet to a bitmask.  Only the characters
1495  * U+0000 to U+0020 are represented in the bitmask.
1496  */
1497 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1498     int32_t result = 0;
1499     for (int32_t i = 0; i < 32; ++i) {
1500         if (x.contains((UChar32)i)) {
1501             result |= (1<<i);
1502         }
1503     }
1504     return result;
1505 }
1506
1507 /**
1508  * Return the representation of an inversion list based UnicodeSet
1509  * as a pairs list.  Ranges are listed in ascending Unicode order.
1510  * For example, the set [a-zA-M3] is represented as "33AMaz".
1511  */
1512 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1513     UnicodeString pairs;
1514     for (int32_t i=0; i<set.getRangeCount(); ++i) {
1515         UChar32 start = set.getRangeStart(i);
1516         UChar32 end = set.getRangeEnd(i);
1517         if (end > 0xFFFF) {
1518             end = 0xFFFF;
1519             i = set.getRangeCount(); // Should be unnecessary
1520         }
1521         pairs.append((UChar)start).append((UChar)end);
1522     }
1523     return pairs;
1524 }
1525
1526 /**
1527  * Basic consistency check for a few items.
1528  * That the iterator works, and that we can create a pattern and
1529  * get the same thing back
1530  */
1531 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1532     UErrorCode ec = U_ZERO_ERROR;
1533
1534     UnicodeSet t(s);
1535     checkEqual(s, t, "copy ct");
1536
1537     t = s;
1538     checkEqual(s, t, "operator=");
1539
1540     copyWithIterator(t, s, FALSE);
1541     checkEqual(s, t, "iterator roundtrip");
1542
1543     copyWithIterator(t, s, TRUE); // try range
1544     checkEqual(s, t, "iterator roundtrip");
1545
1546     UnicodeString pat; s.toPattern(pat, FALSE);
1547     t.applyPattern(pat, ec);
1548     if (U_FAILURE(ec)) {
1549         errln("FAIL: applyPattern");
1550         return;
1551     } else {
1552         checkEqual(s, t, "toPattern(false)");
1553     }
1554
1555     s.toPattern(pat, TRUE);
1556     t.applyPattern(pat, ec);
1557     if (U_FAILURE(ec)) {
1558         errln("FAIL: applyPattern");
1559         return;
1560     } else {
1561         checkEqual(s, t, "toPattern(true)");
1562     }
1563 }
1564
1565 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
1566     t.clear();
1567     UnicodeSetIterator it(s);
1568     if (withRange) {
1569         while (it.nextRange()) {
1570             if (it.isString()) {
1571                 t.add(it.getString());
1572             } else {
1573                 t.add(it.getCodepoint(), it.getCodepointEnd());
1574             }
1575         }
1576     } else {
1577         while (it.next()) {
1578             if (it.isString()) {
1579                 t.add(it.getString());
1580             } else {
1581                 t.add(it.getCodepoint());
1582             }
1583         }
1584     }
1585 }
1586
1587 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
1588     UnicodeString source; s.toPattern(source, TRUE);
1589     UnicodeString result; t.toPattern(result, TRUE);
1590     if (s != t) {
1591         errln((UnicodeString)"FAIL: " + message
1592               + "; source = " + source
1593               + "; result = " + result
1594               );
1595         return FALSE;
1596     } else {
1597         logln((UnicodeString)"Ok: " + message
1598               + "; source = " + source
1599               + "; result = " + result
1600               );
1601     }
1602     return TRUE;
1603 }
1604
1605 void
1606 UnicodeSetTest::expectContainment(const UnicodeString& pat,
1607                                   const UnicodeString& charsIn,
1608                                   const UnicodeString& charsOut) {
1609     UErrorCode ec = U_ZERO_ERROR;
1610     UnicodeSet set(pat, ec);
1611     if (U_FAILURE(ec)) {
1612         errln((UnicodeString)"FAIL: pattern \"" +
1613               pat + "\" => " + u_errorName(ec));
1614         return;
1615     }
1616     expectContainment(set, pat, charsIn, charsOut);
1617 }
1618
1619 void
1620 UnicodeSetTest::expectContainment(const UnicodeSet& set,
1621                                   const UnicodeString& charsIn,
1622                                   const UnicodeString& charsOut) {
1623     UnicodeString pat;
1624     set.toPattern(pat);
1625     expectContainment(set, pat, charsIn, charsOut);
1626 }
1627
1628 void
1629 UnicodeSetTest::expectContainment(const UnicodeSet& set,
1630                                   const UnicodeString& setName,
1631                                   const UnicodeString& charsIn,
1632                                   const UnicodeString& charsOut) {
1633     UnicodeString bad;
1634     UChar32 c;
1635     int32_t i;
1636
1637     for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
1638         c = charsIn.char32At(i);
1639         if (!set.contains(c)) {
1640             bad.append(c);
1641         }
1642     }
1643     if (bad.length() > 0) {
1644         errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
1645               ", expected containment of " + prettify(charsIn));
1646     } else {
1647         logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
1648     }
1649
1650     bad.truncate(0);
1651     for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
1652         c = charsOut.char32At(i);
1653         if (set.contains(c)) {
1654             bad.append(c);
1655         }
1656     }
1657     if (bad.length() > 0) {
1658         errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
1659               ", expected non-containment of " + prettify(charsOut));
1660     } else {
1661         logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
1662     }
1663 }
1664
1665 void
1666 UnicodeSetTest::expectPattern(UnicodeSet& set,
1667                               const UnicodeString& pattern,
1668                               const UnicodeString& expectedPairs){
1669     UErrorCode status = U_ZERO_ERROR;
1670     set.applyPattern(pattern, status);
1671     if (U_FAILURE(status)) {
1672         errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
1673               "\") failed");
1674         return;
1675     } else {
1676         if (getPairs(set) != expectedPairs ) {
1677             errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
1678                   "\") => pairs \"" +
1679                   escape(getPairs(set)) + "\", expected \"" +
1680                   escape(expectedPairs) + "\"");
1681         } else {
1682             logln(UnicodeString("Ok:   applyPattern(\"") + pattern +
1683                   "\") => pairs \"" +
1684                   escape(getPairs(set)) + "\"");
1685         }
1686     }
1687     // the result of calling set.toPattern(), which is the string representation of
1688     // this set(set), is passed to a  UnicodeSet constructor, and tested that it
1689     // will produce another set that is equal to this one.
1690     UnicodeString temppattern;
1691     set.toPattern(temppattern);
1692     UnicodeSet *tempset=new UnicodeSet(temppattern, status);
1693     if (U_FAILURE(status)) {
1694         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
1695         return;
1696     }
1697     if(*tempset != set || getPairs(*tempset) != getPairs(set)){
1698         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
1699             escape(getPairs(set)) + "\""));
1700     } else{
1701         logln(UnicodeString("Ok:   applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
1702     }
1703
1704     delete tempset;
1705
1706 }
1707
1708 void
1709 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
1710     if (getPairs(set) != expectedPairs) {
1711         errln(UnicodeString("FAIL: Expected pair list \"") +
1712               escape(expectedPairs) + "\", got \"" +
1713               escape(getPairs(set)) + "\"");
1714     }
1715 }
1716
1717 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
1718                                      const UnicodeString& expPat,
1719                                      const char** expStrings) {
1720     UnicodeString pat;
1721     set.toPattern(pat, TRUE);
1722     if (pat == expPat) {
1723         logln((UnicodeString)"Ok:   toPattern() => \"" + pat + "\"");
1724     } else {
1725         errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
1726         return;
1727     }
1728     if (expStrings == NULL) {
1729         return;
1730     }
1731     UBool in = TRUE;
1732     for (int32_t i=0; expStrings[i] != NULL; ++i) {
1733         if (expStrings[i] == NOT) { // sic; pointer comparison
1734             in = FALSE;
1735             continue;
1736         }
1737         UnicodeString s = CharsToUnicodeString(expStrings[i]);
1738         UBool contained = set.contains(s);
1739         if (contained == in) {
1740             logln((UnicodeString)"Ok: " + expPat +
1741                   (contained ? " contains {" : " does not contain {") +
1742                   escape(expStrings[i]) + "}");
1743         } else {
1744             errln((UnicodeString)"FAIL: " + expPat +
1745                   (contained ? " contains {" : " does not contain {") +
1746                   escape(expStrings[i]) + "}");
1747         }
1748     }
1749 }
1750
1751 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
1752
1753 void
1754 UnicodeSetTest::doAssert(UBool condition, const char *message)
1755 {
1756     if (!condition) {
1757         errln(UnicodeString("ERROR : ") + message);
1758     }
1759 }
1760
1761 UnicodeString
1762 UnicodeSetTest::escape(const UnicodeString& s) {
1763     UnicodeString buf;
1764     for (int32_t i=0; i<s.length(); )
1765     {
1766         UChar32 c = s.char32At(i);
1767         if (0x0020 <= c && c <= 0x007F) {
1768             buf += c;
1769         } else {
1770             if (c <= 0xFFFF) {
1771                 buf += (UChar)0x5c; buf += (UChar)0x75;
1772             } else {
1773                 buf += (UChar)0x5c; buf += (UChar)0x55;
1774                 buf += toHexString((c & 0xF0000000) >> 28);
1775                 buf += toHexString((c & 0x0F000000) >> 24);
1776                 buf += toHexString((c & 0x00F00000) >> 20);
1777                 buf += toHexString((c & 0x000F0000) >> 16);
1778             }
1779             buf += toHexString((c & 0xF000) >> 12);
1780             buf += toHexString((c & 0x0F00) >> 8);
1781             buf += toHexString((c & 0x00F0) >> 4);
1782             buf += toHexString(c & 0x000F);
1783         }
1784         i += U16_LENGTH(c);
1785     }
1786     return buf;
1787 }