icuSources/test/intltest/ssearch.cpp

   1 /*
   2  **********************************************************************
   3  *   Copyright (C) 2005-2008, International Business Machines
   4  *   Corporation and others.  All Rights Reserved.
   5  **********************************************************************
   6  */
   7
   8
   9 #include "unicode/utypes.h"
  10
  11 #if !UCONFIG_NO_COLLATION
  12
  13 #include "unicode/unistr.h"
  14 #include "unicode/putil.h"
  15 #include "unicode/usearch.h"
  16
  17 #include "cmemory.h"
  18 #include "unicode/coll.h"
  19 #include "unicode/tblcoll.h"
  20 #include "unicode/coleitr.h"
  21 #include "unicode/ucoleitr.h"
  22
  23 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
  24
  25 #include "unicode/uniset.h"
  26 #include "unicode/uset.h"
  27 #include "unicode/ustring.h"
  28 #include "hash.h"
  29 #include "uhash.h"
  30 #include "ucol_imp.h"
  31
  32 #include "intltest.h"
  33 #include "ssearch.h"
  34
  35 #include "xmlparser.h"
  36
  37 #include <stdlib.h>
  38 #include <string.h>
  39 #include <stdio.h>
  40
  41 char testId[100];
  42
  43 #define TEST_ASSERT(x) {if (!(x)) { \
  44     errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, testId);}}
  45
  46 #define TEST_ASSERT_M(x, m) {if (!(x)) { \
  47     errln("Failure in file %s, line %d.   \"%s\"", __FILE__, __LINE__, m);return;}}
  48
  49 #define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \
  50     errln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \
  51           __FILE__, __LINE__, testId, u_errorName(errcode));}}
  52
  53 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
  54
  55 //---------------------------------------------------------------------------
  56 //
  57 //  Test class boilerplate
  58 //
  59 //---------------------------------------------------------------------------
  60 SSearchTest::SSearchTest()
  61 {
  62 }
  63
  64 SSearchTest::~SSearchTest()
  65 {
  66 }
  67
  68 void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char *params )
  69 {
  70     if (exec) logln("TestSuite SSearchTest: ");
  71     switch (index) {
  72 #if !UCONFIG_NO_BREAK_ITERATION
  73        case 0: name = "searchTest";
  74             if (exec) searchTest();
  75             break;
  76
  77         case 1: name = "offsetTest";
  78             if (exec) offsetTest();
  79             break;
  80
  81         case 2: name = "monkeyTest";
  82             if (exec) monkeyTest(params);
  83             break;
  84 #endif
  85         default: name = "";
  86             break; //needed to end loop
  87     }
  88 }
  89
  90
  91 #if !UCONFIG_NO_BREAK_ITERATION
  92
  93 #define PATH_BUFFER_SIZE 2048
  94 const char *SSearchTest::getPath(char buffer[2048], const char *filename) {
  95     UErrorCode status = U_ZERO_ERROR;
  96     const char *testDataDirectory = IntlTest::getSourceTestData(status);
  97
  98     if (U_FAILURE(status) || strlen(testDataDirectory) + strlen(filename) + 1 >= PATH_BUFFER_SIZE) {
  99         errln("ERROR: getPath() failed - %s", u_errorName(status));
 100         return NULL;
 101     }
 102
 103     strcpy(buffer, testDataDirectory);
 104     strcat(buffer, filename);
 105     return buffer;
 106 }
 107
 108
 109 void SSearchTest::searchTest()
 110 {
 111 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
 112     UErrorCode status = U_ZERO_ERROR;
 113     char path[PATH_BUFFER_SIZE];
 114     const char *testFilePath = getPath(path, "ssearch.xml");
 115
 116     if (testFilePath == NULL) {
 117         return; /* Couldn't get path: error message already output. */
 118     }
 119
 120     UXMLParser  *parser = UXMLParser::createParser(status);
 121     TEST_ASSERT_SUCCESS(status);
 122     UXMLElement *root   = parser->parseFile(testFilePath, status);
 123     TEST_ASSERT_SUCCESS(status);
 124     if (U_FAILURE(status)) {
 125         return;
 126     }
 127
 128     const UnicodeString *debugTestCase = root->getAttribute("debug");
 129     if (debugTestCase != NULL) {
 130 //       setenv("USEARCH_DEBUG", "1", 1);
 131     }
 132
 133
 134     const UXMLElement *testCase;
 135     int32_t tc = 0;
 136
 137     while((testCase = root->nextChildElement(tc)) != NULL) {
 138
 139         if (testCase->getTagName().compare("test-case") != 0) {
 140             errln("ssearch, unrecognized XML Element in test file");
 141             continue;
 142         }
 143         const UnicodeString *id       = testCase->getAttribute("id");
 144         *testId = 0;
 145         if (id != NULL) {
 146             id->extract(0, id->length(), testId,  sizeof(testId), US_INV);
 147         }
 148
 149         // If debugging test case has been specified and this is not it, skip to next.
 150         if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) {
 151             continue;
 152         }
 153         //
 154         //  Get the requested collation strength.
 155         //    Default is tertiary if the XML attribute is missing from the test case.
 156         //
 157         const UnicodeString *strength = testCase->getAttribute("strength");
 158         UColAttributeValue collatorStrength;
 159         if      (strength==NULL)          { collatorStrength = UCOL_TERTIARY;}
 160         else if (*strength=="PRIMARY")    { collatorStrength = UCOL_PRIMARY;}
 161         else if (*strength=="SECONDARY")  { collatorStrength = UCOL_SECONDARY;}
 162         else if (*strength=="TERTIARY")   { collatorStrength = UCOL_TERTIARY;}
 163         else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;}
 164         else if (*strength=="IDENTICAL")  { collatorStrength = UCOL_IDENTICAL;}
 165         else {
 166             // Bogus value supplied for strength.  Shouldn't happen, even from
 167             //  typos, if the  XML source has been validated.
 168             //  This assert is a little deceiving in that strength can be
 169             //   any of the allowed values, not just TERTIARY, but it will
 170             //   do the job of getting the error output.
 171             TEST_ASSERT(*strength=="TERTIARY")
 172         }
 173
 174         //
 175         // Get the collator normalization flag.  Default is UCOL_OFF.
 176         //
 177         UColAttributeValue normalize = UCOL_OFF;
 178         const UnicodeString *norm = testCase->getAttribute("norm");
 179         TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF");
 180         if (norm!=NULL && *norm=="ON") {
 181             normalize = UCOL_ON;
 182         }
 183
 184         const UnicodeString defLocale("en");
 185         char  clocale[100];
 186         const UnicodeString *locale   = testCase->getAttribute("locale");
 187         if (locale == NULL || locale->length()==0) {
 188             locale = &defLocale;
 189         };
 190         locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL);
 191
 192
 193         UnicodeString  text;
 194         UnicodeString  target;
 195         UnicodeString  pattern;
 196         int32_t        expectedMatchStart = -1;
 197         int32_t        expectedMatchLimit = -1;
 198         const UXMLElement  *n;
 199         int                nodeCount = 0;
 200
 201         n = testCase->getChildElement("pattern");
 202         TEST_ASSERT(n != NULL);
 203         if (n==NULL) {
 204             continue;
 205         }
 206         text = n->getText(FALSE);
 207         text = text.unescape();
 208         pattern.append(text);
 209         nodeCount++;
 210
 211         n = testCase->getChildElement("pre");
 212         if (n!=NULL) {
 213             text = n->getText(FALSE);
 214             text = text.unescape();
 215             target.append(text);
 216             nodeCount++;
 217         }
 218
 219         n = testCase->getChildElement("m");
 220         if (n!=NULL) {
 221             expectedMatchStart = target.length();
 222             text = n->getText(FALSE);
 223             text = text.unescape();
 224             target.append(text);
 225             expectedMatchLimit = target.length();
 226             nodeCount++;
 227         }
 228
 229         n = testCase->getChildElement("post");
 230         if (n!=NULL) {
 231             text = n->getText(FALSE);
 232             text = text.unescape();
 233             target.append(text);
 234             nodeCount++;
 235         }
 236
 237         //  Check that there weren't extra things in the XML
 238         TEST_ASSERT(nodeCount == testCase->countChildren());
 239
 240         // Open a collotor and StringSearch based on the parameters
 241         //   obtained from the XML.
 242         //
 243         status = U_ZERO_ERROR;
 244         UCollator *collator = ucol_open(clocale, &status);
 245         ucol_setStrength(collator, collatorStrength);
 246         ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, normalize, &status);
 247         UStringSearch *uss = usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
 248                                          target.getBuffer(), target.length(),
 249                                          collator,
 250                                          NULL,     // the break iterator
 251                                          &status);
 252
 253         TEST_ASSERT_SUCCESS(status);
 254         if (U_FAILURE(status)) {
 255             usearch_close(uss);
 256             ucol_close(collator);
 257             continue;
 258         }
 259
 260         int32_t foundStart = 0;
 261         int32_t foundLimit = 0;
 262         UBool   foundMatch;
 263
 264         //
 265         // Do the search, check the match result against the expected results.
 266         //
 267         foundMatch= usearch_search(uss, 0, &foundStart, &foundLimit, &status);
 268         TEST_ASSERT_SUCCESS(status);
 269         if (foundMatch && expectedMatchStart<0 ||
 270             foundStart != expectedMatchStart   ||
 271             foundLimit != expectedMatchLimit) {
 272                 TEST_ASSERT(FALSE);   //  ouput generic error position
 273                 infoln("Found, expected match start = %d, %d \n"
 274                        "Found, expected match limit = %d, %d",
 275                 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
 276         }
 277
 278         // In case there are other matches...
 279         // (should we only do this if the test case passed?)
 280         while (foundMatch) {
 281             expectedMatchStart = foundStart;
 282             expectedMatchLimit = foundLimit;
 283
 284             foundMatch = usearch_search(uss, foundLimit, &foundStart, &foundLimit, &status);
 285         }
 286
 287         usearch_close(uss);
 288
 289         uss = usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
 290             target.getBuffer(), target.length(),
 291             collator,
 292             NULL,
 293             &status);
 294
 295         //
 296         // Do the backwards search, check the match result against the expected results.
 297         //
 298         foundMatch= usearch_searchBackwards(uss, target.length(), &foundStart, &foundLimit, &status);
 299         TEST_ASSERT_SUCCESS(status);
 300         if (foundMatch && expectedMatchStart<0 ||
 301             foundStart != expectedMatchStart   ||
 302             foundLimit != expectedMatchLimit) {
 303                 TEST_ASSERT(FALSE);   //  ouput generic error position
 304                 infoln("Found, expected backwards match start = %d, %d \n"
 305                        "Found, expected backwards match limit = %d, %d",
 306                 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
 307         }
 308
 309         usearch_close(uss);
 310         ucol_close(collator);
 311     }
 312
 313     delete root;
 314     delete parser;
 315 #endif
 316 }
 317
 318 struct Order
 319 {
 320     int32_t order;
 321     int32_t lowOffset;
 322     int32_t highOffset;
 323 };
 324
 325 class OrderList
 326 {
 327 public:
 328     OrderList();
 329     OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset = 0);
 330     ~OrderList();
 331
 332     int32_t size(void) const;
 333     void add(int32_t order, int32_t low, int32_t high);
 334     const Order *get(int32_t index) const;
 335     int32_t getLowOffset(int32_t index) const;
 336     int32_t getHighOffset(int32_t index) const;
 337     int32_t getOrder(int32_t index) const;
 338     void reverse(void);
 339     UBool compare(const OrderList &other) const;
 340     UBool matchesAt(int32_t offset, const OrderList &other) const;
 341
 342 private:
 343     Order *list;
 344     int32_t listMax;
 345     int32_t listSize;
 346 };
 347
 348 OrderList::OrderList()
 349   : list(NULL), listSize(0), listMax(16)
 350 {
 351     list = new Order[listMax];
 352 }
 353
 354 OrderList::OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset)
 355     : list(NULL), listMax(16), listSize(0)
 356 {
 357     UErrorCode status = U_ZERO_ERROR;
 358     UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
 359     uint32_t strengthMask = 0;
 360     int32_t order, low, high;
 361
 362     switch (ucol_getStrength(coll))
 363     {
 364     default:
 365         strengthMask |= UCOL_TERTIARYORDERMASK;
 366         /* fall through */
 367
 368     case UCOL_SECONDARY:
 369         strengthMask |= UCOL_SECONDARYORDERMASK;
 370         /* fall through */
 371
 372     case UCOL_PRIMARY:
 373         strengthMask |= UCOL_PRIMARYORDERMASK;
 374     }
 375
 376     list = new Order[listMax];
 377
 378     ucol_setOffset(elems, stringOffset, &status);
 379
 380     do {
 381         low   = ucol_getOffset(elems);
 382         order = ucol_next(elems, &status);
 383         high  = ucol_getOffset(elems);
 384
 385         if (order != UCOL_NULLORDER) {
 386             order &= strengthMask;
 387         }
 388
 389         if (order != UCOL_IGNORABLE) {
 390             add(order, low, high);
 391         }
 392     } while (order != UCOL_NULLORDER);
 393
 394     ucol_closeElements(elems);
 395 }
 396
 397 OrderList::~OrderList()
 398 {
 399     delete[] list;
 400 }
 401
 402 void OrderList::add(int32_t order, int32_t low, int32_t high)
 403 {
 404     if (listSize >= listMax) {
 405         listMax *= 2;
 406
 407         Order *newList = new Order[listMax];
 408
 409         uprv_memcpy(newList, list, listSize * sizeof(Order));
 410         delete[] list;
 411         list = newList;
 412     }
 413
 414     list[listSize].order      = order;
 415     list[listSize].lowOffset  = low;
 416     list[listSize].highOffset = high;
 417
 418     listSize += 1;
 419 }
 420
 421 const Order *OrderList::get(int32_t index) const
 422 {
 423     if (index >= listSize) {
 424         return NULL;
 425     }
 426
 427     return &list[index];
 428 }
 429
 430 int32_t OrderList::getLowOffset(int32_t index) const
 431 {
 432     const Order *order = get(index);
 433
 434     if (order != NULL) {
 435         return order->lowOffset;
 436     }
 437
 438     return -1;
 439 }
 440
 441 int32_t OrderList::getHighOffset(int32_t index) const
 442 {
 443     const Order *order = get(index);
 444
 445     if (order != NULL) {
 446         return order->highOffset;
 447     }
 448
 449     return -1;
 450 }
 451
 452 int32_t OrderList::getOrder(int32_t index) const
 453 {
 454     const Order *order = get(index);
 455
 456     if (order != NULL) {
 457         return order->order;
 458     }
 459
 460     return UCOL_NULLORDER;
 461 }
 462
 463 int32_t OrderList::size() const
 464 {
 465     return listSize;
 466 }
 467
 468 void OrderList::reverse()
 469 {
 470     for(int32_t f = 0, b = listSize - 1; f < b; f += 1, b -= 1) {
 471         Order swap = list[b];
 472
 473         list[b] = list[f];
 474         list[f] = swap;
 475     }
 476 }
 477
 478 UBool OrderList::compare(const OrderList &other) const
 479 {
 480     if (listSize != other.listSize) {
 481         return FALSE;
 482     }
 483
 484     for(int32_t i = 0; i < listSize; i += 1) {
 485         if (list[i].order  != other.list[i].order ||
 486             list[i].lowOffset != other.list[i].lowOffset ||
 487             list[i].highOffset != other.list[i].highOffset) {
 488                 return FALSE;
 489         }
 490     }
 491
 492     return TRUE;
 493 }
 494
 495 UBool OrderList::matchesAt(int32_t offset, const OrderList &other) const
 496 {
 497     // NOTE: sizes include the NULLORDER, which we don't want to compare.
 498     int32_t otherSize = other.size() - 1;
 499
 500     if (listSize - 1 - offset < otherSize) {
 501         return FALSE;
 502     }
 503
 504     for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) {
 505         if (getOrder(i) != other.getOrder(j)) {
 506             return FALSE;
 507         }
 508     }
 509
 510     return TRUE;
 511 }
 512
 513 static char *printOffsets(char *buffer, OrderList &list)
 514 {
 515     int32_t size = list.size();
 516     char *s = buffer;
 517
 518     for(int32_t i = 0; i < size; i += 1) {
 519         const Order *order = list.get(i);
 520
 521         if (i != 0) {
 522             s += sprintf(s, ", ");
 523         }
 524
 525         s += sprintf(s, "(%d, %d)", order->lowOffset, order->highOffset);
 526     }
 527
 528     return buffer;
 529 }
 530
 531 static char *printOrders(char *buffer, OrderList &list)
 532 {
 533     int32_t size = list.size();
 534     char *s = buffer;
 535
 536     for(int32_t i = 0; i < size; i += 1) {
 537         const Order *order = list.get(i);
 538
 539         if (i != 0) {
 540             s += sprintf(s, ", ");
 541         }
 542
 543         s += sprintf(s, "%8.8X", order->order);
 544     }
 545
 546     return buffer;
 547 }
 548
 549 void SSearchTest::offsetTest()
 550 {
 551     const char *test[] = {
 552         "\\ua191\\u16ef\\u2036\\u017a",
 553
 554 #if 0
 555         // This results in a complex interaction between contraction,
 556         // expansion and normalization that confuses the backwards offset fixups.
 557         "\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
 558 #endif
 559
 560         "\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
 561         "\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3",
 562
 563         "\\u02FE\\u02FF"
 564         "\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\\u030A\\u030B\\u030C\\u030D\\u030E\\u030F"
 565         "\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\\u031A\\u031B\\u031C\\u031D\\u031E\\u031F"
 566         "\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F"
 567         "\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\\u033A\\u033B\\u033C\\u033D\\u033E\\u033F"
 568         "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\\u034A\\u034B\\u034C\\u034D\\u034E",
 569
 570         "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318",
 571         "abc\\u0E41\\u0301\\u0316",
 572                 "abc\\u0E41\\u0316\\u0301",
 573                 "\\u0E41\\u0301\\u0316",
 574                 "\\u0E41\\u0316\\u0301",
 575                 "a\\u0301\\u0316",
 576                 "a\\u0316\\u0301",
 577                 "\\uAC52\\uAC53",
 578                 "\\u34CA\\u34CB",
 579                 "\\u11ED\\u11EE",
 580                 "\\u30C3\\u30D0",
 581                 "p\\u00E9ch\\u00E9",
 582         "a\\u0301\\u0325",
 583         "a\\u0300\\u0325",
 584         "a\\u0325\\u0300",
 585         "A\\u0323\\u0300B",
 586         "A\\u0300\\u0323B",
 587         "A\\u0301\\u0323B",
 588         "A\\u0302\\u0301\\u0323B",
 589         "abc",
 590         "ab\\u0300c",
 591         "ab\\u0300\\u0323c",
 592         " \\uD800\\uDC00\\uDC00",
 593         "a\\uD800\\uDC00\\uDC00",
 594         "A\\u0301\\u0301",
 595         "A\\u0301\\u0323",
 596         "A\\u0301\\u0323B",
 597         "B\\u0301\\u0323C",
 598         "A\\u0300\\u0323B",
 599         "\\u0301A\\u0301\\u0301",
 600         "abcd\\r\\u0301",
 601         "p\\u00EAche",
 602         "pe\\u0302che",
 603     };
 604
 605     int32_t testCount = ARRAY_SIZE(test);
 606     UErrorCode status = U_ZERO_ERROR;
 607     RuleBasedCollator *col = (RuleBasedCollator *) Collator::createInstance(Locale::getEnglish(), status);
 608     if (U_FAILURE(status)) {
 609         errln("Failed to create collator in offsetTest!");
 610         return;
 611     }
 612     char buffer[4096];  // A bit of a hack... just happens to be long enough for all the test cases...
 613                         // We could allocate one that's the right size by (CE_count * 10) + 2
 614                         // 10 chars is enough room for 8 hex digits plus ", ". 2 extra chars for "[" and "]"
 615
 616     col->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
 617
 618     for(int32_t i = 0; i < testCount; i += 1) {
 619         UnicodeString ts = CharsToUnicodeString(test[i]);
 620         CollationElementIterator *iter = col->createCollationElementIterator(ts);
 621         OrderList forwardList;
 622         OrderList backwardList;
 623         int32_t order, low, high;
 624
 625         do {
 626             low   = iter->getOffset();
 627             order = iter->next(status);
 628             high  = iter->getOffset();
 629
 630             forwardList.add(order, low, high);
 631         } while (order != CollationElementIterator::NULLORDER);
 632
 633         iter->reset();
 634         iter->setOffset(ts.length(), status);
 635
 636         backwardList.add(CollationElementIterator::NULLORDER, iter->getOffset(), iter->getOffset());
 637
 638         do {
 639             high  = iter->getOffset();
 640             order = iter->previous(status);
 641             low   = iter->getOffset();
 642
 643             if (order == CollationElementIterator::NULLORDER) {
 644                 break;
 645             }
 646
 647             backwardList.add(order, low, high);
 648         } while (TRUE);
 649
 650         backwardList.reverse();
 651
 652         if (forwardList.compare(backwardList)) {
 653             logln("Works with \"%s\"", test[i]);
 654             logln("Forward offsets:  [%s]", printOffsets(buffer, forwardList));
 655 //          logln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
 656
 657             logln("Forward CEs:  [%s]", printOrders(buffer, forwardList));
 658 //          logln("Backward CEs: [%s]", printOrders(buffer, backwardList));
 659
 660             logln();
 661         } else {
 662             errln("Fails with \"%s\"", test[i]);
 663             infoln("Forward offsets:  [%s]", printOffsets(buffer, forwardList));
 664             infoln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
 665
 666             infoln("Forward CEs:  [%s]", printOrders(buffer, forwardList));
 667             infoln("Backward CEs: [%s]", printOrders(buffer, backwardList));
 668
 669             infoln();
 670         }
 671         delete iter;
 672     }
 673     delete col;
 674 }
 675
 676 class CEList
 677 {
 678 public:
 679     CEList(UCollator *coll, const UnicodeString &string);
 680     ~CEList();
 681
 682     int32_t size() const;
 683     int32_t get(int32_t index) const;
 684     UBool matchesAt(int32_t offset, const CEList *other) const;
 685
 686 private:
 687     void add(int32_t ce);
 688
 689     int32_t *ces;
 690     int32_t listMax;
 691     int32_t listSize;
 692 };
 693
 694 CEList::CEList(UCollator *coll, const UnicodeString &string)
 695     : ces(NULL), listMax(8), listSize(0)
 696 {
 697     UErrorCode status = U_ZERO_ERROR;
 698     UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
 699     uint32_t strengthMask = 0;
 700     int32_t order;
 701
 702 #if 0
 703     switch (ucol_getStrength(coll))
 704     {
 705     default:
 706         strengthMask |= UCOL_TERTIARYORDERMASK;
 707         /* fall through */
 708
 709     case UCOL_SECONDARY:
 710         strengthMask |= UCOL_SECONDARYORDERMASK;
 711         /* fall through */
 712
 713     case UCOL_PRIMARY:
 714         strengthMask |= UCOL_PRIMARYORDERMASK;
 715     }
 716 #else
 717     strengthMask = UCOL_PRIMARYORDERMASK;
 718 #endif
 719
 720     ces = new int32_t[listMax];
 721
 722     while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
 723         order &= strengthMask;
 724
 725         if (order == UCOL_IGNORABLE) {
 726             continue;
 727         }
 728
 729         add(order);
 730     }
 731
 732     ucol_closeElements(elems);
 733 }
 734
 735 CEList::~CEList()
 736 {
 737     delete[] ces;
 738 }
 739
 740 void CEList::add(int32_t ce)
 741 {
 742     if (listSize >= listMax) {
 743         listMax *= 2;
 744
 745         int32_t *newCEs = new int32_t[listMax];
 746
 747         uprv_memcpy(newCEs, ces, listSize * sizeof(int32_t));
 748         delete[] ces;
 749         ces = newCEs;
 750     }
 751
 752     ces[listSize++] = ce;
 753 }
 754
 755 int32_t CEList::get(int32_t index) const
 756 {
 757     if (index >= 0 && index < listSize) {
 758         return ces[index];
 759     }
 760
 761     return -1;
 762 }
 763
 764 UBool CEList::matchesAt(int32_t offset, const CEList *other) const
 765 {
 766     if (listSize - offset < other->size()) {
 767         return FALSE;
 768     }
 769
 770     for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
 771         if (ces[i] != other->get(j)) {
 772             return FALSE;
 773         }
 774     }
 775
 776     return TRUE;
 777 }
 778
 779 int32_t CEList::size() const
 780 {
 781     return listSize;
 782 }
 783
 784 class StringList
 785 {
 786 public:
 787     StringList();
 788     ~StringList();
 789
 790     void add(const UnicodeString *string);
 791     void add(const UChar *chars, int32_t count);
 792     const UnicodeString *get(int32_t index) const;
 793     int32_t size() const;
 794
 795 private:
 796     UnicodeString *strings;
 797     int32_t listMax;
 798     int32_t listSize;
 799 };
 800
 801 StringList::StringList()
 802     : strings(NULL), listMax(16), listSize(0)
 803 {
 804     strings = new UnicodeString [listMax];
 805 }
 806
 807 StringList::~StringList()
 808 {
 809     delete[] strings;
 810 }
 811
 812 void StringList::add(const UnicodeString *string)
 813 {
 814     if (listSize >= listMax) {
 815         listMax *= 2;
 816
 817         UnicodeString *newStrings = new UnicodeString[listMax];
 818
 819         uprv_memcpy(newStrings, strings, listSize * sizeof(UnicodeString));
 820
 821         delete[] strings;
 822         strings = newStrings;
 823     }
 824
 825     // The ctor initialized all the strings in
 826     // the array to empty strings, so this
 827     // is the same as copying the source string.
 828     strings[listSize++].append(*string);
 829 }
 830
 831 void StringList::add(const UChar *chars, int32_t count)
 832 {
 833     const UnicodeString string(chars, count);
 834
 835     add(&string);
 836 }
 837
 838 const UnicodeString *StringList::get(int32_t index) const
 839 {
 840     if (index >= 0 && index < listSize) {
 841         return &strings[index];
 842     }
 843
 844     return NULL;
 845 }
 846
 847 int32_t StringList::size() const
 848 {
 849     return listSize;
 850 }
 851
 852 class CEToStringsMap
 853 {
 854 public:
 855
 856     CEToStringsMap();
 857     ~CEToStringsMap();
 858
 859     void put(int32_t ce, UnicodeString *string);
 860     StringList *getStringList(int32_t ce) const;
 861
 862 private:
 863
 864     static void deleteStringList(void *obj);
 865     void putStringList(int32_t ce, StringList *stringList);
 866     UHashtable *map;
 867 };
 868
 869 CEToStringsMap::CEToStringsMap()
 870 {
 871     UErrorCode status = U_ZERO_ERROR;
 872
 873     map = uhash_open(uhash_hashLong, uhash_compareLong,
 874                      uhash_compareCaselessUnicodeString,
 875                      &status);
 876
 877     uhash_setValueDeleter(map, deleteStringList);
 878 }
 879
 880 CEToStringsMap::~CEToStringsMap()
 881 {
 882     uhash_close(map);
 883 }
 884
 885 void CEToStringsMap::put(int32_t ce, UnicodeString *string)
 886 {
 887     StringList *strings = getStringList(ce);
 888
 889     if (strings == NULL) {
 890         strings = new StringList();
 891         putStringList(ce, strings);
 892     }
 893
 894     strings->add(string);
 895 }
 896
 897 StringList *CEToStringsMap::getStringList(int32_t ce) const
 898 {
 899     return (StringList *) uhash_iget(map, ce);
 900 }
 901
 902 void CEToStringsMap::putStringList(int32_t ce, StringList *stringList)
 903 {
 904     UErrorCode status = U_ZERO_ERROR;
 905
 906     uhash_iput(map, ce, (void *) stringList, &status);
 907 }
 908
 909 void CEToStringsMap::deleteStringList(void *obj)
 910 {
 911     StringList *strings = (StringList *) obj;
 912
 913     delete strings;
 914 }
 915
 916 class StringToCEsMap
 917 {
 918 public:
 919     StringToCEsMap();
 920     ~StringToCEsMap();
 921
 922     void put(const UnicodeString *string, const CEList *ces);
 923     const CEList *get(const UnicodeString *string);
 924
 925 private:
 926
 927     static void deleteCEList(void *obj);
 928     static void deleteUnicodeStringKey(void *obj);
 929
 930     UHashtable *map;
 931 };
 932
 933 StringToCEsMap::StringToCEsMap()
 934 {
 935     UErrorCode status = U_ZERO_ERROR;
 936
 937     map = uhash_open(uhash_hashCaselessUnicodeString,
 938                      uhash_compareCaselessUnicodeString,
 939                      uhash_compareLong,
 940                      &status);
 941
 942     uhash_setValueDeleter(map, deleteCEList);
 943     uhash_setKeyDeleter(map, deleteUnicodeStringKey);
 944 }
 945
 946 StringToCEsMap::~StringToCEsMap()
 947 {
 948     uhash_close(map);
 949 }
 950
 951 void StringToCEsMap::put(const UnicodeString *string, const CEList *ces)
 952 {
 953     UErrorCode status = U_ZERO_ERROR;
 954
 955     uhash_put(map, (void *) string, (void *) ces, &status);
 956 }
 957
 958 const CEList *StringToCEsMap::get(const UnicodeString *string)
 959 {
 960     return (const CEList *) uhash_get(map, string);
 961 }
 962
 963 void StringToCEsMap::deleteCEList(void *obj)
 964 {
 965     CEList *list = (CEList *) obj;
 966
 967     delete list;
 968 }
 969
 970 void StringToCEsMap::deleteUnicodeStringKey(void *obj)
 971 {
 972     UnicodeString *key = (UnicodeString *) obj;
 973
 974     delete key;
 975 }
 976
 977 static void buildData(UCollator *coll, USet *charsToTest, StringToCEsMap *charsToCEList, CEToStringsMap *ceToCharsStartingWith)
 978 {
 979     int32_t itemCount = uset_getItemCount(charsToTest);
 980     UErrorCode status = U_ZERO_ERROR;
 981
 982     for(int32_t item = 0; item < itemCount; item += 1) {
 983         UChar32 start = 0, end = 0;
 984         UChar buffer[16];
 985         int32_t len = uset_getItem(charsToTest, item, &start, &end,
 986                                    buffer, 16, &status);
 987
 988         if (len == 0) {
 989             for (UChar32 ch = start; ch <= end; ch += 1) {
 990                 UnicodeString *st = new UnicodeString(ch);
 991                 CEList *ceList = new CEList(coll, *st);
 992
 993                 charsToCEList->put(st, ceList);
 994                 ceToCharsStartingWith->put(ceList->get(0), st);
 995             }
 996         } else if (len > 0) {
 997             UnicodeString *st = new UnicodeString(buffer, len);
 998             CEList *ceList = new CEList(coll, *st);
 999
1000             charsToCEList->put(st, ceList);
1001             ceToCharsStartingWith->put(ceList->get(0), st);
1002         } else {
1003             // shouldn't happen...
1004         }
1005     }
1006 }
1007
1008 static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer)
1009 {
1010     for(int32_t i = 0; i < string.length(); i += 1) {
1011         UChar32 ch = string.char32At(i);
1012
1013         if (ch >= 0x0020 && ch <= 0x007F) {
1014             if (ch == 0x005C) {
1015                 buffer.append("\\\\");
1016             } else {
1017                 buffer.append(ch);
1018             }
1019         } else {
1020             char cbuffer[12];
1021
1022             if (ch <= 0xFFFFL) {
1023                 sprintf(cbuffer, "\\u%4.4X", ch);
1024             } else {
1025                 sprintf(cbuffer, "\\U%8.8X", ch);
1026             }
1027
1028             buffer.append(cbuffer);
1029         }
1030
1031         if (ch >= 0x10000L) {
1032             i += 1;
1033         }
1034     }
1035
1036     return buffer;
1037 }
1038
1039 static int32_t minLengthInChars(const CEList *ceList, int32_t offset, StringToCEsMap *charsToCEList, CEToStringsMap *ceToCharsStartingWith,
1040                                 UnicodeString &debug)
1041 {
1042     // find out shortest string for the longest sequence of ces.
1043     // needs to be refined to use dynamic programming, but will be roughly right
1044         int32_t totalStringLength = 0;
1045
1046     while (offset < ceList->size()) {
1047         int32_t ce = ceList->get(offset);
1048         int32_t bestLength = INT32_MIN;
1049         const UnicodeString *bestString = NULL;
1050         int32_t bestCeLength = 0;
1051         const StringList *strings = ceToCharsStartingWith->getStringList(ce);
1052         int32_t stringCount = strings->size();
1053
1054         for (int32_t s = 0; s < stringCount; s += 1) {
1055             const UnicodeString *string = strings->get(s);
1056             const CEList *ceList2 = charsToCEList->get(string);
1057
1058             if (ceList->matchesAt(offset, ceList2)) {
1059                 int32_t length = ceList2->size() - string->length();
1060
1061                 if (bestLength < length) {
1062                     bestLength = length;
1063                     bestCeLength = ceList2->size();
1064                     bestString = string;
1065                 }
1066             }
1067         }
1068
1069         totalStringLength += bestString->length();
1070         escape(*bestString, debug).append("/");
1071         offset += bestCeLength;
1072     }
1073
1074     debug.append((UChar)0x0000);
1075     return totalStringLength;
1076 }
1077
1078 static void minLengthTest(UCollator *coll, StringToCEsMap *charsToCEList, CEToStringsMap *ceToCharsStartingWith)
1079 {
1080     UnicodeString examples[] = {"fuss", "fiss", "affliss", "VII"};
1081     UnicodeString debug;
1082     int32_t nExamples = sizeof(examples) / sizeof(examples[0]);
1083
1084     for (int32_t s = 0; s < nExamples; s += 1) {
1085         CEList *ceList = new CEList(coll, examples[s]);
1086
1087       //infoln("%S:", examples[s].getTerminatedBuffer());
1088
1089         for(int32_t i = 0; i < examples[s].length(); i += 1) {
1090             debug.remove();
1091
1092             int32_t minLength = minLengthInChars(ceList, i, charsToCEList, ceToCharsStartingWith, debug);
1093           //infoln("\t%d\t%S", minLength, debug.getTerminatedBuffer());
1094         }
1095
1096       //infoln();
1097         delete ceList;
1098     }
1099 }
1100
1101 //----------------------------------------------------------------------------------------
1102 //
1103 //   Random Numbers.  Similar to standard lib rand() and srand()
1104 //                    Not using library to
1105 //                      1.  Get same results on all platforms.
1106 //                      2.  Get access to current seed, to more easily reproduce failures.
1107 //
1108 //---------------------------------------------------------------------------------------
// Generator state.  Callers read it to record the seed of a run and write it
// to replay one.
static uint32_t m_seed = 1;

// Advances the generator one step (linear congruential, arithmetic modulo
// 2^32) and returns bits 16..30 of the new state, i.e. a value in 0..32767.
static uint32_t m_rand()
{
    const uint32_t multiplier = 1103515245;
    const uint32_t increment  = 12345;

    m_seed = m_seed * multiplier + increment;

    return (m_seed >> 16) & 0x7FFF;
}
1116
1117 class Monkey
1118 {
1119 public:
1120     virtual void append(UnicodeString &test, UnicodeString &alternate) = 0;
1121
1122 protected:
1123     Monkey();
1124     virtual ~Monkey();
1125 };
1126
1127 Monkey::Monkey()
1128 {
1129     // ook?
1130 }
1131
1132 Monkey::~Monkey()
1133 {
1134     // ook?
1135 }
1136
1137 class SetMonkey : public Monkey
1138 {
1139 public:
1140     SetMonkey(const USet *theSet);
1141     ~SetMonkey();
1142
1143     virtual void append(UnicodeString &test, UnicodeString &alternate);
1144
1145 private:
1146     const USet *set;
1147 };
1148
1149 SetMonkey::SetMonkey(const USet *theSet)
1150     : Monkey(), set(theSet)
1151 {
1152     // ook?
1153 }
1154
1155 SetMonkey::~SetMonkey()
1156 {
1157     //ook...
1158 }
1159
1160 void SetMonkey::append(UnicodeString &test, UnicodeString &alternate)
1161 {
1162     int32_t size = uset_size(set);
1163     int32_t index = m_rand() % size;
1164     UChar32 ch = uset_charAt(set, index);
1165     UnicodeString str(ch);
1166
1167     test.append(str);
1168     alternate.append(str); // flip case, or some junk?
1169 }
1170
1171 class StringSetMonkey : public Monkey
1172 {
1173 public:
1174     StringSetMonkey(const USet *theSet, UCollator *theCollator, StringToCEsMap *theCharsToCEList, CEToStringsMap *theCeToCharsStartingWith);
1175     ~StringSetMonkey();
1176
1177     void append(UnicodeString &testCase, UnicodeString &alternate);
1178
1179 private:
1180     UnicodeString &generateAlternative(const UnicodeString &testCase, UnicodeString &alternate);
1181
1182     const USet *set;
1183     UCollator      *coll;
1184     StringToCEsMap *charsToCEList;
1185     CEToStringsMap *ceToCharsStartingWith;
1186 };
1187
1188 StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, StringToCEsMap *theCharsToCEList, CEToStringsMap *theCeToCharsStartingWith)
1189 : Monkey(), set(theSet), coll(theCollator), charsToCEList(theCharsToCEList), ceToCharsStartingWith(theCeToCharsStartingWith)
1190 {
1191     // ook.
1192 }
1193
1194 StringSetMonkey::~StringSetMonkey()
1195 {
1196     // ook?
1197 }
1198
1199 void StringSetMonkey::append(UnicodeString &testCase, UnicodeString &alternate)
1200 {
1201     int32_t itemCount = uset_getItemCount(set), len = 0;
1202     int32_t index = m_rand() % itemCount;
1203     UChar32 rangeStart = 0, rangeEnd = 0;
1204     UChar buffer[16];
1205     UErrorCode err = U_ZERO_ERROR;
1206
1207     len = uset_getItem(set, index, &rangeStart, &rangeEnd, buffer, 16, &err);
1208
1209     if (len == 0) {
1210         int32_t offset = m_rand() % (rangeEnd - rangeStart + 1);
1211         UChar32 ch = rangeStart + offset;
1212         UnicodeString str(ch);
1213
1214         testCase.append(str);
1215         generateAlternative(str, alternate);
1216     } else if (len > 0) {
1217         // should check that len < 16...
1218         UnicodeString str(buffer, len);
1219
1220         testCase.append(str);
1221         generateAlternative(str, alternate);
1222     } else {
1223         // shouldn't happen...
1224     }
1225 }
1226
1227 UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCase, UnicodeString &alternate)
1228 {
1229     // find out shortest string for the longest sequence of ces.
1230     // needs to be refined to use dynamic programming, but will be roughly right
1231     CEList ceList(coll, testCase);
1232     UnicodeString alt;
1233     int32_t offset = 0;
1234
1235     if (ceList.size() == 0) {
1236         return alternate.append(testCase);
1237     }
1238
1239     while (offset < ceList.size()) {
1240         int32_t ce = ceList.get(offset);
1241         const StringList *strings = ceToCharsStartingWith->getStringList(ce);
1242
1243         if (strings == NULL) {
1244             return alternate.append(testCase);
1245         }
1246
1247         int32_t stringCount = strings->size();
1248         int32_t tries = 0;
1249
1250         // find random string that generates the same CEList
1251         const CEList *ceList2;
1252         const UnicodeString *string;
1253
1254         do {
1255             int32_t s = m_rand() % stringCount;
1256
1257             if (tries++ > stringCount) {
1258                 alternate.append(testCase);
1259                 return alternate;
1260             }
1261
1262             string = strings->get(s);
1263             ceList2 = charsToCEList->get(string);
1264         } while (! ceList.matchesAt(offset, ceList2));
1265
1266         alt.append(*string);
1267         offset += ceList2->size();
1268     }
1269
1270     const CEList altCEs(coll, alt);
1271
1272     if (ceList.matchesAt(0, &altCEs)) {
1273         return alternate.append(alt);
1274     }
1275
1276     return alternate.append(testCase);
1277 }
1278
1279 static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyCount, UnicodeString &testCase, UnicodeString &alternate)
1280 {
1281     int32_t pieces = (m_rand() % 4) + 1;
1282     UBool matches;
1283
1284     do {
1285         testCase.remove();
1286         alternate.remove();
1287         monkeys[0]->append(testCase, alternate);
1288
1289         for(int32_t piece = 0; piece < pieces; piece += 1) {
1290             int32_t monkey = m_rand() % monkeyCount;
1291
1292             monkeys[monkey]->append(testCase, alternate);
1293         }
1294
1295         const CEList ceTest(coll, testCase);
1296         const CEList ceAlt(coll, alternate);
1297
1298         matches = ceTest.matchesAt(0, &ceAlt);
1299     } while (! matches);
1300 }
1301
1302 static inline USet *uset_openEmpty()
1303 {
1304     return uset_open(1, 0);
1305 }
1306
1307 //
1308 //  Find the next acceptable boundary following the specified starting index
1309 //     in the target text being searched.
1310 //      TODO:  refine what is an acceptable boundary.  For the moment,
1311 //             choose the next position not within a combining sequence.
1312 //
1313 static int32_t nextBoundaryAfter(const UnicodeString &string, int32_t startIndex) {
1314     const UChar *text = string.getBuffer();
1315     int32_t textLen   = string.length();
1316
1317     if (startIndex >= textLen) {
1318         return startIndex;
1319     }
1320
1321     UChar32  c;
1322     int32_t  i = startIndex;
1323
1324     U16_NEXT(text, i, textLen, c);
1325
1326     // If we are on a control character, stop without looking for combining marks.
1327     //    Control characters do not combine.
1328     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1329     if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
1330         return i;
1331     }
1332
1333     // The initial character was not a control, and can thus accept trailing
1334     //   combining characters.  Advance over however many of them there are.
1335     int32_t  indexOfLastCharChecked;
1336
1337     for (;;) {
1338         indexOfLastCharChecked = i;
1339
1340         if (i>=textLen) {
1341             break;
1342         }
1343
1344         U16_NEXT(text, i, textLen, c);
1345         gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1346
1347         if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
1348             break;
1349         }
1350     }
1351
1352     return indexOfLastCharChecked;
1353 }
1354
1355 static UBool isInCombiningSequence(const UnicodeString &string, int32_t index) {
1356     const UChar *text = string.getBuffer();
1357     int32_t textLen   = string.length();
1358
1359     if (index>=textLen || index<=0) {
1360         return FALSE;
1361     }
1362
1363     // If the character at the current index is not a GRAPHEME_EXTEND
1364     //    then we can not be within a combining sequence.
1365     UChar32  c;
1366     U16_GET(text, 0, index, textLen, c);
1367     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1368     if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
1369         return FALSE;
1370     }
1371
1372     // We are at a combining mark.  If the preceding character is anything
1373     //   except a CONTROL, CR or LF, we are in a combining sequence.
1374     U16_PREV(text, 0, index, c);
1375     gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1376
1377     return !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
1378 }
1379
1380 static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd)
1381 {
1382     UErrorCode      status = U_ZERO_ERROR;
1383     OrderList       targetOrders(coll, target, offset);
1384     OrderList       patternOrders(coll, pattern);
1385     int32_t         targetSize  = targetOrders.size() - 1;
1386     int32_t         patternSize = patternOrders.size() - 1;
1387     UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocale(coll, ULOC_VALID_LOCALE, &status),
1388                                                                                       target.getBuffer(), target.length(), &status);
1389
1390     if (patternSize == 0) {
1391         matchStart = matchEnd = 0;
1392         return FALSE;
1393     }
1394
1395     matchStart = matchEnd = -1;
1396
1397     for(int32_t i = 0; i < targetSize; i += 1) {
1398         if (targetOrders.matchesAt(i, patternOrders)) {
1399             int32_t start    = targetOrders.getLowOffset(i);
1400             int32_t maxLimit = targetOrders.getLowOffset(i + patternSize);
1401             int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1);
1402
1403             // if the low and high offsets of the first CE in
1404             // the match are the same, it means that the match
1405             // starts in the middle of an expansion - all but
1406             // the first CE of the expansion will have the offset
1407             // of the following character.
1408             if (start == targetOrders.getHighOffset(i)) {
1409                 continue;
1410             }
1411
1412             // Make sure match starts on a grapheme boundary
1413             if (! ubrk_isBoundary(charBreakIterator, start)) {
1414                 continue;
1415             }
1416
1417             // If the low and high offsets of the CE after the match
1418             // are the same, it means that the match ends in the middle
1419             // of an expansion sequence.
1420             if (maxLimit == targetOrders.getHighOffset(i + patternSize) &&
1421                 targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) {
1422                 continue;
1423             }
1424
1425             int32_t mend = maxLimit;
1426
1427             // Find the first grapheme break after the character index
1428             // of the last CE in the match. If it's after character index
1429             // that's after the last CE in the match, use that index
1430             // as the end of the match.
1431             if (minLimit < maxLimit) {
1432                 int32_t nba = ubrk_following(charBreakIterator, minLimit);
1433
1434                 if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) {
1435                     mend = nba;
1436                 }
1437             }
1438
1439             if (mend > maxLimit) {
1440                 continue;
1441             }
1442
1443             if (! ubrk_isBoundary(charBreakIterator, mend)) {
1444                 continue;
1445             }
1446
1447             matchStart = start;
1448             matchEnd   = mend;
1449
1450             ubrk_close(charBreakIterator);
1451             return TRUE;
1452         }
1453     }
1454
1455     ubrk_close(charBreakIterator);
1456     return FALSE;
1457 }
1458
1459 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1460 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
1461     int32_t val = defaultVal;
1462
1463     name.append(" *= *(-?\\d+)");
1464
1465     UErrorCode status = U_ZERO_ERROR;
1466     RegexMatcher m(name, params, 0, status);
1467
1468     if (m.find()) {
1469         // The param exists.  Convert the string to an int.
1470         char valString[100];
1471         int32_t paramLength = m.end(1, status) - m.start(1, status);
1472
1473         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
1474             paramLength = (int32_t)(sizeof(valString)-2);
1475         }
1476
1477         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
1478         val = strtol(valString,  NULL, 10);
1479
1480         // Delete this parameter from the params string.
1481         m.reset();
1482         params = m.replaceFirst("", status);
1483     }
1484
1485   //U_ASSERT(U_SUCCESS(status));
1486     if (! U_SUCCESS(status)) {
1487         val = defaultVal;
1488     }
1489
1490     return val;
1491 }
1492 #endif
1493
1494 #if !UCONFIG_NO_COLLATION
1495 int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
1496                                     const char *name, const char *strength, uint32_t seed)
1497 {
1498     UErrorCode status = U_ZERO_ERROR;
1499     int32_t actualStart = -1, actualEnd = -1;
1500   //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
1501     int32_t expectedStart = -1, expectedEnd = -1;
1502     int32_t notFoundCount = 0;
1503     UStringSearch *uss = usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
1504                                 testCase.getBuffer(), testCase.length(),
1505                                 coll,
1506                                 NULL,     // the break iterator
1507                                 &status);
1508
1509     // **** TODO: find *all* matches, not just first one ****
1510     simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd);
1511
1512 #if 0
1513     usearch_search(uss, 0, &actualStart, &actualEnd, &status);
1514 #else
1515     actualStart = usearch_next(uss, &status);
1516     actualEnd   = actualStart + usearch_getMatchedLength(uss);
1517 #endif
1518
1519     if (actualStart != expectedStart || actualEnd != expectedEnd) {
1520         errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
1521               "    strength=%s seed=%d",
1522               name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
1523     }
1524
1525     if (expectedStart == -1 && actualStart == -1) {
1526         notFoundCount += 1;
1527     }
1528
1529     // **** TODO: find *all* matches, not just first one ****
1530     simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd);
1531
1532     usearch_setPattern(uss, altPattern.getBuffer(), altPattern.length(), &status);
1533
1534 #if 0
1535     usearch_search(uss, 0, &actualStart, &actualEnd, &status);
1536 #else
1537     usearch_reset(uss);
1538     actualStart = usearch_next(uss, &status);
1539     actualEnd   = actualStart + usearch_getMatchedLength(uss);
1540 #endif
1541
1542     if (actualStart != expectedStart || actualEnd != expectedEnd) {
1543         errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
1544               "    strength=%s seed=%d",
1545               name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
1546     }
1547
1548     if (expectedStart == -1 && actualStart == -1) {
1549         notFoundCount += 1;
1550     }
1551
1552     usearch_close(uss);
1553
1554     return notFoundCount;
1555 }
1556 #endif
1557
1558 void SSearchTest::monkeyTest(char *params)
1559 {
1560     // ook!
1561     UErrorCode status = U_ZERO_ERROR;
1562     U_STRING_DECL(test_pattern, "[[:assigned:]-[:ideographic:]-[:hangul:]-[:c:]]", 47);
1563     U_STRING_INIT(test_pattern, "[[:assigned:]-[:ideographic:]-[:hangul:]-[:c:]]", 47);
1564     UCollator *coll = ucol_open(NULL, &status);
1565     if (U_FAILURE(status)) {
1566         errln("Failed to create collator in MonkeyTest!");
1567         return;
1568     }
1569     USet *charsToTest  = uset_openPattern(test_pattern, 47, &status);
1570     USet *expansions   = uset_openEmpty();
1571     USet *contractions = uset_openEmpty();
1572     StringToCEsMap *charsToCEList = new StringToCEsMap();
1573     CEToStringsMap *ceToCharsStartingWith = new CEToStringsMap();
1574
1575     ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
1576
1577     uset_addAll(charsToTest, contractions);
1578     uset_addAll(charsToTest, expansions);
1579
1580     // TODO: set strength to UCOL_PRIMARY, change CEList to use strength?
1581     buildData(coll, charsToTest, charsToCEList, ceToCharsStartingWith);
1582
1583     U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
1584     U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
1585     USet *letters = uset_openPattern(letter_pattern, 39, &status);
1586     SetMonkey letterMonkey(letters);
1587     StringSetMonkey contractionMonkey(contractions, coll, charsToCEList, ceToCharsStartingWith);
1588     StringSetMonkey expansionMonkey(expansions, coll, charsToCEList, ceToCharsStartingWith);
1589     UnicodeString testCase;
1590     UnicodeString alternate;
1591     UnicodeString pattern, altPattern;
1592     UnicodeString prefix, altPrefix;
1593     UnicodeString suffix, altSuffix;
1594
1595     Monkey *monkeys[] = {
1596         &letterMonkey,
1597         &contractionMonkey,
1598         &expansionMonkey,
1599         &contractionMonkey,
1600         &expansionMonkey,
1601         &contractionMonkey,
1602         &expansionMonkey,
1603         &contractionMonkey,
1604         &expansionMonkey};
1605     int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]);
1606     int32_t nonMatchCount = 0;
1607
1608     UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY};
1609     const char *strengthNames[] = {"primary", "secondary", "tertiary"};
1610     int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]);
1611     int32_t loopCount = quick? 1000 : 10000;
1612     int32_t firstStrength = 0;
1613     int32_t lastStrength  = strengthCount - 1;
1614
1615     if (params != NULL) {
1616 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1617         UnicodeString p(params);
1618
1619         loopCount = getIntParam("loop", p, loopCount);
1620         m_seed    = getIntParam("seed", p, m_seed);
1621
1622         RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status);
1623         if (m.find()) {
1624             UnicodeString breakType = m.group(1, status);
1625
1626             for (int32_t s = 0; s < strengthCount; s += 1) {
1627                 if (breakType == strengthNames[s]) {
1628                     firstStrength = lastStrength = s;
1629                     break;
1630                 }
1631             }
1632
1633             m.reset();
1634             p = m.replaceFirst("", status);
1635         }
1636
1637         if (RegexMatcher("\\S", p, 0, status).find()) {
1638             // Each option is stripped out of the option string as it is processed.
1639             // All options have been checked.  The option string should have been completely emptied..
1640             char buf[100];
1641             p.extract(buf, sizeof(buf), NULL, status);
1642             buf[sizeof(buf)-1] = 0;
1643             errln("Unrecognized or extra parameter:  %s\n", buf);
1644             return;
1645         }
1646 #else
1647         infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
1648 #endif
1649     }
1650
1651     for(int32_t s = firstStrength; s <= lastStrength; s += 1) {
1652         int32_t notFoundCount = 0;
1653
1654         ucol_setStrength(coll, strengths[s]);
1655
1656         // TODO: try alternate prefix and suffix too?
1657         // TODO: alterntaes are only equal at primary strength. Is this OK?
1658         for(int32_t t = 0; t < 10000; t += 1) {
1659             uint32_t seed = m_seed;
1660             int32_t  nmc = 0;
1661
1662             generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern);
1663             generateTestCase(coll, monkeys, monkeyCount, prefix,  altPrefix);
1664             generateTestCase(coll, monkeys, monkeyCount, suffix,  altSuffix);
1665
1666             // pattern
1667             notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern, "pattern", strengthNames[s], seed);
1668
1669             testCase.remove();
1670             testCase.append(prefix);
1671             testCase.append(/*alt*/pattern);
1672
1673             // prefix + pattern
1674             notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern", strengthNames[s], seed);
1675
1676             testCase.append(suffix);
1677
1678             // prefix + pattern + suffix
1679             notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern + suffix", strengthNames[s], seed);
1680
1681             testCase.remove();
1682             testCase.append(pattern);
1683             testCase.append(suffix);
1684
1685             // pattern + suffix
1686             notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed);
1687         }
1688
1689         logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount);
1690     }
1691
1692     delete ceToCharsStartingWith;
1693     delete charsToCEList;
1694
1695     uset_close(contractions);
1696     uset_close(expansions);
1697     uset_close(charsToTest);
1698     uset_close(letters);
1699
1700     ucol_close(coll);
1701 }
1702
1703 #endif
1704
1705 #endif