icuSources/test/intltest/ssearch.cpp

   1 /*
   2  **********************************************************************
   3  *   Copyright (C) 2005-2013, International Business Machines
   4  *   Corporation and others.  All Rights Reserved.
   5  **********************************************************************
   6  */
   7
   8 #include "unicode/utypes.h"
   9
  10 #if !UCONFIG_NO_COLLATION
  11
  12 #include "cmemory.h"
  13 #include "cstring.h"
  14 #include "ucol_imp.h"
  15
  16 #include "unicode/coll.h"
  17 #include "unicode/tblcoll.h"
  18 #include "unicode/usearch.h"
  19 #include "unicode/uset.h"
  20 #include "unicode/ustring.h"
  21
  22 #include "unicode/coleitr.h"
  23 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
  24
  25 #include "colldata.h"
  26 #include "ssearch.h"
  27 #include "xmlparser.h"
  28
  29 #include <stdio.h>  // for sprintf
  30
  31 char testId[100];
  32
  33 #define TEST_ASSERT(x) {if (!(x)) { \
  34     errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, testId);}}
  35
  36 #define TEST_ASSERT_M(x, m) {if (!(x)) { \
  37     dataerrln("Failure in file %s, line %d.   \"%s\"", __FILE__, __LINE__, m);return;}}
  38
  39 #define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \
  40     dataerrln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \
  41           __FILE__, __LINE__, testId, u_errorName(errcode));}}
  42
  43 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
  44 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
  45 #define DELETE_ARRAY(array) uprv_free((void *) (array))
  46
  47 //---------------------------------------------------------------------------
  48 //
  49 //  Test class boilerplate
  50 //
  51 //---------------------------------------------------------------------------
  52 SSearchTest::SSearchTest()
  53 {
  54 }
  55
  56 SSearchTest::~SSearchTest()
  57 {
  58 }
  59
  60 void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char *params )
  61 {
  62     if (exec) logln("TestSuite SSearchTest: ");
  63     switch (index) {
  64 #if !UCONFIG_NO_BREAK_ITERATION
  65        case 0: name = "searchTest";
  66             if (exec) searchTest();
  67             break;
  68
  69         case 1: name = "offsetTest";
  70             if (exec) offsetTest();
  71             break;
  72
  73         case 2: name = "monkeyTest";
  74             if (exec) monkeyTest(params);
  75             break;
  76
  77         case 3: name = "sharpSTest";
  78             if (exec) sharpSTest();
  79             break;
  80
  81         case 4: name = "goodSuffixTest";
  82             if (exec) goodSuffixTest();
  83             break;
  84
  85         case 5: name = "searchTime";
  86             if (exec) searchTime();
  87             break;
  88 #endif
  89         default: name = "";
  90             break; //needed to end loop
  91     }
  92 }
  93
  94
  95 #if !UCONFIG_NO_BREAK_ITERATION
  96
  97 #define PATH_BUFFER_SIZE 2048
  98 const char *SSearchTest::getPath(char buffer[2048], const char *filename) {
  99     UErrorCode status = U_ZERO_ERROR;
 100     const char *testDataDirectory = IntlTest::getSourceTestData(status);
 101
 102     if (U_FAILURE(status) || strlen(testDataDirectory) + strlen(filename) + 1 >= PATH_BUFFER_SIZE) {
 103         errln("ERROR: getPath() failed - %s", u_errorName(status));
 104         return NULL;
 105     }
 106
 107     strcpy(buffer, testDataDirectory);
 108     strcat(buffer, filename);
 109     return buffer;
 110 }
 111
 112
 113 void SSearchTest::searchTest()
 114 {
 115 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
 116     UErrorCode status = U_ZERO_ERROR;
 117     char path[PATH_BUFFER_SIZE];
 118     const char *testFilePath = getPath(path, "ssearch.xml");
 119
 120     if (testFilePath == NULL) {
 121         return; /* Couldn't get path: error message already output. */
 122     }
 123
 124     LocalPointer<UXMLParser> parser(UXMLParser::createParser(status));
 125     TEST_ASSERT_SUCCESS(status);
 126     LocalPointer<UXMLElement> root(parser->parseFile(testFilePath, status));
 127     TEST_ASSERT_SUCCESS(status);
 128     if (U_FAILURE(status)) {
 129         return;
 130     }
 131
 132     const UnicodeString *debugTestCase = root->getAttribute("debug");
 133     if (debugTestCase != NULL) {
 134 //       setenv("USEARCH_DEBUG", "1", 1);
 135     }
 136
 137
 138     const UXMLElement *testCase;
 139     int32_t tc = 0;
 140
 141     while((testCase = root->nextChildElement(tc)) != NULL) {
 142
 143         if (testCase->getTagName().compare("test-case") != 0) {
 144             errln("ssearch, unrecognized XML Element in test file");
 145             continue;
 146         }
 147         const UnicodeString *id       = testCase->getAttribute("id");
 148         *testId = 0;
 149         if (id != NULL) {
 150             id->extract(0, id->length(), testId,  sizeof(testId), US_INV);
 151         }
 152
 153         // If debugging test case has been specified and this is not it, skip to next.
 154         if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) {
 155             continue;
 156         }
 157         //
 158         //  Get the requested collation strength.
 159         //    Default is tertiary if the XML attribute is missing from the test case.
 160         //
 161         const UnicodeString *strength = testCase->getAttribute("strength");
 162         UColAttributeValue collatorStrength = UCOL_PRIMARY;
 163         if      (strength==NULL)          { collatorStrength = UCOL_TERTIARY;}
 164         else if (*strength=="PRIMARY")    { collatorStrength = UCOL_PRIMARY;}
 165         else if (*strength=="SECONDARY")  { collatorStrength = UCOL_SECONDARY;}
 166         else if (*strength=="TERTIARY")   { collatorStrength = UCOL_TERTIARY;}
 167         else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;}
 168         else if (*strength=="IDENTICAL")  { collatorStrength = UCOL_IDENTICAL;}
 169         else {
 170             // Bogus value supplied for strength.  Shouldn't happen, even from
 171             //  typos, if the  XML source has been validated.
 172             //  This assert is a little deceiving in that strength can be
 173             //   any of the allowed values, not just TERTIARY, but it will
 174             //   do the job of getting the error output.
 175             TEST_ASSERT(*strength=="TERTIARY")
 176         }
 177
 178         //
 179         // Get the collator normalization flag.  Default is UCOL_OFF.
 180         //
 181         UColAttributeValue normalize = UCOL_OFF;
 182         const UnicodeString *norm = testCase->getAttribute("norm");
 183         TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF");
 184         if (norm!=NULL && *norm=="ON") {
 185             normalize = UCOL_ON;
 186         }
 187
 188         //
 189         // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE.
 190         //
 191         UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE;
 192         const UnicodeString *alt = testCase->getAttribute("alternate_handling");
 193         TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE");
 194         if (alt != NULL && *alt == "SHIFTED") {
 195             alternateHandling = UCOL_SHIFTED;
 196         }
 197
 198         const UnicodeString defLocale("en");
 199         char  clocale[100];
 200         const UnicodeString *locale   = testCase->getAttribute("locale");
 201         if (locale == NULL || locale->length()==0) {
 202             locale = &defLocale;
 203         };
 204         locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL);
 205
 206
 207         UnicodeString  text;
 208         UnicodeString  target;
 209         UnicodeString  pattern;
 210         int32_t        expectedMatchStart = -1;
 211         int32_t        expectedMatchLimit = -1;
 212         const UXMLElement  *n;
 213         int32_t                nodeCount = 0;
 214
 215         n = testCase->getChildElement("pattern");
 216         TEST_ASSERT(n != NULL);
 217         if (n==NULL) {
 218             continue;
 219         }
 220         text = n->getText(FALSE);
 221         text = text.unescape();
 222         pattern.append(text);
 223         nodeCount++;
 224
 225         n = testCase->getChildElement("pre");
 226         if (n!=NULL) {
 227             text = n->getText(FALSE);
 228             text = text.unescape();
 229             target.append(text);
 230             nodeCount++;
 231         }
 232
 233         n = testCase->getChildElement("m");
 234         if (n!=NULL) {
 235             expectedMatchStart = target.length();
 236             text = n->getText(FALSE);
 237             text = text.unescape();
 238             target.append(text);
 239             expectedMatchLimit = target.length();
 240             nodeCount++;
 241         }
 242
 243         n = testCase->getChildElement("post");
 244         if (n!=NULL) {
 245             text = n->getText(FALSE);
 246             text = text.unescape();
 247             target.append(text);
 248             nodeCount++;
 249         }
 250
 251         //  Check that there weren't extra things in the XML
 252         TEST_ASSERT(nodeCount == testCase->countChildren());
 253
 254         // Open a collator and StringSearch based on the parameters
 255         //   obtained from the XML.
 256         //
 257         status = U_ZERO_ERROR;
 258         LocalUCollatorPointer collator(ucol_open(clocale, &status));
 259         ucol_setStrength(collator.getAlias(), collatorStrength);
 260         ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status);
 261         ucol_setAttribute(collator.getAlias(), UCOL_ALTERNATE_HANDLING, alternateHandling, &status);
 262         LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
 263                                                                target.getBuffer(), target.length(),
 264                                                                collator.getAlias(),
 265                                                                NULL,     // the break iterator
 266                                                                &status));
 267
 268         TEST_ASSERT_SUCCESS(status);
 269         if (U_FAILURE(status)) {
 270             continue;
 271         }
 272
 273         int32_t foundStart = 0;
 274         int32_t foundLimit = 0;
 275         UBool   foundMatch;
 276
 277         //
 278         // Do the search, check the match result against the expected results.
 279         //
 280         foundMatch= usearch_search(uss.getAlias(), 0, &foundStart, &foundLimit, &status);
 281         TEST_ASSERT_SUCCESS(status);
 282         if ((foundMatch && expectedMatchStart<0) ||
 283             (foundStart != expectedMatchStart)   ||
 284             (foundLimit != expectedMatchLimit)) {
 285                 TEST_ASSERT(FALSE);   //  ouput generic error position
 286                 infoln("Found, expected match start = %d, %d \n"
 287                        "Found, expected match limit = %d, %d",
 288                 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
 289         }
 290
 291         // In case there are other matches...
 292         // (should we only do this if the test case passed?)
 293         while (foundMatch) {
 294             expectedMatchStart = foundStart;
 295             expectedMatchLimit = foundLimit;
 296
 297             foundMatch = usearch_search(uss.getAlias(), foundLimit, &foundStart, &foundLimit, &status);
 298         }
 299
 300         uss.adoptInstead(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
 301             target.getBuffer(), target.length(),
 302             collator.getAlias(),
 303             NULL,
 304             &status));
 305
 306         //
 307         // Do the backwards search, check the match result against the expected results.
 308         //
 309         foundMatch= usearch_searchBackwards(uss.getAlias(), target.length(), &foundStart, &foundLimit, &status);
 310         TEST_ASSERT_SUCCESS(status);
 311         if ((foundMatch && expectedMatchStart<0) ||
 312             (foundStart != expectedMatchStart)   ||
 313             (foundLimit != expectedMatchLimit)) {
 314                 TEST_ASSERT(FALSE);   //  ouput generic error position
 315                 infoln("Found, expected backwards match start = %d, %d \n"
 316                        "Found, expected backwards match limit = %d, %d",
 317                 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
 318         }
 319     }
 320 #endif
 321 }
 322
 323 struct Order
 324 {
 325     int32_t order;
 326     int32_t lowOffset;
 327     int32_t highOffset;
 328 };
 329
 330 class OrderList
 331 {
 332 public:
 333     OrderList();
 334     OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset = 0);
 335     ~OrderList();
 336
 337     int32_t size(void) const;
 338     void add(int32_t order, int32_t low, int32_t high);
 339     const Order *get(int32_t index) const;
 340     int32_t getLowOffset(int32_t index) const;
 341     int32_t getHighOffset(int32_t index) const;
 342     int32_t getOrder(int32_t index) const;
 343     void reverse(void);
 344     UBool compare(const OrderList &other) const;
 345     UBool matchesAt(int32_t offset, const OrderList &other) const;
 346
 347 private:
 348     Order *list;
 349     int32_t listMax;
 350     int32_t listSize;
 351 };
 352
 353 OrderList::OrderList()
 354   : list(NULL),  listMax(16), listSize(0)
 355 {
 356     list = new Order[listMax];
 357 }
 358
 359 OrderList::OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset)
 360     : list(NULL), listMax(16), listSize(0)
 361 {
 362     UErrorCode status = U_ZERO_ERROR;
 363     UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
 364     uint32_t strengthMask = 0;
 365     int32_t order, low, high;
 366
 367     switch (ucol_getStrength(coll))
 368     {
 369     default:
 370         strengthMask |= UCOL_TERTIARYORDERMASK;
 371         /* fall through */
 372
 373     case UCOL_SECONDARY:
 374         strengthMask |= UCOL_SECONDARYORDERMASK;
 375         /* fall through */
 376
 377     case UCOL_PRIMARY:
 378         strengthMask |= UCOL_PRIMARYORDERMASK;
 379     }
 380
 381     list = new Order[listMax];
 382
 383     ucol_setOffset(elems, stringOffset, &status);
 384
 385     do {
 386         low   = ucol_getOffset(elems);
 387         order = ucol_next(elems, &status);
 388         high  = ucol_getOffset(elems);
 389
 390         if (order != UCOL_NULLORDER) {
 391             order &= strengthMask;
 392         }
 393
 394         if (order != UCOL_IGNORABLE) {
 395             add(order, low, high);
 396         }
 397     } while (order != UCOL_NULLORDER);
 398
 399     ucol_closeElements(elems);
 400 }
 401
 402 OrderList::~OrderList()
 403 {
 404     delete[] list;
 405 }
 406
 407 void OrderList::add(int32_t order, int32_t low, int32_t high)
 408 {
 409     if (listSize >= listMax) {
 410         listMax *= 2;
 411
 412         Order *newList = new Order[listMax];
 413
 414         uprv_memcpy(newList, list, listSize * sizeof(Order));
 415         delete[] list;
 416         list = newList;
 417     }
 418
 419     list[listSize].order      = order;
 420     list[listSize].lowOffset  = low;
 421     list[listSize].highOffset = high;
 422
 423     listSize += 1;
 424 }
 425
 426 const Order *OrderList::get(int32_t index) const
 427 {
 428     if (index >= listSize) {
 429         return NULL;
 430     }
 431
 432     return &list[index];
 433 }
 434
 435 int32_t OrderList::getLowOffset(int32_t index) const
 436 {
 437     const Order *order = get(index);
 438
 439     if (order != NULL) {
 440         return order->lowOffset;
 441     }
 442
 443     return -1;
 444 }
 445
 446 int32_t OrderList::getHighOffset(int32_t index) const
 447 {
 448     const Order *order = get(index);
 449
 450     if (order != NULL) {
 451         return order->highOffset;
 452     }
 453
 454     return -1;
 455 }
 456
 457 int32_t OrderList::getOrder(int32_t index) const
 458 {
 459     const Order *order = get(index);
 460
 461     if (order != NULL) {
 462         return order->order;
 463     }
 464
 465     return UCOL_NULLORDER;
 466 }
 467
 468 int32_t OrderList::size() const
 469 {
 470     return listSize;
 471 }
 472
 473 void OrderList::reverse()
 474 {
 475     for(int32_t f = 0, b = listSize - 1; f < b; f += 1, b -= 1) {
 476         Order swap = list[b];
 477
 478         list[b] = list[f];
 479         list[f] = swap;
 480     }
 481 }
 482
 483 UBool OrderList::compare(const OrderList &other) const
 484 {
 485     if (listSize != other.listSize) {
 486         return FALSE;
 487     }
 488
 489     for(int32_t i = 0; i < listSize; i += 1) {
 490         if (list[i].order  != other.list[i].order ||
 491             list[i].lowOffset != other.list[i].lowOffset ||
 492             list[i].highOffset != other.list[i].highOffset) {
 493                 return FALSE;
 494         }
 495     }
 496
 497     return TRUE;
 498 }
 499
 500 UBool OrderList::matchesAt(int32_t offset, const OrderList &other) const
 501 {
 502     // NOTE: sizes include the NULLORDER, which we don't want to compare.
 503     int32_t otherSize = other.size() - 1;
 504
 505     if (listSize - 1 - offset < otherSize) {
 506         return FALSE;
 507     }
 508
 509     for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) {
 510         if (getOrder(i) != other.getOrder(j)) {
 511             return FALSE;
 512         }
 513     }
 514
 515     return TRUE;
 516 }
 517
 518 static char *printOffsets(char *buffer, OrderList &list)
 519 {
 520     int32_t size = list.size();
 521     char *s = buffer;
 522
 523     for(int32_t i = 0; i < size; i += 1) {
 524         const Order *order = list.get(i);
 525
 526         if (i != 0) {
 527             s += sprintf(s, ", ");
 528         }
 529
 530         s += sprintf(s, "(%d, %d)", order->lowOffset, order->highOffset);
 531     }
 532
 533     return buffer;
 534 }
 535
 536 static char *printOrders(char *buffer, OrderList &list)
 537 {
 538     int32_t size = list.size();
 539     char *s = buffer;
 540
 541     for(int32_t i = 0; i < size; i += 1) {
 542         const Order *order = list.get(i);
 543
 544         if (i != 0) {
 545             s += sprintf(s, ", ");
 546         }
 547
 548         s += sprintf(s, "%8.8X", order->order);
 549     }
 550
 551     return buffer;
 552 }
 553
 554 void SSearchTest::offsetTest()
 555 {
 556     const char *test[] = {
 557         // The sequence \u0FB3\u0F71\u0F71\u0F80 contains a discontiguous
 558         // contraction (\u0FB3\u0F71\u0F80) logically followed by \u0F71.
 559         "\\u1E33\\u0FB3\\u0F71\\u0F71\\u0F80\\uD835\\uDF6C\\u01B0",
 560
 561         "\\ua191\\u16ef\\u2036\\u017a",
 562
 563 #if 0
 564         // This results in a complex interaction between contraction,
 565         // expansion and normalization that confuses the backwards offset fixups.
 566         "\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
 567 #endif
 568
 569         "\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
 570         "\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3",
 571
 572         "\\u02FE\\u02FF"
 573         "\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\\u030A\\u030B\\u030C\\u030D\\u030E\\u030F"
 574         "\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\\u031A\\u031B\\u031C\\u031D\\u031E\\u031F"
 575         "\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F"
 576         "\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\\u033A\\u033B\\u033C\\u033D\\u033E\\u033F"
 577         "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\\u034A\\u034B\\u034C\\u034D\\u034E", // currently not working, see #8081
 578
 579         "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318", // currently not working, see #8081
 580         "a\\u02FF\\u0301\\u0316", // currently not working, see #8081
 581         "a\\u02FF\\u0316\\u0301",
 582         "a\\u0430\\u0301\\u0316",
 583         "a\\u0430\\u0316\\u0301",
 584         "abc\\u0E41\\u0301\\u0316",
 585         "abc\\u0E41\\u0316\\u0301",
 586         "\\u0E41\\u0301\\u0316",
 587         "\\u0E41\\u0316\\u0301",
 588         "a\\u0301\\u0316",
 589         "a\\u0316\\u0301",
 590         "\\uAC52\\uAC53",
 591         "\\u34CA\\u34CB",
 592         "\\u11ED\\u11EE",
 593         "\\u30C3\\u30D0",
 594         "p\\u00E9ch\\u00E9",
 595         "a\\u0301\\u0325",
 596         "a\\u0300\\u0325",
 597         "a\\u0325\\u0300",
 598         "A\\u0323\\u0300B",
 599         "A\\u0300\\u0323B",
 600         "A\\u0301\\u0323B",
 601         "A\\u0302\\u0301\\u0323B",
 602         "abc",
 603         "ab\\u0300c",
 604         "ab\\u0300\\u0323c",
 605         " \\uD800\\uDC00\\uDC00",
 606         "a\\uD800\\uDC00\\uDC00",
 607         "A\\u0301\\u0301",
 608         "A\\u0301\\u0323",
 609         "A\\u0301\\u0323B",
 610         "B\\u0301\\u0323C",
 611         "A\\u0300\\u0323B",
 612         "\\u0301A\\u0301\\u0301",
 613         "abcd\\r\\u0301",
 614         "p\\u00EAche",
 615         "pe\\u0302che",
 616     };
 617
 618     int32_t testCount = ARRAY_SIZE(test);
 619     UErrorCode status = U_ZERO_ERROR;
 620     RuleBasedCollator *col = (RuleBasedCollator *) Collator::createInstance(Locale::getEnglish(), status);
 621     if (U_FAILURE(status)) {
 622         errcheckln(status, "Failed to create collator in offsetTest! - %s", u_errorName(status));
 623         return;
 624     }
 625     char buffer[4096];  // A bit of a hack... just happens to be long enough for all the test cases...
 626                         // We could allocate one that's the right size by (CE_count * 10) + 2
 627                         // 10 chars is enough room for 8 hex digits plus ", ". 2 extra chars for "[" and "]"
 628
 629     col->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
 630
 631     for(int32_t i = 0; i < testCount; i += 1) {
 632         if (!isICUVersionAtLeast(52, 0, 1) && i>=4 && i<=6) {
 633             continue; // timebomb until ticket #9156 (was #8081) is resolved
 634         }
 635         UnicodeString ts = CharsToUnicodeString(test[i]);
 636         CollationElementIterator *iter = col->createCollationElementIterator(ts);
 637         OrderList forwardList;
 638         OrderList backwardList;
 639         int32_t order, low, high;
 640
 641         do {
 642             low   = iter->getOffset();
 643             order = iter->next(status);
 644             high  = iter->getOffset();
 645
 646             forwardList.add(order, low, high);
 647         } while (order != CollationElementIterator::NULLORDER);
 648
 649         iter->reset();
 650         iter->setOffset(ts.length(), status);
 651
 652         backwardList.add(CollationElementIterator::NULLORDER, iter->getOffset(), iter->getOffset());
 653
 654         do {
 655             high  = iter->getOffset();
 656             order = iter->previous(status);
 657             low   = iter->getOffset();
 658
 659             if (order == CollationElementIterator::NULLORDER) {
 660                 break;
 661             }
 662
 663             backwardList.add(order, low, high);
 664         } while (TRUE);
 665
 666         backwardList.reverse();
 667
 668         if (forwardList.compare(backwardList)) {
 669             logln("Works with \"%s\"", test[i]);
 670             logln("Forward offsets:  [%s]", printOffsets(buffer, forwardList));
 671 //          logln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
 672
 673             logln("Forward CEs:  [%s]", printOrders(buffer, forwardList));
 674 //          logln("Backward CEs: [%s]", printOrders(buffer, backwardList));
 675
 676             logln();
 677         } else {
 678             errln("Fails with \"%s\"", test[i]);
 679             infoln("Forward offsets:  [%s]", printOffsets(buffer, forwardList));
 680             infoln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
 681
 682             infoln("Forward CEs:  [%s]", printOrders(buffer, forwardList));
 683             infoln("Backward CEs: [%s]", printOrders(buffer, backwardList));
 684
 685             infoln();
 686         }
 687         delete iter;
 688     }
 689     delete col;
 690 }
 691
 692 #if 0
 693 static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer)
 694 {
 695     for(int32_t i = 0; i < string.length(); i += 1) {
 696         UChar32 ch = string.char32At(i);
 697
 698         if (ch >= 0x0020 && ch <= 0x007F) {
 699             if (ch == 0x005C) {
 700                 buffer.append("\\\\");
 701             } else {
 702                 buffer.append(ch);
 703             }
 704         } else {
 705             char cbuffer[12];
 706
 707             if (ch <= 0xFFFFL) {
 708                 sprintf(cbuffer, "\\u%4.4X", ch);
 709             } else {
 710                 sprintf(cbuffer, "\\U%8.8X", ch);
 711             }
 712
 713             buffer.append(cbuffer);
 714         }
 715
 716         if (ch >= 0x10000L) {
 717             i += 1;
 718         }
 719     }
 720
 721     return buffer;
 722 }
 723 #endif
 724
 725 void SSearchTest::sharpSTest()
 726 {
 727     UErrorCode status = U_ZERO_ERROR;
 728     UCollator *coll = NULL;
 729     UnicodeString lp  = "fuss";
 730     UnicodeString sp = "fu\\u00DF";
 731     UnicodeString targets[]  = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball",
 732                                 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF",
 733                                 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"};
 734     int32_t start = -1, end = -1;
 735
 736     coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status);
 737     TEST_ASSERT_SUCCESS(status);
 738
 739     UnicodeString lpUnescaped = lp.unescape();
 740     UnicodeString spUnescaped = sp.unescape();
 741
 742     LocalUStringSearchPointer ussLong(usearch_openFromCollator(lpUnescaped.getBuffer(), lpUnescaped.length(),
 743                                                            lpUnescaped.getBuffer(), lpUnescaped.length(),   // actual test data will be set later
 744                                                            coll,
 745                                                            NULL,     // the break iterator
 746                                                            &status));
 747
 748     LocalUStringSearchPointer ussShort(usearch_openFromCollator(spUnescaped.getBuffer(), spUnescaped.length(),
 749                                                            spUnescaped.getBuffer(), spUnescaped.length(),   // actual test data will be set later
 750                                                            coll,
 751                                                            NULL,     // the break iterator
 752                                                            &status));
 753     TEST_ASSERT_SUCCESS(status);
 754
 755     for (uint32_t t = 0; t < (sizeof(targets)/sizeof(targets[0])); t += 1) {
 756         UBool bFound;
 757         UnicodeString target = targets[t].unescape();
 758
 759         start = end = -1;
 760         usearch_setText(ussLong.getAlias(), target.getBuffer(), target.length(), &status);
 761         bFound = usearch_search(ussLong.getAlias(), 0, &start, &end, &status);
 762         TEST_ASSERT_SUCCESS(status);
 763         if (bFound) {
 764             logln("Test %d: found long pattern at [%d, %d].", t, start, end);
 765         } else {
 766             dataerrln("Test %d: did not find long pattern.", t);
 767         }
 768
 769         usearch_setText(ussShort.getAlias(), target.getBuffer(), target.length(), &status);
 770         bFound = usearch_search(ussShort.getAlias(), 0, &start, &end, &status);
 771         TEST_ASSERT_SUCCESS(status);
 772         if (bFound) {
 773             logln("Test %d: found long pattern at [%d, %d].", t, start, end);
 774         } else {
 775             dataerrln("Test %d: did not find long pattern.", t);
 776         }
 777     }
 778
 779     ucol_close(coll);
 780 }
 781
 782 void SSearchTest::goodSuffixTest()
 783 {
 784     UErrorCode status = U_ZERO_ERROR;
 785     UCollator *coll = NULL;
 786     UnicodeString pat = /*"gcagagag"*/ "fxeld";
 787     UnicodeString target = /*"gcatcgcagagagtatacagtacg"*/ "cloveldfxeld";
 788     int32_t start = -1, end = -1;
 789     UBool bFound;
 790
 791     coll = ucol_open(NULL, &status);
 792     TEST_ASSERT_SUCCESS(status);
 793
 794     LocalUStringSearchPointer ss(usearch_openFromCollator(pat.getBuffer(), pat.length(),
 795                                                           target.getBuffer(), target.length(),
 796                                                           coll,
 797                                                           NULL,     // the break iterator
 798                                                           &status));
 799     TEST_ASSERT_SUCCESS(status);
 800
 801     bFound = usearch_search(ss.getAlias(), 0, &start, &end, &status);
 802     TEST_ASSERT_SUCCESS(status);
 803     if (bFound) {
 804         logln("Found pattern at [%d, %d].", start, end);
 805     } else {
 806         dataerrln("Did not find pattern.");
 807     }
 808
 809     ucol_close(coll);
 810 }
 811
 812 //
 813 //  searchTime()    A quick and dirty performance test for string search.
 814 //                  Probably  doesn't really belong as part of intltest, but it
 815 //                  does check that the search succeeds, and gets the right result,
 816 //                  so it serves as a functionality test also.
 817 //
 818 //                  To run as a perf test, up the loop count, select by commenting
 819 //                  and uncommenting in the code the operation to be measured,
 820 //                  rebuild, and measure the running time of this test alone.
 821 //
 822 //                     time LD_LIBRARY_PATH=whatever  ./intltest  collate/SSearchTest/searchTime
 823 //
 824 void SSearchTest::searchTime() {
 825     static const char *longishText =
 826 "Whylom, as olde stories tellen us,\n"
 827 "Ther was a duk that highte Theseus:\n"
 828 "Of Athenes he was lord and governour,\n"
 829 "And in his tyme swich a conquerour,\n"
 830 "That gretter was ther noon under the sonne.\n"
 831 "Ful many a riche contree hadde he wonne;\n"
 832 "What with his wisdom and his chivalrye,\n"
 833 "He conquered al the regne of Femenye,\n"
 834 "That whylom was y-cleped Scithia;\n"
 835 "And weddede the quene Ipolita,\n"
 836 "And broghte hir hoom with him in his contree\n"
 837 "With muchel glorie and greet solempnitee,\n"
 838 "And eek hir yonge suster Emelye.\n"
 839 "And thus with victorie and with melodye\n"
 840 "Lete I this noble duk to Athenes ryde,\n"
 841 "And al his hoost, in armes, him bisyde.\n"
 842 "And certes, if it nere to long to here,\n"
 843 "I wolde han told yow fully the manere,\n"
 844 "How wonnen was the regne of Femenye\n"
 845 "By Theseus, and by his chivalrye;\n"
 846 "And of the grete bataille for the nones\n"
 847 "Bitwixen Athen's and Amazones;\n"
 848 "And how asseged was Ipolita,\n"
 849 "The faire hardy quene of Scithia;\n"
 850 "And of the feste that was at hir weddinge,\n"
 851 "And of the tempest at hir hoom-cominge;\n"
 852 "But al that thing I moot as now forbere.\n"
 853 "I have, God woot, a large feeld to ere,\n"
 854 "And wayke been the oxen in my plough.\n"
 855 "The remenant of the tale is long y-nough.\n"
 856 "I wol nat letten eek noon of this route;\n"
 857 "Lat every felawe telle his tale aboute,\n"
 858 "And lat see now who shal the soper winne;\n"
 859 "And ther I lefte, I wol ageyn biginne.\n"
 860 "This duk, of whom I make mencioun,\n"
 861 "When he was come almost unto the toun,\n"
 862 "In al his wele and in his moste pryde,\n"
 863 "He was war, as he caste his eye asyde,\n"
 864 "Wher that ther kneled in the hye weye\n"
 865 "A companye of ladies, tweye and tweye,\n"
 866 "Ech after other, clad in clothes blake; \n"
 867 "But swich a cry and swich a wo they make,\n"
 868 "That in this world nis creature livinge,\n"
 869 "That herde swich another weymentinge;\n"
 870 "And of this cry they nolde never stenten,\n"
 871 "Til they the reynes of his brydel henten.\n"
 872 "'What folk ben ye, that at myn hoomcominge\n"
 873 "Perturben so my feste with cryinge'?\n"
 874 "Quod Theseus, 'have ye so greet envye\n"
 875 "Of myn honour, that thus compleyne and crye? \n"
 876 "Or who hath yow misboden, or offended?\n"
 877 "And telleth me if it may been amended;\n"
 878 "And why that ye ben clothed thus in blak'?\n"
 879 "The eldest lady of hem alle spak,\n"
 880 "When she hadde swowned with a deedly chere,\n"
 881 "That it was routhe for to seen and here,\n"
 882 "And seyde: 'Lord, to whom Fortune hath yiven\n"
 883 "Victorie, and as a conquerour to liven,\n"
 884 "Noght greveth us your glorie and your honour;\n"
 885 "But we biseken mercy and socour.\n"
 886 "Have mercy on our wo and our distresse.\n"
 887 "Som drope of pitee, thurgh thy gentilesse,\n"
 888 "Up-on us wrecched wommen lat thou falle.\n"
 889 "For certes, lord, ther nis noon of us alle,\n"
 890 "That she nath been a duchesse or a quene;\n"
 891 "Now be we caitifs, as it is wel sene:\n"
 892 "Thanked be Fortune, and hir false wheel,\n"
 893 "That noon estat assureth to be weel.\n"
 894 "And certes, lord, t'abyden your presence,\n"
 895 "Here in the temple of the goddesse Clemence\n"
 896 "We han ben waytinge al this fourtenight;\n"
 897 "Now help us, lord, sith it is in thy might.\n"
 898 "I wrecche, which that wepe and waille thus,\n"
 899 "Was whylom wyf to king Capaneus,\n"
 900 "That starf at Thebes, cursed be that day!\n"
 901 "And alle we, that been in this array,\n"
 902 "And maken al this lamentacioun,\n"
 903 "We losten alle our housbondes at that toun,\n"
 904 "Whyl that the sege ther-aboute lay.\n"
 905 "And yet now th'olde Creon, weylaway!\n"
 906 "The lord is now of Thebes the citee, \n"
 907 "Fulfild of ire and of iniquitee,\n"
 908 "He, for despyt, and for his tirannye,\n"
 909 "To do the dede bodyes vileinye,\n"
 910 "Of alle our lordes, whiche that ben slawe,\n"
 911 "Hath alle the bodyes on an heep y-drawe,\n"
 912 "And wol nat suffren hem, by noon assent,\n"
 913 "Neither to been y-buried nor y-brent,\n"
 914 "But maketh houndes ete hem in despyt. zet'\n";
 915
 916 const char *cPattern = "maketh houndes ete hem";
 917 //const char *cPattern = "Whylom";
 918 //const char *cPattern = "zet";
 919     const char *testId = "searchTime()";   // for error macros.
 920     UnicodeString target = longishText;
 921     UErrorCode status = U_ZERO_ERROR;
 922
 923
 924     LocalUCollatorPointer collator(ucol_open("en", &status));
 925     //ucol_setStrength(collator.getAlias(), collatorStrength);
 926     //ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status);
 927     UnicodeString uPattern = cPattern;
 928     LocalUStringSearchPointer uss(usearch_openFromCollator(uPattern.getBuffer(), uPattern.length(),
 929                                                            target.getBuffer(), target.length(),
 930                                                            collator.getAlias(),
 931                                                            NULL,     // the break iterator
 932                                                            &status));
 933     TEST_ASSERT_SUCCESS(status);
 934
 935 //  int32_t foundStart;
 936 //  int32_t foundEnd;
 937     UBool   found;
 938
 939     // Find the match position usgin strstr
 940     const char *pm = strstr(longishText, cPattern);
 941     TEST_ASSERT_M(pm!=NULL, "No pattern match with strstr");
 942     int32_t  refMatchPos = (int32_t)(pm - longishText);
 943     int32_t  icuMatchPos;
 944     int32_t  icuMatchEnd;
 945     usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status);
 946     TEST_ASSERT_SUCCESS(status);
 947     TEST_ASSERT_M(refMatchPos == icuMatchPos, "strstr and icu give different match positions.");
 948
 949     int32_t i;
 950     // int32_t j=0;
 951
 952     // Try loopcounts around 100000 to some millions, depending on the operation,
 953     //   to get runtimes of at least several seconds.
 954     for (i=0; i<10000; i++) {
 955         found = usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status);
 956         //TEST_ASSERT_SUCCESS(status);
 957         //TEST_ASSERT(found);
 958
 959         // usearch_setOffset(uss.getAlias(), 0, &status);
 960         // icuMatchPos = usearch_next(uss.getAlias(), &status);
 961
 962          // The i+j stuff is to confuse the optimizer and get it to actually leave the
 963          //   call to strstr in place.
 964          //pm = strstr(longishText+j, cPattern);
 965          //j = (j + i)%5;
 966     }
 967
 968     //printf("%ld, %d\n", pm-longishText, j);
 969 }
 970
 971 //----------------------------------------------------------------------------------------
 972 //
 973 //   Random Numbers.  Similar to standard lib rand() and srand()
 974 //                    Not using library to
 975 //                      1.  Get same results on all platforms.
 976 //                      2.  Get access to current seed, to more easily reproduce failures.
 977 //
 978 //---------------------------------------------------------------------------------------
 979 static uint32_t m_seed = 1;
 980
 981 static uint32_t m_rand()
 982 {
 983     m_seed = m_seed * 1103515245 + 12345;
 984     return (uint32_t)(m_seed/65536) % 32768;
 985 }
 986
 987 class Monkey
 988 {
 989 public:
 990     virtual void append(UnicodeString &test, UnicodeString &alternate) = 0;
 991
 992 protected:
 993     Monkey();
 994     virtual ~Monkey();
 995 };
 996
 997 Monkey::Monkey()
 998 {
 999     // ook?
1000 }
1001
1002 Monkey::~Monkey()
1003 {
1004     // ook?
1005 }
1006
1007 class SetMonkey : public Monkey
1008 {
1009 public:
1010     SetMonkey(const USet *theSet);
1011     ~SetMonkey();
1012
1013     virtual void append(UnicodeString &test, UnicodeString &alternate);
1014
1015 private:
1016     const USet *set;
1017 };
1018
1019 SetMonkey::SetMonkey(const USet *theSet)
1020     : Monkey(), set(theSet)
1021 {
1022     // ook?
1023 }
1024
1025 SetMonkey::~SetMonkey()
1026 {
1027     //ook...
1028 }
1029
1030 void SetMonkey::append(UnicodeString &test, UnicodeString &alternate)
1031 {
1032     int32_t size = uset_size(set);
1033     int32_t index = m_rand() % size;
1034     UChar32 ch = uset_charAt(set, index);
1035     UnicodeString str(ch);
1036
1037     test.append(str);
1038     alternate.append(str); // flip case, or some junk?
1039 }
1040
1041 class StringSetMonkey : public Monkey
1042 {
1043 public:
1044     StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData);
1045     ~StringSetMonkey();
1046
1047     void append(UnicodeString &testCase, UnicodeString &alternate);
1048
1049 private:
1050     UnicodeString &generateAlternative(const UnicodeString &testCase, UnicodeString &alternate);
1051
1052     const USet *set;
1053     UCollator  *coll;
1054     CollData   *collData;
1055 };
1056
1057 StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData)
1058 : Monkey(), set(theSet), coll(theCollator), collData(theCollData)
1059 {
1060     // ook.
1061 }
1062
1063 StringSetMonkey::~StringSetMonkey()
1064 {
1065     // ook?
1066 }
1067
1068 void StringSetMonkey::append(UnicodeString &testCase, UnicodeString &alternate)
1069 {
1070     int32_t itemCount = uset_getItemCount(set), len = 0;
1071     int32_t index = m_rand() % itemCount;
1072     UChar32 rangeStart = 0, rangeEnd = 0;
1073     UChar buffer[16];
1074     UErrorCode err = U_ZERO_ERROR;
1075
1076     len = uset_getItem(set, index, &rangeStart, &rangeEnd, buffer, 16, &err);
1077
1078     if (len == 0) {
1079         int32_t offset = m_rand() % (rangeEnd - rangeStart + 1);
1080         UChar32 ch = rangeStart + offset;
1081         UnicodeString str(ch);
1082
1083         testCase.append(str);
1084         generateAlternative(str, alternate);
1085     } else if (len > 0) {
1086         // should check that len < 16...
1087         UnicodeString str(buffer, len);
1088
1089         testCase.append(str);
1090         generateAlternative(str, alternate);
1091     } else {
1092         // shouldn't happen...
1093     }
1094 }
1095
1096 UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCase, UnicodeString &alternate)
1097 {
1098     // find out shortest string for the longest sequence of ces.
1099     // needs to be refined to use dynamic programming, but will be roughly right
1100     UErrorCode status = U_ZERO_ERROR;
1101     CEList ceList(coll, testCase, status);
1102     UnicodeString alt;
1103     int32_t offset = 0;
1104
1105     if (ceList.size() == 0) {
1106         return alternate.append(testCase);
1107     }
1108
1109     while (offset < ceList.size()) {
1110         int32_t ce = ceList.get(offset);
1111         const StringList *strings = collData->getStringList(ce);
1112
1113         if (strings == NULL) {
1114             return alternate.append(testCase);
1115         }
1116
1117         int32_t stringCount = strings->size();
1118         int32_t tries = 0;
1119
1120         // find random string that generates the same CEList
1121         const CEList *ceList2 = NULL;
1122         const UnicodeString *string = NULL;
1123               UBool matches = FALSE;
1124
1125         do {
1126             int32_t s = m_rand() % stringCount;
1127
1128             if (tries++ > stringCount) {
1129                 alternate.append(testCase);
1130                 return alternate;
1131             }
1132
1133             string = strings->get(s);
1134             ceList2 = collData->getCEList(string);
1135             matches = ceList.matchesAt(offset, ceList2);
1136
1137             if (! matches) {
1138                 collData->freeCEList((CEList *) ceList2);
1139             }
1140         } while (! matches);
1141
1142         alt.append(*string);
1143         offset += ceList2->size();
1144         collData->freeCEList(ceList2);
1145     }
1146
1147     const CEList altCEs(coll, alt, status);
1148
1149     if (ceList.matchesAt(0, &altCEs)) {
1150         return alternate.append(alt);
1151     }
1152
1153     return alternate.append(testCase);
1154 }
1155
1156 static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyCount, UnicodeString &testCase, UnicodeString &alternate)
1157 {
1158     int32_t pieces = (m_rand() % 4) + 1;
1159     UErrorCode status = U_ZERO_ERROR;
1160     UBool matches;
1161
1162     do {
1163         testCase.remove();
1164         alternate.remove();
1165         monkeys[0]->append(testCase, alternate);
1166
1167         for(int32_t piece = 0; piece < pieces; piece += 1) {
1168             int32_t monkey = m_rand() % monkeyCount;
1169
1170             monkeys[monkey]->append(testCase, alternate);
1171         }
1172
1173         const CEList ceTest(coll, testCase, status);
1174         const CEList ceAlt(coll, alternate, status);
1175
1176         matches = ceTest.matchesAt(0, &ceAlt);
1177     } while (! matches);
1178 }
1179
1180 static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd)
1181 {
1182     UErrorCode      status = U_ZERO_ERROR;
1183     OrderList       targetOrders(coll, target, offset);
1184     OrderList       patternOrders(coll, pattern);
1185     int32_t         targetSize  = targetOrders.size() - 1;
1186     int32_t         patternSize = patternOrders.size() - 1;
1187     UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
1188                                                   target.getBuffer(), target.length(), &status);
1189
1190     if (patternSize == 0) {
1191         // Searching for an empty pattern always fails
1192         matchStart = matchEnd = -1;
1193         ubrk_close(charBreakIterator);
1194         return FALSE;
1195     }
1196
1197     matchStart = matchEnd = -1;
1198
1199     for(int32_t i = 0; i < targetSize; i += 1) {
1200         if (targetOrders.matchesAt(i, patternOrders)) {
1201             int32_t start    = targetOrders.getLowOffset(i);
1202             int32_t maxLimit = targetOrders.getLowOffset(i + patternSize);
1203             int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1);
1204
1205             // if the low and high offsets of the first CE in
1206             // the match are the same, it means that the match
1207             // starts in the middle of an expansion - all but
1208             // the first CE of the expansion will have the offset
1209             // of the following character.
1210             if (start == targetOrders.getHighOffset(i)) {
1211                 continue;
1212             }
1213
1214             // Make sure match starts on a grapheme boundary
1215             if (! ubrk_isBoundary(charBreakIterator, start)) {
1216                 continue;
1217             }
1218
1219             // If the low and high offsets of the CE after the match
1220             // are the same, it means that the match ends in the middle
1221             // of an expansion sequence.
1222             if (maxLimit == targetOrders.getHighOffset(i + patternSize) &&
1223                 targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) {
1224                 continue;
1225             }
1226
1227             int32_t mend = maxLimit;
1228
1229             // Find the first grapheme break after the character index
1230             // of the last CE in the match. If it's after character index
1231             // that's after the last CE in the match, use that index
1232             // as the end of the match.
1233             if (minLimit < maxLimit) {
1234                 // When the last CE's low index is same with its high index, the CE is likely
1235                 // a part of expansion. In this case, the index is located just after the
1236                 // character corresponding to the CEs compared above. If the index is right
1237                 // at the break boundary, move the position to the next boundary will result
1238                 // incorrect match length when there are ignorable characters exist between
1239                 // the position and the next character produces CE(s). See ticket#8482.
1240                 if (minLimit == targetOrders.getHighOffset(i + patternSize - 1) && ubrk_isBoundary(charBreakIterator, minLimit)) {
1241                     mend = minLimit;
1242                 } else {
1243                     int32_t nba = ubrk_following(charBreakIterator, minLimit);
1244
1245                     if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) {
1246                         mend = nba;
1247                     }
1248                 }
1249             }
1250
1251             if (mend > maxLimit) {
1252                 continue;
1253             }
1254
1255             if (! ubrk_isBoundary(charBreakIterator, mend)) {
1256                 continue;
1257             }
1258
1259             matchStart = start;
1260             matchEnd   = mend;
1261
1262             ubrk_close(charBreakIterator);
1263             return TRUE;
1264         }
1265     }
1266
1267     ubrk_close(charBreakIterator);
1268     return FALSE;
1269 }
1270
1271 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1272 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
1273     int32_t val = defaultVal;
1274
1275     name.append(" *= *(-?\\d+)");
1276
1277     UErrorCode status = U_ZERO_ERROR;
1278     RegexMatcher m(name, params, 0, status);
1279
1280     if (m.find()) {
1281         // The param exists.  Convert the string to an int.
1282         char valString[100];
1283         int32_t paramLength = m.end(1, status) - m.start(1, status);
1284
1285         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
1286             paramLength = (int32_t)(sizeof(valString)-2);
1287         }
1288
1289         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
1290         val = uprv_strtol(valString,  NULL, 10);
1291
1292         // Delete this parameter from the params string.
1293         m.reset();
1294         params = m.replaceFirst("", status);
1295     }
1296
1297   //U_ASSERT(U_SUCCESS(status));
1298     if (! U_SUCCESS(status)) {
1299         val = defaultVal;
1300     }
1301
1302     return val;
1303 }
1304 #endif
1305
1306 #if !UCONFIG_NO_COLLATION
1307 int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
1308                                     const char *name, const char *strength, uint32_t seed)
1309 {
1310     UErrorCode status = U_ZERO_ERROR;
1311     int32_t actualStart = -1, actualEnd = -1;
1312   //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
1313     int32_t expectedStart = -1, expectedEnd = -1;
1314     int32_t notFoundCount = 0;
1315     LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
1316                                                            testCase.getBuffer(), testCase.length(),
1317                                                            coll,
1318                                                            NULL,     // the break iterator
1319                                                            &status));
1320
1321     // **** TODO: find *all* matches, not just first one ****
1322     simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd);
1323
1324     usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status);
1325
1326     if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
1327         errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
1328               "    strength=%s seed=%d",
1329               name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
1330     }
1331
1332     if (expectedStart == -1 && actualStart == -1) {
1333         notFoundCount += 1;
1334     }
1335
1336     // **** TODO: find *all* matches, not just first one ****
1337     simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd);
1338
1339     usearch_setPattern(uss.getAlias(), altPattern.getBuffer(), altPattern.length(), &status);
1340
1341     usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status);
1342
1343     if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
1344         errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
1345               "    strength=%s seed=%d",
1346               name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
1347     }
1348
1349     if (expectedStart == -1 && actualStart == -1) {
1350         notFoundCount += 1;
1351     }
1352
1353     return notFoundCount;
1354 }
1355 #endif
1356
1357 void SSearchTest::monkeyTest(char *params)
1358 {
1359     // ook!
1360     UErrorCode status = U_ZERO_ERROR;
1361   //UCollator *coll = ucol_open(NULL, &status);
1362     UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status);
1363
1364     if (U_FAILURE(status)) {
1365         errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status));
1366         return;
1367     }
1368
1369     CollData  *monkeyData = new CollData(coll, status);
1370
1371     USet *expansions   = uset_openEmpty();
1372     USet *contractions = uset_openEmpty();
1373
1374     ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
1375
1376     U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
1377     U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
1378     USet *letters = uset_openPattern(letter_pattern, 39, &status);
1379     SetMonkey letterMonkey(letters);
1380     StringSetMonkey contractionMonkey(contractions, coll, monkeyData);
1381     StringSetMonkey expansionMonkey(expansions, coll, monkeyData);
1382     UnicodeString testCase;
1383     UnicodeString alternate;
1384     UnicodeString pattern, altPattern;
1385     UnicodeString prefix, altPrefix;
1386     UnicodeString suffix, altSuffix;
1387
1388     Monkey *monkeys[] = {
1389         &letterMonkey,
1390         &contractionMonkey,
1391         &expansionMonkey,
1392         &contractionMonkey,
1393         &expansionMonkey,
1394         &contractionMonkey,
1395         &expansionMonkey,
1396         &contractionMonkey,
1397         &expansionMonkey};
1398     int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]);
1399     // int32_t nonMatchCount = 0;
1400
1401     UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY};
1402     const char *strengthNames[] = {"primary", "secondary", "tertiary"};
1403     int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]);
1404     int32_t loopCount = quick? 1000 : 10000;
1405     int32_t firstStrength = 0;
1406     int32_t lastStrength  = strengthCount - 1; //*/ 0;
1407
1408     if (params != NULL) {
1409 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1410         UnicodeString p(params);
1411
1412         loopCount = getIntParam("loop", p, loopCount);
1413         m_seed    = getIntParam("seed", p, m_seed);
1414
1415         RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status);
1416         if (m.find()) {
1417             UnicodeString breakType = m.group(1, status);
1418
1419             for (int32_t s = 0; s < strengthCount; s += 1) {
1420                 if (breakType == strengthNames[s]) {
1421                     firstStrength = lastStrength = s;
1422                     break;
1423                 }
1424             }
1425
1426             m.reset();
1427             p = m.replaceFirst("", status);
1428         }
1429
1430         if (RegexMatcher("\\S", p, 0, status).find()) {
1431             // Each option is stripped out of the option string as it is processed.
1432             // All options have been checked.  The option string should have been completely emptied..
1433             char buf[100];
1434             p.extract(buf, sizeof(buf), NULL, status);
1435             buf[sizeof(buf)-1] = 0;
1436             errln("Unrecognized or extra parameter:  %s\n", buf);
1437             return;
1438         }
1439 #else
1440         infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
1441 #endif
1442     }
1443
1444     for(int32_t s = firstStrength; s <= lastStrength; s += 1) {
1445         int32_t notFoundCount = 0;
1446
1447         logln("Setting strength to %s.", strengthNames[s]);
1448         ucol_setStrength(coll, strengths[s]);
1449
1450         // TODO: try alternate prefix and suffix too?
1451         // TODO: alterntaes are only equal at primary strength. Is this OK?
1452         for(int32_t t = 0; t < loopCount; t += 1) {
1453             uint32_t seed = m_seed;
1454             // int32_t  nmc = 0;
1455
1456             generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern);
1457             generateTestCase(coll, monkeys, monkeyCount, prefix,  altPrefix);
1458             generateTestCase(coll, monkeys, monkeyCount, suffix,  altSuffix);
1459
1460             // pattern
1461             notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern, "pattern", strengthNames[s], seed);
1462
1463             testCase.remove();
1464             testCase.append(prefix);
1465             testCase.append(/*alt*/pattern);
1466
1467             // prefix + pattern
1468             notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern", strengthNames[s], seed);
1469
1470             testCase.append(suffix);
1471
1472             // prefix + pattern + suffix
1473             notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern + suffix", strengthNames[s], seed);
1474
1475             testCase.remove();
1476             testCase.append(pattern);
1477             testCase.append(suffix);
1478
1479             // pattern + suffix
1480             notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed);
1481         }
1482
1483        logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount);
1484     }
1485
1486     uset_close(contractions);
1487     uset_close(expansions);
1488     uset_close(letters);
1489     delete monkeyData;
1490
1491     ucol_close(coll);
1492 }
1493
1494 #endif
1495
1496 #endif