icuSources/test/intltest/ssearch.cpp

   1 /*
   2  **********************************************************************
   3  *   Copyright (C) 2005-2016, International Business Machines
   4  *   Corporation and others.  All Rights Reserved.
   5  **********************************************************************
   6  */
   7
   8 #include "unicode/utypes.h"
   9
  10 #if !UCONFIG_NO_COLLATION
  11
  12 #include "cmemory.h"
  13 #include "cstring.h"
  14 #include "usrchimp.h"
  15
  16 #include "unicode/coll.h"
  17 #include "unicode/tblcoll.h"
  18 #include "unicode/usearch.h"
  19 #include "unicode/uset.h"
  20 #include "unicode/ustring.h"
  21
  22 #include "unicode/coleitr.h"
  23 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
  24
  25 #include "colldata.h"
  26 #include "ssearch.h"
  27 #include "xmlparser.h"
  28
  29 #include <stdio.h>  // for sprintf
  30
// Identifier of the test case currently being processed. It is included in the
// failure messages emitted by TEST_ASSERT and TEST_ASSERT_SUCCESS below.
char testId[100];

// Reports a failure through errln() (file, line and current testId) when x is
// false. Expands to a brace-enclosed block rather than an expression, and does
// not leave the enclosing function: execution continues after the report.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, testId);}}

// Reports a failure through dataerrln() with the caller-supplied message m when
// x is false, and then returns from the enclosing function (which therefore
// has to return void).
#define TEST_ASSERT_M(x, m) {if (!(x)) { \
    dataerrln("Failure in file %s, line %d.   \"%s\"", __FILE__, __LINE__, m);return;}}

// Reports a failure through dataerrln() (file, line, current testId and the
// name of the error code) when errcode is a failure status. Does not return;
// callers that must stop on failure check the status themselves afterwards.
#define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \
    dataerrln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \
          __FILE__, __LINE__, testId, u_errorName(errcode));}}
  42
  43 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
  44 #define DELETE_ARRAY(array) uprv_free((void *) (array))
  45
  46 //---------------------------------------------------------------------------
  47 //
  48 //  Test class boilerplate
  49 //
  50 //---------------------------------------------------------------------------
  51 SSearchTest::SSearchTest()
  52 {
  53 }
  54
  55 SSearchTest::~SSearchTest()
  56 {
  57 }
  58
  59 void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char *params )
  60 {
  61     if (exec) logln("TestSuite SSearchTest: ");
  62     switch (index) {
  63 #if !UCONFIG_NO_BREAK_ITERATION
  64        case 0: name = "searchTest";
  65             if (exec) searchTest();
  66             break;
  67
  68         case 1: name = "offsetTest";
  69             if (exec) offsetTest();
  70             break;
  71
  72         case 2: name = "monkeyTest";
  73             if (exec) monkeyTest(params);
  74             break;
  75
  76         case 3: name = "sharpSTest";
  77             if (exec) sharpSTest();
  78             break;
  79
  80         case 4: name = "goodSuffixTest";
  81             if (exec) goodSuffixTest();
  82             break;
  83
  84         case 5: name = "searchTime";
  85             if (exec) searchTime();
  86             break;
  87 #endif
  88         default: name = "";
  89             break; //needed to end loop
  90     }
  91 }
  92
  93
  94 #if !UCONFIG_NO_BREAK_ITERATION
  95
  96 #define PATH_BUFFER_SIZE 2048
  97 const char *SSearchTest::getPath(char buffer[2048], const char *filename) {
  98     UErrorCode status = U_ZERO_ERROR;
  99     const char *testDataDirectory = IntlTest::getSourceTestData(status);
 100
 101     if (U_FAILURE(status) || strlen(testDataDirectory) + strlen(filename) + 1 >= PATH_BUFFER_SIZE) {
 102         errln("ERROR: getPath() failed - %s", u_errorName(status));
 103         return NULL;
 104     }
 105
 106     strcpy(buffer, testDataDirectory);
 107     strcat(buffer, filename);
 108     return buffer;
 109 }
 110
 111
 112 void SSearchTest::searchTest()
 113 {
 114 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
 115     UErrorCode status = U_ZERO_ERROR;
 116     char path[PATH_BUFFER_SIZE];
 117     const char *testFilePath = getPath(path, "ssearch.xml");
 118
 119     if (testFilePath == NULL) {
 120         return; /* Couldn't get path: error message already output. */
 121     }
 122
 123     LocalPointer<UXMLParser> parser(UXMLParser::createParser(status));
 124     TEST_ASSERT_SUCCESS(status);
 125     LocalPointer<UXMLElement> root(parser->parseFile(testFilePath, status));
 126     TEST_ASSERT_SUCCESS(status);
 127     if (U_FAILURE(status)) {
 128         return;
 129     }
 130
 131     const UnicodeString *debugTestCase = root->getAttribute("debug");
 132     if (debugTestCase != NULL) {
 133 //       setenv("USEARCH_DEBUG", "1", 1);
 134     }
 135
 136
 137     const UXMLElement *testCase;
 138     int32_t tc = 0;
 139
 140     while((testCase = root->nextChildElement(tc)) != NULL) {
 141
 142         if (testCase->getTagName().compare("test-case") != 0) {
 143             errln("ssearch, unrecognized XML Element in test file");
 144             continue;
 145         }
 146         const UnicodeString *id       = testCase->getAttribute("id");
 147         *testId = 0;
 148         if (id != NULL) {
 149             id->extract(0, id->length(), testId,  sizeof(testId), US_INV);
 150         }
 151
 152         // If debugging test case has been specified and this is not it, skip to next.
 153         if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) {
 154             continue;
 155         }
 156         //
 157         //  Get the requested collation strength.
 158         //    Default is tertiary if the XML attribute is missing from the test case.
 159         //
 160         const UnicodeString *strength = testCase->getAttribute("strength");
 161         UColAttributeValue collatorStrength = UCOL_PRIMARY;
 162         if      (strength==NULL)          { collatorStrength = UCOL_TERTIARY;}
 163         else if (*strength=="PRIMARY")    { collatorStrength = UCOL_PRIMARY;}
 164         else if (*strength=="SECONDARY")  { collatorStrength = UCOL_SECONDARY;}
 165         else if (*strength=="TERTIARY")   { collatorStrength = UCOL_TERTIARY;}
 166         else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;}
 167         else if (*strength=="IDENTICAL")  { collatorStrength = UCOL_IDENTICAL;}
 168         else {
 169             // Bogus value supplied for strength.  Shouldn't happen, even from
 170             //  typos, if the  XML source has been validated.
 171             //  This assert is a little deceiving in that strength can be
 172             //   any of the allowed values, not just TERTIARY, but it will
 173             //   do the job of getting the error output.
 174             TEST_ASSERT(*strength=="TERTIARY")
 175         }
 176
 177         //
 178         // Get the collator normalization flag.  Default is UCOL_OFF.
 179         //
 180         UColAttributeValue normalize = UCOL_OFF;
 181         const UnicodeString *norm = testCase->getAttribute("norm");
 182         TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF");
 183         if (norm!=NULL && *norm=="ON") {
 184             normalize = UCOL_ON;
 185         }
 186
 187         //
 188         // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE.
 189         //
 190         UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE;
 191         const UnicodeString *alt = testCase->getAttribute("alternate_handling");
 192         TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE");
 193         if (alt != NULL && *alt == "SHIFTED") {
 194             alternateHandling = UCOL_SHIFTED;
 195         }
 196
 197         const UnicodeString defLocale("en");
 198         char  clocale[100];
 199         const UnicodeString *locale   = testCase->getAttribute("locale");
 200         if (locale == NULL || locale->length()==0) {
 201             locale = &defLocale;
 202         };
 203         locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL);
 204
 205
 206         UnicodeString  text;
 207         UnicodeString  target;
 208         UnicodeString  pattern;
 209         int32_t        expectedMatchStart = -1;
 210         int32_t        expectedMatchLimit = -1;
 211         const UXMLElement  *n;
 212         int32_t                nodeCount = 0;
 213
 214         n = testCase->getChildElement("pattern");
 215         TEST_ASSERT(n != NULL);
 216         if (n==NULL) {
 217             continue;
 218         }
 219         text = n->getText(FALSE);
 220         text = text.unescape();
 221         pattern.append(text);
 222         nodeCount++;
 223
 224         n = testCase->getChildElement("pre");
 225         if (n!=NULL) {
 226             text = n->getText(FALSE);
 227             text = text.unescape();
 228             target.append(text);
 229             nodeCount++;
 230         }
 231
 232         n = testCase->getChildElement("m");
 233         if (n!=NULL) {
 234             expectedMatchStart = target.length();
 235             text = n->getText(FALSE);
 236             text = text.unescape();
 237             target.append(text);
 238             expectedMatchLimit = target.length();
 239             nodeCount++;
 240         }
 241
 242         n = testCase->getChildElement("post");
 243         if (n!=NULL) {
 244             text = n->getText(FALSE);
 245             text = text.unescape();
 246             target.append(text);
 247             nodeCount++;
 248         }
 249
 250         //  Check that there weren't extra things in the XML
 251         TEST_ASSERT(nodeCount == testCase->countChildren());
 252
 253         // Open a collator and StringSearch based on the parameters
 254         //   obtained from the XML.
 255         //
 256         status = U_ZERO_ERROR;
 257         LocalUCollatorPointer collator(ucol_open(clocale, &status));
 258         ucol_setStrength(collator.getAlias(), collatorStrength);
 259         ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status);
 260         ucol_setAttribute(collator.getAlias(), UCOL_ALTERNATE_HANDLING, alternateHandling, &status);
 261         LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
 262                                                                target.getBuffer(), target.length(),
 263                                                                collator.getAlias(),
 264                                                                NULL,     // the break iterator
 265                                                                &status));
 266
 267         TEST_ASSERT_SUCCESS(status);
 268         if (U_FAILURE(status)) {
 269             continue;
 270         }
 271
 272         int32_t foundStart = 0;
 273         int32_t foundLimit = 0;
 274         UBool   foundMatch;
 275
 276         //
 277         // Do the search, check the match result against the expected results.
 278         //
 279         foundMatch= usearch_search(uss.getAlias(), 0, &foundStart, &foundLimit, &status);
 280         TEST_ASSERT_SUCCESS(status);
 281         if ((foundMatch && expectedMatchStart<0) ||
 282             (foundStart != expectedMatchStart)   ||
 283             (foundLimit != expectedMatchLimit)) {
 284                 TEST_ASSERT(FALSE);   //  ouput generic error position
 285                 infoln("Found, expected match start = %d, %d \n"
 286                        "Found, expected match limit = %d, %d",
 287                 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
 288         }
 289
 290         // In case there are other matches...
 291         // (should we only do this if the test case passed?)
 292         while (foundMatch) {
 293             expectedMatchStart = foundStart;
 294             expectedMatchLimit = foundLimit;
 295
 296             foundMatch = usearch_search(uss.getAlias(), foundLimit, &foundStart, &foundLimit, &status);
 297         }
 298
 299         uss.adoptInstead(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
 300             target.getBuffer(), target.length(),
 301             collator.getAlias(),
 302             NULL,
 303             &status));
 304
 305         //
 306         // Do the backwards search, check the match result against the expected results.
 307         //
 308         foundMatch= usearch_searchBackwards(uss.getAlias(), target.length(), &foundStart, &foundLimit, &status);
 309         TEST_ASSERT_SUCCESS(status);
 310         if ((foundMatch && expectedMatchStart<0) ||
 311             (foundStart != expectedMatchStart)   ||
 312             (foundLimit != expectedMatchLimit)) {
 313                 TEST_ASSERT(FALSE);   //  ouput generic error position
 314                 infoln("Found, expected backwards match start = %d, %d \n"
 315                        "Found, expected backwards match limit = %d, %d",
 316                 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
 317         }
 318     }
 319 #endif
 320 }
 321
// One entry of an OrderList: a collation element value together with the
// iterator offsets recorded immediately before and after it was fetched.
struct Order
{
    int32_t order;       // collation element value (may be a NULLORDER marker)
    int32_t lowOffset;   // iterator offset at the low end of the element's range
    int32_t highOffset;  // iterator offset at the high end of the element's range
};
 328
 329 class OrderList
 330 {
 331 public:
 332     OrderList();
 333     OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset = 0);
 334     ~OrderList();
 335
 336     int32_t size(void) const;
 337     void add(int32_t order, int32_t low, int32_t high);
 338     const Order *get(int32_t index) const;
 339     int32_t getLowOffset(int32_t index) const;
 340     int32_t getHighOffset(int32_t index) const;
 341     int32_t getOrder(int32_t index) const;
 342     void reverse(void);
 343     UBool compare(const OrderList &other) const;
 344     UBool matchesAt(int32_t offset, const OrderList &other) const;
 345
 346 private:
 347     Order *list;
 348     int32_t listMax;
 349     int32_t listSize;
 350 };
 351
 352 OrderList::OrderList()
 353   : list(NULL),  listMax(16), listSize(0)
 354 {
 355     list = new Order[listMax];
 356 }
 357
 358 OrderList::OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset)
 359     : list(NULL), listMax(16), listSize(0)
 360 {
 361     UErrorCode status = U_ZERO_ERROR;
 362     UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
 363     uint32_t strengthMask = 0;
 364     int32_t order, low, high;
 365
 366     switch (ucol_getStrength(coll))
 367     {
 368     default:
 369         strengthMask |= UCOL_TERTIARYORDERMASK;
 370         U_FALLTHROUGH;
 371     case UCOL_SECONDARY:
 372         strengthMask |= UCOL_SECONDARYORDERMASK;
 373         U_FALLTHROUGH;
 374     case UCOL_PRIMARY:
 375         strengthMask |= UCOL_PRIMARYORDERMASK;
 376     }
 377
 378     list = new Order[listMax];
 379
 380     ucol_setOffset(elems, stringOffset, &status);
 381
 382     do {
 383         low   = ucol_getOffset(elems);
 384         order = ucol_next(elems, &status);
 385         high  = ucol_getOffset(elems);
 386
 387         if (order != UCOL_NULLORDER) {
 388             order &= strengthMask;
 389         }
 390
 391         if (order != UCOL_IGNORABLE) {
 392             add(order, low, high);
 393         }
 394     } while (order != UCOL_NULLORDER);
 395
 396     ucol_closeElements(elems);
 397 }
 398
 399 OrderList::~OrderList()
 400 {
 401     delete[] list;
 402 }
 403
 404 void OrderList::add(int32_t order, int32_t low, int32_t high)
 405 {
 406     if (listSize >= listMax) {
 407         listMax *= 2;
 408
 409         Order *newList = new Order[listMax];
 410
 411         uprv_memcpy(newList, list, listSize * sizeof(Order));
 412         delete[] list;
 413         list = newList;
 414     }
 415
 416     list[listSize].order      = order;
 417     list[listSize].lowOffset  = low;
 418     list[listSize].highOffset = high;
 419
 420     listSize += 1;
 421 }
 422
 423 const Order *OrderList::get(int32_t index) const
 424 {
 425     if (index >= listSize) {
 426         return NULL;
 427     }
 428
 429     return &list[index];
 430 }
 431
 432 int32_t OrderList::getLowOffset(int32_t index) const
 433 {
 434     const Order *order = get(index);
 435
 436     if (order != NULL) {
 437         return order->lowOffset;
 438     }
 439
 440     return -1;
 441 }
 442
 443 int32_t OrderList::getHighOffset(int32_t index) const
 444 {
 445     const Order *order = get(index);
 446
 447     if (order != NULL) {
 448         return order->highOffset;
 449     }
 450
 451     return -1;
 452 }
 453
 454 int32_t OrderList::getOrder(int32_t index) const
 455 {
 456     const Order *order = get(index);
 457
 458     if (order != NULL) {
 459         return order->order;
 460     }
 461
 462     return UCOL_NULLORDER;
 463 }
 464
 465 int32_t OrderList::size() const
 466 {
 467     return listSize;
 468 }
 469
 470 void OrderList::reverse()
 471 {
 472     for(int32_t f = 0, b = listSize - 1; f < b; f += 1, b -= 1) {
 473         Order swap = list[b];
 474
 475         list[b] = list[f];
 476         list[f] = swap;
 477     }
 478 }
 479
 480 UBool OrderList::compare(const OrderList &other) const
 481 {
 482     if (listSize != other.listSize) {
 483         return FALSE;
 484     }
 485
 486     for(int32_t i = 0; i < listSize; i += 1) {
 487         if (list[i].order  != other.list[i].order ||
 488             list[i].lowOffset != other.list[i].lowOffset ||
 489             list[i].highOffset != other.list[i].highOffset) {
 490                 return FALSE;
 491         }
 492     }
 493
 494     return TRUE;
 495 }
 496
 497 UBool OrderList::matchesAt(int32_t offset, const OrderList &other) const
 498 {
 499     // NOTE: sizes include the NULLORDER, which we don't want to compare.
 500     int32_t otherSize = other.size() - 1;
 501
 502     if (listSize - 1 - offset < otherSize) {
 503         return FALSE;
 504     }
 505
 506     for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) {
 507         if (getOrder(i) != other.getOrder(j)) {
 508             return FALSE;
 509         }
 510     }
 511
 512     return TRUE;
 513 }
 514
 515 static char *printOffsets(char *buffer, OrderList &list)
 516 {
 517     int32_t size = list.size();
 518     char *s = buffer;
 519
 520     for(int32_t i = 0; i < size; i += 1) {
 521         const Order *order = list.get(i);
 522
 523         if (i != 0) {
 524             s += sprintf(s, ", ");
 525         }
 526
 527         s += sprintf(s, "(%d, %d)", order->lowOffset, order->highOffset);
 528     }
 529
 530     return buffer;
 531 }
 532
 533 static char *printOrders(char *buffer, OrderList &list)
 534 {
 535     int32_t size = list.size();
 536     char *s = buffer;
 537
 538     for(int32_t i = 0; i < size; i += 1) {
 539         const Order *order = list.get(i);
 540
 541         if (i != 0) {
 542             s += sprintf(s, ", ");
 543         }
 544
 545         s += sprintf(s, "%8.8X", order->order);
 546     }
 547
 548     return buffer;
 549 }
 550
 551 void SSearchTest::offsetTest()
 552 {
 553     const char *test[] = {
 554         // The sequence \u0FB3\u0F71\u0F71\u0F80 contains a discontiguous
 555         // contraction (\u0FB3\u0F71\u0F80) logically followed by \u0F71.
 556         "\\u1E33\\u0FB3\\u0F71\\u0F71\\u0F80\\uD835\\uDF6C\\u01B0",
 557
 558         "\\ua191\\u16ef\\u2036\\u017a",
 559
 560 #if 0
 561         // This results in a complex interaction between contraction,
 562         // expansion and normalization that confuses the backwards offset fixups.
 563         "\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
 564 #endif
 565
 566         "\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
 567         "\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3",
 568
 569         "\\u02FE\\u02FF"
 570         "\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\\u030A\\u030B\\u030C\\u030D\\u030E\\u030F"
 571         "\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\\u031A\\u031B\\u031C\\u031D\\u031E\\u031F"
 572         "\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F"
 573         "\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\\u033A\\u033B\\u033C\\u033D\\u033E\\u033F"
 574         "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\\u034A\\u034B\\u034C\\u034D\\u034E", // currently not working, see #8081
 575
 576         "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318", // currently not working, see #8081
 577         "a\\u02FF\\u0301\\u0316", // currently not working, see #8081
 578         "a\\u02FF\\u0316\\u0301",
 579         "a\\u0430\\u0301\\u0316",
 580         "a\\u0430\\u0316\\u0301",
 581         "abc\\u0E41\\u0301\\u0316",
 582         "abc\\u0E41\\u0316\\u0301",
 583         "\\u0E41\\u0301\\u0316",
 584         "\\u0E41\\u0316\\u0301",
 585         "a\\u0301\\u0316",
 586         "a\\u0316\\u0301",
 587         "\\uAC52\\uAC53",
 588         "\\u34CA\\u34CB",
 589         "\\u11ED\\u11EE",
 590         "\\u30C3\\u30D0",
 591         "p\\u00E9ch\\u00E9",
 592         "a\\u0301\\u0325",
 593         "a\\u0300\\u0325",
 594         "a\\u0325\\u0300",
 595         "A\\u0323\\u0300B",
 596         "A\\u0300\\u0323B",
 597         "A\\u0301\\u0323B",
 598         "A\\u0302\\u0301\\u0323B",
 599         "abc",
 600         "ab\\u0300c",
 601         "ab\\u0300\\u0323c",
 602         " \\uD800\\uDC00\\uDC00",
 603         "a\\uD800\\uDC00\\uDC00",
 604         "A\\u0301\\u0301",
 605         "A\\u0301\\u0323",
 606         "A\\u0301\\u0323B",
 607         "B\\u0301\\u0323C",
 608         "A\\u0300\\u0323B",
 609         "\\u0301A\\u0301\\u0301",
 610         "abcd\\r\\u0301",
 611         "p\\u00EAche",
 612         "pe\\u0302che",
 613     };
 614
 615     int32_t testCount = UPRV_LENGTHOF(test);
 616     UErrorCode status = U_ZERO_ERROR;
 617     RuleBasedCollator *col = (RuleBasedCollator *) Collator::createInstance(Locale::getEnglish(), status);
 618     if (U_FAILURE(status)) {
 619         errcheckln(status, "Failed to create collator in offsetTest! - %s", u_errorName(status));
 620         return;
 621     }
 622     char buffer[4096];  // A bit of a hack... just happens to be long enough for all the test cases...
 623                         // We could allocate one that's the right size by (CE_count * 10) + 2
 624                         // 10 chars is enough room for 8 hex digits plus ", ". 2 extra chars for "[" and "]"
 625
 626     col->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
 627
 628     for(int32_t i = 0; i < testCount; i += 1) {
 629         UnicodeString ts = CharsToUnicodeString(test[i]);
 630         CollationElementIterator *iter = col->createCollationElementIterator(ts);
 631         OrderList forwardList;
 632         OrderList backwardList;
 633         int32_t order, low, high;
 634
 635         do {
 636             low   = iter->getOffset();
 637             order = iter->next(status);
 638             high  = iter->getOffset();
 639
 640             forwardList.add(order, low, high);
 641         } while (order != CollationElementIterator::NULLORDER);
 642
 643         iter->reset();
 644         iter->setOffset(ts.length(), status);
 645
 646         backwardList.add(CollationElementIterator::NULLORDER, iter->getOffset(), iter->getOffset());
 647
 648         do {
 649             high  = iter->getOffset();
 650             order = iter->previous(status);
 651             low   = iter->getOffset();
 652
 653             if (order == CollationElementIterator::NULLORDER) {
 654                 break;
 655             }
 656
 657             backwardList.add(order, low, high);
 658         } while (TRUE);
 659
 660         backwardList.reverse();
 661
 662         if (forwardList.compare(backwardList)) {
 663             logln("Works with \"%s\"", test[i]);
 664             logln("Forward offsets:  [%s]", printOffsets(buffer, forwardList));
 665 //          logln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
 666
 667             logln("Forward CEs:  [%s]", printOrders(buffer, forwardList));
 668 //          logln("Backward CEs: [%s]", printOrders(buffer, backwardList));
 669
 670             logln();
 671         } else {
 672             errln("Fails with \"%s\"", test[i]);
 673             infoln("Forward offsets:  [%s]", printOffsets(buffer, forwardList));
 674             infoln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
 675
 676             infoln("Forward CEs:  [%s]", printOrders(buffer, forwardList));
 677             infoln("Backward CEs: [%s]", printOrders(buffer, backwardList));
 678
 679             infoln();
 680         }
 681         delete iter;
 682     }
 683     delete col;
 684 }
 685
 686 #if 0
 687 static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer)
 688 {
 689     for(int32_t i = 0; i < string.length(); i += 1) {
 690         UChar32 ch = string.char32At(i);
 691
 692         if (ch >= 0x0020 && ch <= 0x007F) {
 693             if (ch == 0x005C) {
 694                 buffer.append("\\\\");
 695             } else {
 696                 buffer.append(ch);
 697             }
 698         } else {
 699             char cbuffer[12];
 700
 701             if (ch <= 0xFFFFL) {
 702                 sprintf(cbuffer, "\\u%4.4X", ch);
 703             } else {
 704                 sprintf(cbuffer, "\\U%8.8X", ch);
 705             }
 706
 707             buffer.append(cbuffer);
 708         }
 709
 710         if (ch >= 0x10000L) {
 711             i += 1;
 712         }
 713     }
 714
 715     return buffer;
 716 }
 717 #endif
 718
 719 void SSearchTest::sharpSTest()
 720 {
 721     UErrorCode status = U_ZERO_ERROR;
 722     UCollator *coll = NULL;
 723     UnicodeString lp  = "fuss";
 724     UnicodeString sp = "fu\\u00DF";
 725     UnicodeString targets[]  = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball",
 726                                 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF",
 727                                 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"};
 728     int32_t start = -1, end = -1;
 729
 730     coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status);
 731     TEST_ASSERT_SUCCESS(status);
 732
 733     UnicodeString lpUnescaped = lp.unescape();
 734     UnicodeString spUnescaped = sp.unescape();
 735
 736     LocalUStringSearchPointer ussLong(usearch_openFromCollator(lpUnescaped.getBuffer(), lpUnescaped.length(),
 737                                                            lpUnescaped.getBuffer(), lpUnescaped.length(),   // actual test data will be set later
 738                                                            coll,
 739                                                            NULL,     // the break iterator
 740                                                            &status));
 741
 742     LocalUStringSearchPointer ussShort(usearch_openFromCollator(spUnescaped.getBuffer(), spUnescaped.length(),
 743                                                            spUnescaped.getBuffer(), spUnescaped.length(),   // actual test data will be set later
 744                                                            coll,
 745                                                            NULL,     // the break iterator
 746                                                            &status));
 747     TEST_ASSERT_SUCCESS(status);
 748
 749     for (uint32_t t = 0; t < UPRV_LENGTHOF(targets); t += 1) {
 750         UBool bFound;
 751         UnicodeString target = targets[t].unescape();
 752
 753         start = end = -1;
 754         usearch_setText(ussLong.getAlias(), target.getBuffer(), target.length(), &status);
 755         bFound = usearch_search(ussLong.getAlias(), 0, &start, &end, &status);
 756         TEST_ASSERT_SUCCESS(status);
 757         if (bFound) {
 758             logln("Test %d: found long pattern at [%d, %d].", t, start, end);
 759         } else {
 760             dataerrln("Test %d: did not find long pattern.", t);
 761         }
 762
 763         usearch_setText(ussShort.getAlias(), target.getBuffer(), target.length(), &status);
 764         bFound = usearch_search(ussShort.getAlias(), 0, &start, &end, &status);
 765         TEST_ASSERT_SUCCESS(status);
 766         if (bFound) {
 767             logln("Test %d: found long pattern at [%d, %d].", t, start, end);
 768         } else {
 769             dataerrln("Test %d: did not find long pattern.", t);
 770         }
 771     }
 772
 773     ucol_close(coll);
 774 }
 775
 776 void SSearchTest::goodSuffixTest()
 777 {
 778     UErrorCode status = U_ZERO_ERROR;
 779     UCollator *coll = NULL;
 780     UnicodeString pat = /*"gcagagag"*/ "fxeld";
 781     UnicodeString target = /*"gcatcgcagagagtatacagtacg"*/ "cloveldfxeld";
 782     int32_t start = -1, end = -1;
 783     UBool bFound;
 784
 785     coll = ucol_open(NULL, &status);
 786     TEST_ASSERT_SUCCESS(status);
 787
 788     LocalUStringSearchPointer ss(usearch_openFromCollator(pat.getBuffer(), pat.length(),
 789                                                           target.getBuffer(), target.length(),
 790                                                           coll,
 791                                                           NULL,     // the break iterator
 792                                                           &status));
 793     TEST_ASSERT_SUCCESS(status);
 794
 795     bFound = usearch_search(ss.getAlias(), 0, &start, &end, &status);
 796     TEST_ASSERT_SUCCESS(status);
 797     if (bFound) {
 798         logln("Found pattern at [%d, %d].", start, end);
 799     } else {
 800         dataerrln("Did not find pattern.");
 801     }
 802
 803     ucol_close(coll);
 804 }
 805
 806 //
 807 //  searchTime()    A quick and dirty performance test for string search.
 808 //                  Probably  doesn't really belong as part of intltest, but it
 809 //                  does check that the search succeeds, and gets the right result,
 810 //                  so it serves as a functionality test also.
 811 //
 812 //                  To run as a perf test, up the loop count, select by commenting
 813 //                  and uncommenting in the code the operation to be measured,
 814 //                  rebuild, and measure the running time of this test alone.
 815 //
 816 //                     time LD_LIBRARY_PATH=whatever  ./intltest  collate/SSearchTest/searchTime
 817 //
 818 void SSearchTest::searchTime() {
 819     static const char *longishText =
 820 "Whylom, as olde stories tellen us,\n"
 821 "Ther was a duk that highte Theseus:\n"
 822 "Of Athenes he was lord and governour,\n"
 823 "And in his tyme swich a conquerour,\n"
 824 "That gretter was ther noon under the sonne.\n"
 825 "Ful many a riche contree hadde he wonne;\n"
 826 "What with his wisdom and his chivalrye,\n"
 827 "He conquered al the regne of Femenye,\n"
 828 "That whylom was y-cleped Scithia;\n"
 829 "And weddede the quene Ipolita,\n"
 830 "And broghte hir hoom with him in his contree\n"
 831 "With muchel glorie and greet solempnitee,\n"
 832 "And eek hir yonge suster Emelye.\n"
 833 "And thus with victorie and with melodye\n"
 834 "Lete I this noble duk to Athenes ryde,\n"
 835 "And al his hoost, in armes, him bisyde.\n"
 836 "And certes, if it nere to long to here,\n"
 837 "I wolde han told yow fully the manere,\n"
 838 "How wonnen was the regne of Femenye\n"
 839 "By Theseus, and by his chivalrye;\n"
 840 "And of the grete bataille for the nones\n"
 841 "Bitwixen Athen's and Amazones;\n"
 842 "And how asseged was Ipolita,\n"
 843 "The faire hardy quene of Scithia;\n"
 844 "And of the feste that was at hir weddinge,\n"
 845 "And of the tempest at hir hoom-cominge;\n"
 846 "But al that thing I moot as now forbere.\n"
 847 "I have, God woot, a large feeld to ere,\n"
 848 "And wayke been the oxen in my plough.\n"
 849 "The remenant of the tale is long y-nough.\n"
 850 "I wol nat letten eek noon of this route;\n"
 851 "Lat every felawe telle his tale aboute,\n"
 852 "And lat see now who shal the soper winne;\n"
 853 "And ther I lefte, I wol ageyn biginne.\n"
 854 "This duk, of whom I make mencioun,\n"
 855 "When he was come almost unto the toun,\n"
 856 "In al his wele and in his moste pryde,\n"
 857 "He was war, as he caste his eye asyde,\n"
 858 "Wher that ther kneled in the hye weye\n"
 859 "A companye of ladies, tweye and tweye,\n"
 860 "Ech after other, clad in clothes blake; \n"
 861 "But swich a cry and swich a wo they make,\n"
 862 "That in this world nis creature livinge,\n"
 863 "That herde swich another weymentinge;\n"
 864 "And of this cry they nolde never stenten,\n"
 865 "Til they the reynes of his brydel henten.\n"
 866 "'What folk ben ye, that at myn hoomcominge\n"
 867 "Perturben so my feste with cryinge'?\n"
 868 "Quod Theseus, 'have ye so greet envye\n"
 869 "Of myn honour, that thus compleyne and crye? \n"
 870 "Or who hath yow misboden, or offended?\n"
 871 "And telleth me if it may been amended;\n"
 872 "And why that ye ben clothed thus in blak'?\n"
 873 "The eldest lady of hem alle spak,\n"
 874 "When she hadde swowned with a deedly chere,\n"
 875 "That it was routhe for to seen and here,\n"
 876 "And seyde: 'Lord, to whom Fortune hath yiven\n"
 877 "Victorie, and as a conquerour to liven,\n"
 878 "Noght greveth us your glorie and your honour;\n"
 879 "But we biseken mercy and socour.\n"
 880 "Have mercy on our wo and our distresse.\n"
 881 "Som drope of pitee, thurgh thy gentilesse,\n"
 882 "Up-on us wrecched wommen lat thou falle.\n"
 883 "For certes, lord, ther nis noon of us alle,\n"
 884 "That she nath been a duchesse or a quene;\n"
 885 "Now be we caitifs, as it is wel sene:\n"
 886 "Thanked be Fortune, and hir false wheel,\n"
 887 "That noon estat assureth to be weel.\n"
 888 "And certes, lord, t'abyden your presence,\n"
 889 "Here in the temple of the goddesse Clemence\n"
 890 "We han ben waytinge al this fourtenight;\n"
 891 "Now help us, lord, sith it is in thy might.\n"
 892 "I wrecche, which that wepe and waille thus,\n"
 893 "Was whylom wyf to king Capaneus,\n"
 894 "That starf at Thebes, cursed be that day!\n"
 895 "And alle we, that been in this array,\n"
 896 "And maken al this lamentacioun,\n"
 897 "We losten alle our housbondes at that toun,\n"
 898 "Whyl that the sege ther-aboute lay.\n"
 899 "And yet now th'olde Creon, weylaway!\n"
 900 "The lord is now of Thebes the citee, \n"
 901 "Fulfild of ire and of iniquitee,\n"
 902 "He, for despyt, and for his tirannye,\n"
 903 "To do the dede bodyes vileinye,\n"
 904 "Of alle our lordes, whiche that ben slawe,\n"
 905 "Hath alle the bodyes on an heep y-drawe,\n"
 906 "And wol nat suffren hem, by noon assent,\n"
 907 "Neither to been y-buried nor y-brent,\n"
 908 "But maketh houndes ete hem in despyt. zet'\n";
 909
 910 const char *cPattern = "maketh houndes ete hem";
 911 //const char *cPattern = "Whylom";
 912 //const char *cPattern = "zet";
 913     const char *testId = "searchTime()";   // for error macros.
 914     UnicodeString target = longishText;
 915     UErrorCode status = U_ZERO_ERROR;
 916
 917
 918     LocalUCollatorPointer collator(ucol_open("en", &status));
 919     //ucol_setStrength(collator.getAlias(), collatorStrength);
 920     //ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status);
 921     UnicodeString uPattern = cPattern;
 922     LocalUStringSearchPointer uss(usearch_openFromCollator(uPattern.getBuffer(), uPattern.length(),
 923                                                            target.getBuffer(), target.length(),
 924                                                            collator.getAlias(),
 925                                                            NULL,     // the break iterator
 926                                                            &status));
 927     TEST_ASSERT_SUCCESS(status);
 928
 929 //  int32_t foundStart;
 930 //  int32_t foundEnd;
 931     UBool   found;
 932
 933     // Find the match position usgin strstr
 934     const char *pm = strstr(longishText, cPattern);
 935     TEST_ASSERT_M(pm!=NULL, "No pattern match with strstr");
 936     int32_t  refMatchPos = (int32_t)(pm - longishText);
 937     int32_t  icuMatchPos;
 938     int32_t  icuMatchEnd;
 939     usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status);
 940     TEST_ASSERT_SUCCESS(status);
 941     TEST_ASSERT_M(refMatchPos == icuMatchPos, "strstr and icu give different match positions.");
 942
 943     int32_t i;
 944     // int32_t j=0;
 945
 946     // Try loopcounts around 100000 to some millions, depending on the operation,
 947     //   to get runtimes of at least several seconds.
 948     for (i=0; i<10000; i++) {
 949         found = usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status);
 950         (void)found;   // Suppress set but not used warning.
 951         //TEST_ASSERT_SUCCESS(status);
 952         //TEST_ASSERT(found);
 953
 954         // usearch_setOffset(uss.getAlias(), 0, &status);
 955         // icuMatchPos = usearch_next(uss.getAlias(), &status);
 956
 957          // The i+j stuff is to confuse the optimizer and get it to actually leave the
 958          //   call to strstr in place.
 959          //pm = strstr(longishText+j, cPattern);
 960          //j = (j + i)%5;
 961     }
 962
 963     //printf("%ld, %d\n", pm-longishText, j);
 964 }
 965
 966 //----------------------------------------------------------------------------------------
 967 //
 968 //   Random Numbers.  Similar to standard lib rand() and srand()
 969 //                    Not using library to
 970 //                      1.  Get same results on all platforms.
 971 //                      2.  Get access to current seed, to more easily reproduce failures.
 972 //
 973 //---------------------------------------------------------------------------------------
 974 static uint32_t m_seed = 1;
 975
 976 static uint32_t m_rand()
 977 {
 978     m_seed = m_seed * 1103515245 + 12345;
 979     return (uint32_t)(m_seed/65536) % 32768;
 980 }
 981
 982 class Monkey
 983 {
 984 public:
 985     virtual void append(UnicodeString &test, UnicodeString &alternate) = 0;
 986
 987 protected:
 988     Monkey();
 989     virtual ~Monkey();
 990 };
 991
 992 Monkey::Monkey()
 993 {
 994     // ook?
 995 }
 996
 997 Monkey::~Monkey()
 998 {
 999     // ook?
1000 }
1001
1002 class SetMonkey : public Monkey
1003 {
1004 public:
1005     SetMonkey(const USet *theSet);
1006     ~SetMonkey();
1007
1008     virtual void append(UnicodeString &test, UnicodeString &alternate);
1009
1010 private:
1011     const USet *set;
1012 };
1013
1014 SetMonkey::SetMonkey(const USet *theSet)
1015     : Monkey(), set(theSet)
1016 {
1017     // ook?
1018 }
1019
1020 SetMonkey::~SetMonkey()
1021 {
1022     //ook...
1023 }
1024
1025 void SetMonkey::append(UnicodeString &test, UnicodeString &alternate)
1026 {
1027     int32_t size = uset_size(set);
1028     int32_t index = m_rand() % size;
1029     UChar32 ch = uset_charAt(set, index);
1030     UnicodeString str(ch);
1031
1032     test.append(str);
1033     alternate.append(str); // flip case, or some junk?
1034 }
1035
1036 class StringSetMonkey : public Monkey
1037 {
1038 public:
1039     StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData);
1040     ~StringSetMonkey();
1041
1042     void append(UnicodeString &testCase, UnicodeString &alternate);
1043
1044 private:
1045     UnicodeString &generateAlternative(const UnicodeString &testCase, UnicodeString &alternate);
1046
1047     const USet *set;
1048     UCollator  *coll;
1049     CollData   *collData;
1050 };
1051
1052 StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData)
1053 : Monkey(), set(theSet), coll(theCollator), collData(theCollData)
1054 {
1055     // ook.
1056 }
1057
1058 StringSetMonkey::~StringSetMonkey()
1059 {
1060     // ook?
1061 }
1062
1063 void StringSetMonkey::append(UnicodeString &testCase, UnicodeString &alternate)
1064 {
1065     int32_t itemCount = uset_getItemCount(set), len = 0;
1066     int32_t index = m_rand() % itemCount;
1067     UChar32 rangeStart = 0, rangeEnd = 0;
1068     UChar buffer[16];
1069     UErrorCode err = U_ZERO_ERROR;
1070
1071     len = uset_getItem(set, index, &rangeStart, &rangeEnd, buffer, 16, &err);
1072
1073     if (len == 0) {
1074         int32_t offset = m_rand() % (rangeEnd - rangeStart + 1);
1075         UChar32 ch = rangeStart + offset;
1076         UnicodeString str(ch);
1077
1078         testCase.append(str);
1079         generateAlternative(str, alternate);
1080     } else if (len > 0) {
1081         // should check that len < 16...
1082         UnicodeString str(buffer, len);
1083
1084         testCase.append(str);
1085         generateAlternative(str, alternate);
1086     } else {
1087         // shouldn't happen...
1088     }
1089 }
1090
1091 UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCase, UnicodeString &alternate)
1092 {
1093     // find out shortest string for the longest sequence of ces.
1094     // needs to be refined to use dynamic programming, but will be roughly right
1095     UErrorCode status = U_ZERO_ERROR;
1096     CEList ceList(coll, testCase, status);
1097     UnicodeString alt;
1098     int32_t offset = 0;
1099
1100     if (ceList.size() == 0) {
1101         return alternate.append(testCase);
1102     }
1103
1104     while (offset < ceList.size()) {
1105         int32_t ce = ceList.get(offset);
1106         const StringList *strings = collData->getStringList(ce);
1107
1108         if (strings == NULL) {
1109             return alternate.append(testCase);
1110         }
1111
1112         int32_t stringCount = strings->size();
1113         int32_t tries = 0;
1114
1115         // find random string that generates the same CEList
1116         const CEList *ceList2 = NULL;
1117         const UnicodeString *string = NULL;
1118               UBool matches = FALSE;
1119
1120         do {
1121             int32_t s = m_rand() % stringCount;
1122
1123             if (tries++ > stringCount) {
1124                 alternate.append(testCase);
1125                 return alternate;
1126             }
1127
1128             string = strings->get(s);
1129             ceList2 = collData->getCEList(string);
1130             matches = ceList.matchesAt(offset, ceList2);
1131
1132             if (! matches) {
1133                 collData->freeCEList((CEList *) ceList2);
1134             }
1135         } while (! matches);
1136
1137         alt.append(*string);
1138         offset += ceList2->size();
1139         collData->freeCEList(ceList2);
1140     }
1141
1142     const CEList altCEs(coll, alt, status);
1143
1144     if (ceList.matchesAt(0, &altCEs)) {
1145         return alternate.append(alt);
1146     }
1147
1148     return alternate.append(testCase);
1149 }
1150
1151 static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyCount, UnicodeString &testCase, UnicodeString &alternate)
1152 {
1153     int32_t pieces = (m_rand() % 4) + 1;
1154     UErrorCode status = U_ZERO_ERROR;
1155     UBool matches;
1156
1157     do {
1158         testCase.remove();
1159         alternate.remove();
1160         monkeys[0]->append(testCase, alternate);
1161
1162         for(int32_t piece = 0; piece < pieces; piece += 1) {
1163             int32_t monkey = m_rand() % monkeyCount;
1164
1165             monkeys[monkey]->append(testCase, alternate);
1166         }
1167
1168         const CEList ceTest(coll, testCase, status);
1169         const CEList ceAlt(coll, alternate, status);
1170
1171         matches = ceTest.matchesAt(0, &ceAlt);
1172     } while (! matches);
1173 }
1174
1175 static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd)
1176 {
1177     UErrorCode      status = U_ZERO_ERROR;
1178     OrderList       targetOrders(coll, target, offset);
1179     OrderList       patternOrders(coll, pattern);
1180     int32_t         targetSize  = targetOrders.size() - 1;
1181     int32_t         patternSize = patternOrders.size() - 1;
1182     UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
1183                                                   target.getBuffer(), target.length(), &status);
1184
1185     if (patternSize == 0) {
1186         // Searching for an empty pattern always fails
1187         matchStart = matchEnd = -1;
1188         ubrk_close(charBreakIterator);
1189         return FALSE;
1190     }
1191
1192     matchStart = matchEnd = -1;
1193
1194     for(int32_t i = 0; i < targetSize; i += 1) {
1195         if (targetOrders.matchesAt(i, patternOrders)) {
1196             int32_t start    = targetOrders.getLowOffset(i);
1197             int32_t maxLimit = targetOrders.getLowOffset(i + patternSize);
1198             int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1);
1199
1200             // if the low and high offsets of the first CE in
1201             // the match are the same, it means that the match
1202             // starts in the middle of an expansion - all but
1203             // the first CE of the expansion will have the offset
1204             // of the following character.
1205             if (start == targetOrders.getHighOffset(i)) {
1206                 continue;
1207             }
1208
1209             // Make sure match starts on a grapheme boundary
1210             if (! ubrk_isBoundary(charBreakIterator, start)) {
1211                 continue;
1212             }
1213
1214             // If the low and high offsets of the CE after the match
1215             // are the same, it means that the match ends in the middle
1216             // of an expansion sequence.
1217             if (maxLimit == targetOrders.getHighOffset(i + patternSize) &&
1218                 targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) {
1219                 continue;
1220             }
1221
1222             int32_t mend = maxLimit;
1223
1224             // Find the first grapheme break after the character index
1225             // of the last CE in the match. If it's after character index
1226             // that's after the last CE in the match, use that index
1227             // as the end of the match.
1228             if (minLimit < maxLimit) {
1229                 // When the last CE's low index is same with its high index, the CE is likely
1230                 // a part of expansion. In this case, the index is located just after the
1231                 // character corresponding to the CEs compared above. If the index is right
1232                 // at the break boundary, move the position to the next boundary will result
1233                 // incorrect match length when there are ignorable characters exist between
1234                 // the position and the next character produces CE(s). See ticket#8482.
1235                 if (minLimit == targetOrders.getHighOffset(i + patternSize - 1) && ubrk_isBoundary(charBreakIterator, minLimit)) {
1236                     mend = minLimit;
1237                 } else {
1238                     int32_t nba = ubrk_following(charBreakIterator, minLimit);
1239
1240                     if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) {
1241                         mend = nba;
1242                     }
1243                 }
1244             }
1245
1246             if (mend > maxLimit) {
1247                 continue;
1248             }
1249
1250             if (! ubrk_isBoundary(charBreakIterator, mend)) {
1251                 continue;
1252             }
1253
1254             matchStart = start;
1255             matchEnd   = mend;
1256
1257             ubrk_close(charBreakIterator);
1258             return TRUE;
1259         }
1260     }
1261
1262     ubrk_close(charBreakIterator);
1263     return FALSE;
1264 }
1265
1266 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1267 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
1268     int32_t val = defaultVal;
1269
1270     name.append(" *= *(-?\\d+)");
1271
1272     UErrorCode status = U_ZERO_ERROR;
1273     RegexMatcher m(name, params, 0, status);
1274
1275     if (m.find()) {
1276         // The param exists.  Convert the string to an int.
1277         char valString[100];
1278         int32_t paramLength = m.end(1, status) - m.start(1, status);
1279
1280         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
1281             paramLength = (int32_t)(sizeof(valString)-2);
1282         }
1283
1284         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
1285         val = uprv_strtol(valString,  NULL, 10);
1286
1287         // Delete this parameter from the params string.
1288         m.reset();
1289         params = m.replaceFirst("", status);
1290     }
1291
1292   //U_ASSERT(U_SUCCESS(status));
1293     if (! U_SUCCESS(status)) {
1294         val = defaultVal;
1295     }
1296
1297     return val;
1298 }
1299 #endif
1300
1301 #if !UCONFIG_NO_COLLATION
1302 int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
1303                                     const char *name, const char *strength, uint32_t seed)
1304 {
1305     UErrorCode status = U_ZERO_ERROR;
1306     int32_t actualStart = -1, actualEnd = -1;
1307   //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
1308     int32_t expectedStart = -1, expectedEnd = -1;
1309     int32_t notFoundCount = 0;
1310     LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
1311                                                            testCase.getBuffer(), testCase.length(),
1312                                                            coll,
1313                                                            NULL,     // the break iterator
1314                                                            &status));
1315
1316     // **** TODO: find *all* matches, not just first one ****
1317     simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd);
1318
1319     usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status);
1320
1321     if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
1322         errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
1323               "    strength=%s seed=%d",
1324               name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
1325     }
1326
1327     if (expectedStart == -1 && actualStart == -1) {
1328         notFoundCount += 1;
1329     }
1330
1331     // **** TODO: find *all* matches, not just first one ****
1332     simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd);
1333
1334     usearch_setPattern(uss.getAlias(), altPattern.getBuffer(), altPattern.length(), &status);
1335
1336     usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status);
1337
1338     if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
1339         errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
1340               "    strength=%s seed=%d",
1341               name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
1342     }
1343
1344     if (expectedStart == -1 && actualStart == -1) {
1345         notFoundCount += 1;
1346     }
1347
1348     return notFoundCount;
1349 }
1350 #endif
1351
1352 void SSearchTest::monkeyTest(char *params)
1353 {
1354     // ook!
1355     UErrorCode status = U_ZERO_ERROR;
1356   //UCollator *coll = ucol_open(NULL, &status);
1357     UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status);
1358
1359     if (U_FAILURE(status)) {
1360         errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status));
1361         return;
1362     }
1363
1364     CollData  *monkeyData = new CollData(coll, status);
1365
1366     USet *expansions   = uset_openEmpty();
1367     USet *contractions = uset_openEmpty();
1368
1369     ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
1370
1371     U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
1372     U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
1373     USet *letters = uset_openPattern(letter_pattern, 39, &status);
1374     SetMonkey letterMonkey(letters);
1375     StringSetMonkey contractionMonkey(contractions, coll, monkeyData);
1376     StringSetMonkey expansionMonkey(expansions, coll, monkeyData);
1377     UnicodeString testCase;
1378     UnicodeString alternate;
1379     UnicodeString pattern, altPattern;
1380     UnicodeString prefix, altPrefix;
1381     UnicodeString suffix, altSuffix;
1382
1383     Monkey *monkeys[] = {
1384         &letterMonkey,
1385         &contractionMonkey,
1386         &expansionMonkey,
1387         &contractionMonkey,
1388         &expansionMonkey,
1389         &contractionMonkey,
1390         &expansionMonkey,
1391         &contractionMonkey,
1392         &expansionMonkey};
1393     int32_t monkeyCount = UPRV_LENGTHOF(monkeys);
1394     // int32_t nonMatchCount = 0;
1395
1396     UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY};
1397     const char *strengthNames[] = {"primary", "secondary", "tertiary"};
1398     int32_t strengthCount = UPRV_LENGTHOF(strengths);
1399     int32_t loopCount = quick? 1000 : 10000;
1400     int32_t firstStrength = 0;
1401     int32_t lastStrength  = strengthCount - 1; //*/ 0;
1402
1403     if (params != NULL) {
1404 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1405         UnicodeString p(params);
1406
1407         loopCount = getIntParam("loop", p, loopCount);
1408         m_seed    = getIntParam("seed", p, m_seed);
1409
1410         RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status);
1411         if (m.find()) {
1412             UnicodeString breakType = m.group(1, status);
1413
1414             for (int32_t s = 0; s < strengthCount; s += 1) {
1415                 if (breakType == strengthNames[s]) {
1416                     firstStrength = lastStrength = s;
1417                     break;
1418                 }
1419             }
1420
1421             m.reset();
1422             p = m.replaceFirst("", status);
1423         }
1424
1425         if (RegexMatcher("\\S", p, 0, status).find()) {
1426             // Each option is stripped out of the option string as it is processed.
1427             // All options have been checked.  The option string should have been completely emptied..
1428             char buf[100];
1429             p.extract(buf, sizeof(buf), NULL, status);
1430             buf[sizeof(buf)-1] = 0;
1431             errln("Unrecognized or extra parameter:  %s\n", buf);
1432             return;
1433         }
1434 #else
1435         infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
1436 #endif
1437     }
1438
1439     for(int32_t s = firstStrength; s <= lastStrength; s += 1) {
1440         int32_t notFoundCount = 0;
1441
1442         logln("Setting strength to %s.", strengthNames[s]);
1443         ucol_setStrength(coll, strengths[s]);
1444
1445         // TODO: try alternate prefix and suffix too?
1446         // TODO: alternates are only equal at primary strength. Is this OK?
1447         for(int32_t t = 0; t < loopCount; t += 1) {
1448             uint32_t seed = m_seed;
1449             // int32_t  nmc = 0;
1450
1451             generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern);
1452             generateTestCase(coll, monkeys, monkeyCount, prefix,  altPrefix);
1453             generateTestCase(coll, monkeys, monkeyCount, suffix,  altSuffix);
1454
1455             // pattern
1456             notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern, "pattern", strengthNames[s], seed);
1457
1458             testCase.remove();
1459             testCase.append(prefix);
1460             testCase.append(/*alt*/pattern);
1461
1462             // prefix + pattern
1463             notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern", strengthNames[s], seed);
1464
1465             testCase.append(suffix);
1466
1467             // prefix + pattern + suffix
1468             notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern + suffix", strengthNames[s], seed);
1469
1470             testCase.remove();
1471             testCase.append(pattern);
1472             testCase.append(suffix);
1473
1474             // pattern + suffix
1475             notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed);
1476         }
1477
1478        logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount);
1479     }
1480
1481     uset_close(contractions);
1482     uset_close(expansions);
1483     uset_close(letters);
1484     delete monkeyData;
1485
1486     ucol_close(coll);
1487 }
1488
1489 #endif
1490
1491 #endif