icuSources/test/intltest/ssearch.cpp

   1 /*
   2  **********************************************************************
   3  *   Copyright (C) 2005-2012, International Business Machines
   4  *   Corporation and others.  All Rights Reserved.
   5  **********************************************************************
   6  */
   7
   8
   9 #include "unicode/utypes.h"
  10
  11 #if !UCONFIG_NO_COLLATION
  12
  13 #include "unicode/unistr.h"
  14 #include "unicode/putil.h"
  15 #include "unicode/usearch.h"
  16
  17 #include "cmemory.h"
  18 #include "unicode/coll.h"
  19 #include "unicode/tblcoll.h"
  20 #include "unicode/coleitr.h"
  21 #include "unicode/ucoleitr.h"
  22
  23 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
  24
  25 #include "unicode/uniset.h"
  26 #include "unicode/uset.h"
  27 #include "unicode/ustring.h"
  28 #include "hash.h"
  29 #include "uhash.h"
  30 #include "ucol_imp.h"
  31
  32 #include "intltest.h"
  33 #include "ssearch.h"
  34
  35 #include "unicode/colldata.h"
  36 #include "unicode/bmsearch.h"
  37 #include "unicode/bms.h"
  38
  39 #include "xmlparser.h"
  40 #include "ucbuf.h"
  41
  42 #include <stdlib.h>
  43 #include <string.h>
  44 #include <stdio.h>
  45
  46 char testId[100];
  47
  48 #define TEST_ASSERT(x) {if (!(x)) { \
  49     errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, testId);}}
  50
  51 #define TEST_ASSERT_M(x, m) {if (!(x)) { \
  52     errln("Failure in file %s, line %d.   \"%s\"", __FILE__, __LINE__, m);return;}}
  53
  54 #define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \
  55     dataerrln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \
  56           __FILE__, __LINE__, testId, u_errorName(errcode));}}
  57
  58 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
  59 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
  60 #define DELETE_ARRAY(array) uprv_free((void *) (array))
  61
  62 //---------------------------------------------------------------------------
  63 //
  64 //  Test class boilerplate
  65 //
  66 //---------------------------------------------------------------------------
  67 SSearchTest::SSearchTest()
  68 {
  69 }
  70
  71 SSearchTest::~SSearchTest()
  72 {
  73 }
  74
  75 void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char *params )
  76 {
  77     if (exec) logln("TestSuite SSearchTest: ");
  78     switch (index) {
  79 #if !UCONFIG_NO_BREAK_ITERATION
  80        case 0: name = "searchTest";
  81             if (exec) searchTest();
  82             break;
  83
  84         case 1: name = "offsetTest";
  85             if (exec) offsetTest();
  86             break;
  87
  88         case 2: name = "monkeyTest";
  89             if (exec) monkeyTest(params);
  90             break;
  91
  92         case 3: name = "bmMonkeyTest";
  93             if (exec) bmMonkeyTest(params);
  94             break;
  95
  96         case 4: name = "boyerMooreTest";
  97             if (exec) boyerMooreTest();
  98             break;
  99
 100         case 5: name = "goodSuffixTest";
 101             if (exec) goodSuffixTest();
 102             break;
 103
 104         case 6: name = "searchTime";
 105             if (exec) searchTime();
 106             break;
 107
 108         case 7: name = "bmsTest";
 109             if (exec) bmsTest();
 110             break;
 111
 112         case 8: name = "bmSearchTest";
 113             if (exec) bmSearchTest();
 114             break;
 115
 116         case 9: name = "udhrTest";
 117             if (exec) udhrTest();
 118             break;
 119         case 10: name = "stringListTest";
 120             if (exec) stringListTest();
 121             break;
 122 #endif
 123         default: name = "";
 124             break; //needed to end loop
 125     }
 126 }
 127
 128
 129 #if !UCONFIG_NO_BREAK_ITERATION
 130
 131 #define PATH_BUFFER_SIZE 2048
 132 const char *SSearchTest::getPath(char buffer[2048], const char *filename) {
 133     UErrorCode status = U_ZERO_ERROR;
 134     const char *testDataDirectory = IntlTest::getSourceTestData(status);
 135
 136     if (U_FAILURE(status) || strlen(testDataDirectory) + strlen(filename) + 1 >= PATH_BUFFER_SIZE) {
 137         errln("ERROR: getPath() failed - %s", u_errorName(status));
 138         return NULL;
 139     }
 140
 141     strcpy(buffer, testDataDirectory);
 142     strcat(buffer, filename);
 143     return buffer;
 144 }
 145
 146
 147 void SSearchTest::searchTest()
 148 {
 149 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
 150     UErrorCode status = U_ZERO_ERROR;
 151     char path[PATH_BUFFER_SIZE];
 152     const char *testFilePath = getPath(path, "ssearch.xml");
 153
 154     if (testFilePath == NULL) {
 155         return; /* Couldn't get path: error message already output. */
 156     }
 157
 158     LocalPointer<UXMLParser> parser(UXMLParser::createParser(status));
 159     TEST_ASSERT_SUCCESS(status);
 160     LocalPointer<UXMLElement> root(parser->parseFile(testFilePath, status));
 161     TEST_ASSERT_SUCCESS(status);
 162     if (U_FAILURE(status)) {
 163         return;
 164     }
 165
 166     const UnicodeString *debugTestCase = root->getAttribute("debug");
 167     if (debugTestCase != NULL) {
 168 //       setenv("USEARCH_DEBUG", "1", 1);
 169     }
 170
 171
 172     const UXMLElement *testCase;
 173     int32_t tc = 0;
 174
 175     while((testCase = root->nextChildElement(tc)) != NULL) {
 176
 177         if (testCase->getTagName().compare("test-case") != 0) {
 178             errln("ssearch, unrecognized XML Element in test file");
 179             continue;
 180         }
 181         const UnicodeString *id       = testCase->getAttribute("id");
 182         *testId = 0;
 183         if (id != NULL) {
 184             id->extract(0, id->length(), testId,  sizeof(testId), US_INV);
 185         }
 186
 187         // If debugging test case has been specified and this is not it, skip to next.
 188         if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) {
 189             continue;
 190         }
 191         //
 192         //  Get the requested collation strength.
 193         //    Default is tertiary if the XML attribute is missing from the test case.
 194         //
 195         const UnicodeString *strength = testCase->getAttribute("strength");
 196         UColAttributeValue collatorStrength = UCOL_PRIMARY;
 197         if      (strength==NULL)          { collatorStrength = UCOL_TERTIARY;}
 198         else if (*strength=="PRIMARY")    { collatorStrength = UCOL_PRIMARY;}
 199         else if (*strength=="SECONDARY")  { collatorStrength = UCOL_SECONDARY;}
 200         else if (*strength=="TERTIARY")   { collatorStrength = UCOL_TERTIARY;}
 201         else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;}
 202         else if (*strength=="IDENTICAL")  { collatorStrength = UCOL_IDENTICAL;}
 203         else {
 204             // Bogus value supplied for strength.  Shouldn't happen, even from
 205             //  typos, if the  XML source has been validated.
 206             //  This assert is a little deceiving in that strength can be
 207             //   any of the allowed values, not just TERTIARY, but it will
 208             //   do the job of getting the error output.
 209             TEST_ASSERT(*strength=="TERTIARY")
 210         }
 211
 212         //
 213         // Get the collator normalization flag.  Default is UCOL_OFF.
 214         //
 215         UColAttributeValue normalize = UCOL_OFF;
 216         const UnicodeString *norm = testCase->getAttribute("norm");
 217         TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF");
 218         if (norm!=NULL && *norm=="ON") {
 219             normalize = UCOL_ON;
 220         }
 221
 222         //
 223         // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE.
 224         //
 225         UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE;
 226         const UnicodeString *alt = testCase->getAttribute("alternate_handling");
 227         TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE");
 228         if (alt != NULL && *alt == "SHIFTED") {
 229             alternateHandling = UCOL_SHIFTED;
 230         }
 231
 232         const UnicodeString defLocale("en");
 233         char  clocale[100];
 234         const UnicodeString *locale   = testCase->getAttribute("locale");
 235         if (locale == NULL || locale->length()==0) {
 236             locale = &defLocale;
 237         };
 238         locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL);
 239
 240
 241         UnicodeString  text;
 242         UnicodeString  target;
 243         UnicodeString  pattern;
 244         int32_t        expectedMatchStart = -1;
 245         int32_t        expectedMatchLimit = -1;
 246         const UXMLElement  *n;
 247         int32_t                nodeCount = 0;
 248
 249         n = testCase->getChildElement("pattern");
 250         TEST_ASSERT(n != NULL);
 251         if (n==NULL) {
 252             continue;
 253         }
 254         text = n->getText(FALSE);
 255         text = text.unescape();
 256         pattern.append(text);
 257         nodeCount++;
 258
 259         n = testCase->getChildElement("pre");
 260         if (n!=NULL) {
 261             text = n->getText(FALSE);
 262             text = text.unescape();
 263             target.append(text);
 264             nodeCount++;
 265         }
 266
 267         n = testCase->getChildElement("m");
 268         if (n!=NULL) {
 269             expectedMatchStart = target.length();
 270             text = n->getText(FALSE);
 271             text = text.unescape();
 272             target.append(text);
 273             expectedMatchLimit = target.length();
 274             nodeCount++;
 275         }
 276
 277         n = testCase->getChildElement("post");
 278         if (n!=NULL) {
 279             text = n->getText(FALSE);
 280             text = text.unescape();
 281             target.append(text);
 282             nodeCount++;
 283         }
 284
 285         //  Check that there weren't extra things in the XML
 286         TEST_ASSERT(nodeCount == testCase->countChildren());
 287
 288         // Open a collator and StringSearch based on the parameters
 289         //   obtained from the XML.
 290         //
 291         status = U_ZERO_ERROR;
 292         LocalUCollatorPointer collator(ucol_open(clocale, &status));
 293         ucol_setStrength(collator.getAlias(), collatorStrength);
 294         ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status);
 295         ucol_setAttribute(collator.getAlias(), UCOL_ALTERNATE_HANDLING, alternateHandling, &status);
 296         LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
 297                                                                target.getBuffer(), target.length(),
 298                                                                collator.getAlias(),
 299                                                                NULL,     // the break iterator
 300                                                                &status));
 301
 302         TEST_ASSERT_SUCCESS(status);
 303         if (U_FAILURE(status)) {
 304             continue;
 305         }
 306
 307         int32_t foundStart = 0;
 308         int32_t foundLimit = 0;
 309         UBool   foundMatch;
 310
 311         //
 312         // Do the search, check the match result against the expected results.
 313         //
 314         foundMatch= usearch_search(uss.getAlias(), 0, &foundStart, &foundLimit, &status);
 315         TEST_ASSERT_SUCCESS(status);
 316         if ((foundMatch && expectedMatchStart<0) ||
 317             (foundStart != expectedMatchStart)   ||
 318             (foundLimit != expectedMatchLimit)) {
 319                 TEST_ASSERT(FALSE);   //  ouput generic error position
 320                 infoln("Found, expected match start = %d, %d \n"
 321                        "Found, expected match limit = %d, %d",
 322                 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
 323         }
 324
 325         // In case there are other matches...
 326         // (should we only do this if the test case passed?)
 327         while (foundMatch) {
 328             expectedMatchStart = foundStart;
 329             expectedMatchLimit = foundLimit;
 330
 331             foundMatch = usearch_search(uss.getAlias(), foundLimit, &foundStart, &foundLimit, &status);
 332         }
 333
 334         uss.adoptInstead(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
 335             target.getBuffer(), target.length(),
 336             collator.getAlias(),
 337             NULL,
 338             &status));
 339
 340         //
 341         // Do the backwards search, check the match result against the expected results.
 342         //
 343         foundMatch= usearch_searchBackwards(uss.getAlias(), target.length(), &foundStart, &foundLimit, &status);
 344         TEST_ASSERT_SUCCESS(status);
 345         if ((foundMatch && expectedMatchStart<0) ||
 346             (foundStart != expectedMatchStart)   ||
 347             (foundLimit != expectedMatchLimit)) {
 348                 TEST_ASSERT(FALSE);   //  ouput generic error position
 349                 infoln("Found, expected backwards match start = %d, %d \n"
 350                        "Found, expected backwards match limit = %d, %d",
 351                 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
 352         }
 353     }
 354 #endif
 355 }
 356
 357 struct UdhrTestCase
 358 {
 359     const char *locale;
 360     const char *file;
 361 };
 362
 363 void SSearchTest::udhrTest()
 364 {
 365     UErrorCode status = U_ZERO_ERROR;
 366     char path[PATH_BUFFER_SIZE];
 367     const char *udhrPath = getPath(path, "udhr");
 368
 369     if (udhrPath == NULL) {
 370         // couldn't get path: error message already output...
 371         return;
 372     }
 373
 374     UdhrTestCase testCases[] = {
 375         {"en", "udhr_eng.txt"},
 376         {"de", "udhr_deu_1996.txt"},
 377         {"fr", "udhr_fra.txt"},
 378         {"ru", "udhr_rus.txt"},
 379         {"th", "udhr_tha.txt"},
 380         {"ja", "udhr_jpn.txt"},
 381         {"ko", "udhr_kor.txt"},
 382         {"zh", "udhr_cmn_hans.txt"},
 383         {"zh_Hant", "udhr_cmn_hant.txt"}
 384     };
 385
 386     int32_t testCount = ARRAY_SIZE(testCases);
 387
 388     for (int32_t t = 0; t < testCount; t += 1) {
 389         int32_t len = 0;
 390         char *resolvedFileName = NULL;
 391         const char *encoding = NULL;
 392         UCHARBUF *ucharBuf = NULL;
 393
 394         ucbuf_resolveFileName(udhrPath, testCases[t].file, NULL, &len, &status);
 395         resolvedFileName = NEW_ARRAY(char, len);
 396
 397         if(resolvedFileName == NULL){
 398             continue;
 399         }
 400
 401         if(status == U_BUFFER_OVERFLOW_ERROR){
 402             status = U_ZERO_ERROR;
 403         }
 404
 405         ucbuf_resolveFileName(udhrPath, testCases[t].file, resolvedFileName, &len, &status);
 406         ucharBuf = ucbuf_open(resolvedFileName, &encoding, TRUE, FALSE, &status);
 407
 408         DELETE_ARRAY(resolvedFileName);
 409
 410         if(U_FAILURE(status)){
 411             infoln("Could not open the input file %s. Test skipped\n", testCases[t].file);
 412             continue;
 413         }
 414
 415         int32_t targetLen = 0;
 416         const UChar *target = ucbuf_getBuffer(ucharBuf, &targetLen, &status);
 417
 418         /* The first line of the file contains the pattern */
 419         int32_t start = 0, end = 0, plen = 0;
 420
 421         for(end = start; ; end += 1) {
 422             UChar ch = target[end];
 423
 424             if (ch == 0x000A || ch == 0x000D || ch == 0x2028) {
 425                 break;
 426             }
 427         }
 428
 429         plen = end - start;
 430
 431         UChar *pattern = NEW_ARRAY(UChar, plen);
 432         for (int32_t i = 0; i < plen; i += 1) {
 433             pattern[i] =  target[start++];
 434         }
 435
 436         int32_t offset = 0;
 437         UCollator *coll = ucol_open(testCases[t].locale, &status);
 438         UCD *ucd = NULL;
 439         BMS *bms = NULL;
 440
 441         if (U_FAILURE(status)) {
 442             errln("Could not open collator for %s", testCases[t].locale);
 443             goto delete_collator;
 444         }
 445
 446         ucd = ucd_open(coll, &status);
 447
 448         if (U_FAILURE(status)) {
 449             errln("Could not open CollData object for %s", testCases[t].locale);
 450             goto delete_ucd;
 451         }
 452
 453         bms = bms_open(ucd, pattern, plen, target, targetLen, &status);
 454
 455         if (U_FAILURE(status)) {
 456             errln("Could not open search object for %s", testCases[t].locale);
 457             goto delete_bms;
 458         }
 459
 460         start = end = -1;
 461         while (bms_search(bms, offset, &start, &end)) {
 462             offset = end;
 463         }
 464
 465         if (offset == 0) {
 466             errln("Could not find pattern - locale: %s, file: %s ", testCases[t].locale, testCases[t].file);
 467         }
 468
 469 delete_bms:
 470         bms_close(bms);
 471
 472 delete_ucd:
 473         ucd_close(ucd);
 474
 475 delete_collator:
 476         ucol_close(coll);
 477
 478         DELETE_ARRAY(pattern);
 479         ucbuf_close(ucharBuf);
 480     }
 481
 482     ucd_flushCache();
 483 }
 484
 485 void SSearchTest::bmSearchTest()
 486 {
 487 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
 488     UErrorCode status = U_ZERO_ERROR;
 489     char path[PATH_BUFFER_SIZE];
 490     const char *testFilePath = getPath(path, "ssearch.xml");
 491
 492     if (testFilePath == NULL) {
 493         return; /* Couldn't get path: error message already output. */
 494     }
 495
 496     UXMLParser  *parser = UXMLParser::createParser(status);
 497     TEST_ASSERT_SUCCESS(status);
 498     UXMLElement *root   = parser->parseFile(testFilePath, status);
 499     TEST_ASSERT_SUCCESS(status);
 500     if (U_FAILURE(status)) {
 501         return;
 502     }
 503
 504     const UnicodeString *debugTestCase = root->getAttribute("debug");
 505     if (debugTestCase != NULL) {
 506 //       setenv("USEARCH_DEBUG", "1", 1);
 507     }
 508
 509
 510     const UXMLElement *testCase;
 511     int32_t tc = 0;
 512
 513     while((testCase = root->nextChildElement(tc)) != NULL) {
 514
 515         if (testCase->getTagName().compare("test-case") != 0) {
 516             errln("ssearch, unrecognized XML Element in test file");
 517             continue;
 518         }
 519         const UnicodeString *id       = testCase->getAttribute("id");
 520         *testId = 0;
 521         if (id != NULL) {
 522             id->extract(0, id->length(), testId,  sizeof(testId), US_INV);
 523         }
 524
 525         // If debugging test case has been specified and this is not it, skip to next.
 526         if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) {
 527             continue;
 528         }
 529         //
 530         //  Get the requested collation strength.
 531         //    Default is tertiary if the XML attribute is missing from the test case.
 532         //
 533         const UnicodeString *strength = testCase->getAttribute("strength");
 534         UColAttributeValue collatorStrength = UCOL_PRIMARY;
 535         if      (strength==NULL)          { collatorStrength = UCOL_TERTIARY;}
 536         else if (*strength=="PRIMARY")    { collatorStrength = UCOL_PRIMARY;}
 537         else if (*strength=="SECONDARY")  { collatorStrength = UCOL_SECONDARY;}
 538         else if (*strength=="TERTIARY")   { collatorStrength = UCOL_TERTIARY;}
 539         else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;}
 540         else if (*strength=="IDENTICAL")  { collatorStrength = UCOL_IDENTICAL;}
 541         else {
 542             // Bogus value supplied for strength.  Shouldn't happen, even from
 543             //  typos, if the  XML source has been validated.
 544             //  This assert is a little deceiving in that strength can be
 545             //   any of the allowed values, not just TERTIARY, but it will
 546             //   do the job of getting the error output.
 547             TEST_ASSERT(*strength=="TERTIARY")
 548         }
 549
 550         //
 551         // Get the collator normalization flag.  Default is UCOL_OFF.
 552         //
 553         UColAttributeValue normalize = UCOL_OFF;
 554         const UnicodeString *norm = testCase->getAttribute("norm");
 555         TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF");
 556         if (norm!=NULL && *norm=="ON") {
 557             normalize = UCOL_ON;
 558         }
 559
 560         //
 561         // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE.
 562         //
 563         UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE;
 564         const UnicodeString *alt = testCase->getAttribute("alternate_handling");
 565         TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE");
 566         if (alt != NULL && *alt == "SHIFTED") {
 567             alternateHandling = UCOL_SHIFTED;
 568         }
 569
 570         const UnicodeString defLocale("en");
 571         char  clocale[100];
 572         const UnicodeString *locale   = testCase->getAttribute("locale");
 573         if (locale == NULL || locale->length()==0) {
 574             locale = &defLocale;
 575         };
 576         locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL);
 577
 578
 579         UnicodeString  text;
 580         UnicodeString  target;
 581         UnicodeString  pattern;
 582         int32_t        expectedMatchStart = -1;
 583         int32_t        expectedMatchLimit = -1;
 584         const UXMLElement  *n;
 585         int32_t                nodeCount = 0;
 586
 587         n = testCase->getChildElement("pattern");
 588         TEST_ASSERT(n != NULL);
 589         if (n==NULL) {
 590             continue;
 591         }
 592         text = n->getText(FALSE);
 593         text = text.unescape();
 594         pattern.append(text);
 595         nodeCount++;
 596
 597         n = testCase->getChildElement("pre");
 598         if (n!=NULL) {
 599             text = n->getText(FALSE);
 600             text = text.unescape();
 601             target.append(text);
 602             nodeCount++;
 603         }
 604
 605         n = testCase->getChildElement("m");
 606         if (n!=NULL) {
 607             expectedMatchStart = target.length();
 608             text = n->getText(FALSE);
 609             text = text.unescape();
 610             target.append(text);
 611             expectedMatchLimit = target.length();
 612             nodeCount++;
 613         }
 614
 615         n = testCase->getChildElement("post");
 616         if (n!=NULL) {
 617             text = n->getText(FALSE);
 618             text = text.unescape();
 619             target.append(text);
 620             nodeCount++;
 621         }
 622
 623         //  Check that there weren't extra things in the XML
 624         TEST_ASSERT(nodeCount == testCase->countChildren());
 625
 626         // Open a collator and StringSearch based on the parameters
 627         //   obtained from the XML.
 628         //
 629         status = U_ZERO_ERROR;
 630         UCollator *collator = ucol_open(clocale, &status);
 631         ucol_setStrength(collator, collatorStrength);
 632         ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, normalize, &status);
 633         ucol_setAttribute(collator, UCOL_ALTERNATE_HANDLING, alternateHandling, &status);
 634         UCD *ucd = ucd_open(collator, &status);
 635         BMS *bms = bms_open(ucd, pattern.getBuffer(), pattern.length(), target.getBuffer(), target.length(), &status);
 636
 637         TEST_ASSERT_SUCCESS(status);
 638         if (U_FAILURE(status)) {
 639             bms_close(bms);
 640             ucd_close(ucd);
 641             ucol_close(collator);
 642             continue;
 643         }
 644
 645         int32_t foundStart = 0;
 646         int32_t foundLimit = 0;
 647         UBool   foundMatch;
 648
 649         //
 650         // Do the search, check the match result against the expected results.
 651         //
 652         foundMatch = bms_search(bms, 0, &foundStart, &foundLimit);
 653       //TEST_ASSERT_SUCCESS(status);
 654         if ((foundMatch && expectedMatchStart < 0) ||
 655             (foundStart != expectedMatchStart)     ||
 656             (foundLimit != expectedMatchLimit)) {
 657                 TEST_ASSERT(FALSE);   //  ouput generic error position
 658                 infoln("Found, expected match start = %d, %d \n"
 659                        "Found, expected match limit = %d, %d",
 660                 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
 661         }
 662
 663         bms_close(bms);
 664         ucd_close(ucd);
 665         ucol_close(collator);
 666     }
 667
 668     ucd_flushCache();
 669     delete root;
 670     delete parser;
 671 #endif
 672 }
 673
 674 struct Order
 675 {
 676     int32_t order;
 677     int32_t lowOffset;
 678     int32_t highOffset;
 679 };
 680
 681 class OrderList
 682 {
 683 public:
 684     OrderList();
 685     OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset = 0);
 686     ~OrderList();
 687
 688     int32_t size(void) const;
 689     void add(int32_t order, int32_t low, int32_t high);
 690     const Order *get(int32_t index) const;
 691     int32_t getLowOffset(int32_t index) const;
 692     int32_t getHighOffset(int32_t index) const;
 693     int32_t getOrder(int32_t index) const;
 694     void reverse(void);
 695     UBool compare(const OrderList &other) const;
 696     UBool matchesAt(int32_t offset, const OrderList &other) const;
 697
 698 private:
 699     Order *list;
 700     int32_t listMax;
 701     int32_t listSize;
 702 };
 703
 704 OrderList::OrderList()
 705   : list(NULL),  listMax(16), listSize(0)
 706 {
 707     list = new Order[listMax];
 708 }
 709
 710 OrderList::OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset)
 711     : list(NULL), listMax(16), listSize(0)
 712 {
 713     UErrorCode status = U_ZERO_ERROR;
 714     UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
 715     uint32_t strengthMask = 0;
 716     int32_t order, low, high;
 717
 718     switch (ucol_getStrength(coll))
 719     {
 720     default:
 721         strengthMask |= UCOL_TERTIARYORDERMASK;
 722         /* fall through */
 723
 724     case UCOL_SECONDARY:
 725         strengthMask |= UCOL_SECONDARYORDERMASK;
 726         /* fall through */
 727
 728     case UCOL_PRIMARY:
 729         strengthMask |= UCOL_PRIMARYORDERMASK;
 730     }
 731
 732     list = new Order[listMax];
 733
 734     ucol_setOffset(elems, stringOffset, &status);
 735
 736     do {
 737         low   = ucol_getOffset(elems);
 738         order = ucol_next(elems, &status);
 739         high  = ucol_getOffset(elems);
 740
 741         if (order != UCOL_NULLORDER) {
 742             order &= strengthMask;
 743         }
 744
 745         if (order != UCOL_IGNORABLE) {
 746             add(order, low, high);
 747         }
 748     } while (order != UCOL_NULLORDER);
 749
 750     ucol_closeElements(elems);
 751 }
 752
 753 OrderList::~OrderList()
 754 {
 755     delete[] list;
 756 }
 757
 758 void OrderList::add(int32_t order, int32_t low, int32_t high)
 759 {
 760     if (listSize >= listMax) {
 761         listMax *= 2;
 762
 763         Order *newList = new Order[listMax];
 764
 765         uprv_memcpy(newList, list, listSize * sizeof(Order));
 766         delete[] list;
 767         list = newList;
 768     }
 769
 770     list[listSize].order      = order;
 771     list[listSize].lowOffset  = low;
 772     list[listSize].highOffset = high;
 773
 774     listSize += 1;
 775 }
 776
 777 const Order *OrderList::get(int32_t index) const
 778 {
 779     if (index >= listSize) {
 780         return NULL;
 781     }
 782
 783     return &list[index];
 784 }
 785
 786 int32_t OrderList::getLowOffset(int32_t index) const
 787 {
 788     const Order *order = get(index);
 789
 790     if (order != NULL) {
 791         return order->lowOffset;
 792     }
 793
 794     return -1;
 795 }
 796
 797 int32_t OrderList::getHighOffset(int32_t index) const
 798 {
 799     const Order *order = get(index);
 800
 801     if (order != NULL) {
 802         return order->highOffset;
 803     }
 804
 805     return -1;
 806 }
 807
 808 int32_t OrderList::getOrder(int32_t index) const
 809 {
 810     const Order *order = get(index);
 811
 812     if (order != NULL) {
 813         return order->order;
 814     }
 815
 816     return UCOL_NULLORDER;
 817 }
 818
 819 int32_t OrderList::size() const
 820 {
 821     return listSize;
 822 }
 823
 824 void OrderList::reverse()
 825 {
 826     for(int32_t f = 0, b = listSize - 1; f < b; f += 1, b -= 1) {
 827         Order swap = list[b];
 828
 829         list[b] = list[f];
 830         list[f] = swap;
 831     }
 832 }
 833
 834 UBool OrderList::compare(const OrderList &other) const
 835 {
 836     if (listSize != other.listSize) {
 837         return FALSE;
 838     }
 839
 840     for(int32_t i = 0; i < listSize; i += 1) {
 841         if (list[i].order  != other.list[i].order ||
 842             list[i].lowOffset != other.list[i].lowOffset ||
 843             list[i].highOffset != other.list[i].highOffset) {
 844                 return FALSE;
 845         }
 846     }
 847
 848     return TRUE;
 849 }
 850
 851 UBool OrderList::matchesAt(int32_t offset, const OrderList &other) const
 852 {
 853     // NOTE: sizes include the NULLORDER, which we don't want to compare.
 854     int32_t otherSize = other.size() - 1;
 855
 856     if (listSize - 1 - offset < otherSize) {
 857         return FALSE;
 858     }
 859
 860     for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) {
 861         if (getOrder(i) != other.getOrder(j)) {
 862             return FALSE;
 863         }
 864     }
 865
 866     return TRUE;
 867 }
 868
 869 static char *printOffsets(char *buffer, OrderList &list)
 870 {
 871     int32_t size = list.size();
 872     char *s = buffer;
 873
 874     for(int32_t i = 0; i < size; i += 1) {
 875         const Order *order = list.get(i);
 876
 877         if (i != 0) {
 878             s += sprintf(s, ", ");
 879         }
 880
 881         s += sprintf(s, "(%d, %d)", order->lowOffset, order->highOffset);
 882     }
 883
 884     return buffer;
 885 }
 886
 887 static char *printOrders(char *buffer, OrderList &list)
 888 {
 889     int32_t size = list.size();
 890     char *s = buffer;
 891
 892     for(int32_t i = 0; i < size; i += 1) {
 893         const Order *order = list.get(i);
 894
 895         if (i != 0) {
 896             s += sprintf(s, ", ");
 897         }
 898
 899         s += sprintf(s, "%8.8X", order->order);
 900     }
 901
 902     return buffer;
 903 }
 904
 905 void SSearchTest::offsetTest()
 906 {
 907     const char *test[] = {
 908         // The sequence \u0FB3\u0F71\u0F71\u0F80 contains a discontiguous
 909         // contraction (\u0FB3\u0F71\u0F80) logically followed by \u0F71.
 910         "\\u1E33\\u0FB3\\u0F71\\u0F71\\u0F80\\uD835\\uDF6C\\u01B0",
 911
 912         "\\ua191\\u16ef\\u2036\\u017a",
 913
 914 #if 0
 915         // This results in a complex interaction between contraction,
 916         // expansion and normalization that confuses the backwards offset fixups.
 917         "\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
 918 #endif
 919
 920         "\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
 921         "\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3",
 922
 923         "\\u02FE\\u02FF"
 924         "\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\\u030A\\u030B\\u030C\\u030D\\u030E\\u030F"
 925         "\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\\u031A\\u031B\\u031C\\u031D\\u031E\\u031F"
 926         "\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F"
 927         "\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\\u033A\\u033B\\u033C\\u033D\\u033E\\u033F"
 928         "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\\u034A\\u034B\\u034C\\u034D\\u034E", // currently not working, see #8081
 929
 930         "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318", // currently not working, see #8081
 931         "a\\u02FF\\u0301\\u0316", // currently not working, see #8081
 932         "a\\u02FF\\u0316\\u0301",
 933         "a\\u0430\\u0301\\u0316",
 934         "a\\u0430\\u0316\\u0301",
 935         "abc\\u0E41\\u0301\\u0316",
 936         "abc\\u0E41\\u0316\\u0301",
 937         "\\u0E41\\u0301\\u0316",
 938         "\\u0E41\\u0316\\u0301",
 939         "a\\u0301\\u0316",
 940         "a\\u0316\\u0301",
 941         "\\uAC52\\uAC53",
 942         "\\u34CA\\u34CB",
 943         "\\u11ED\\u11EE",
 944         "\\u30C3\\u30D0",
 945         "p\\u00E9ch\\u00E9",
 946         "a\\u0301\\u0325",
 947         "a\\u0300\\u0325",
 948         "a\\u0325\\u0300",
 949         "A\\u0323\\u0300B",
 950         "A\\u0300\\u0323B",
 951         "A\\u0301\\u0323B",
 952         "A\\u0302\\u0301\\u0323B",
 953         "abc",
 954         "ab\\u0300c",
 955         "ab\\u0300\\u0323c",
 956         " \\uD800\\uDC00\\uDC00",
 957         "a\\uD800\\uDC00\\uDC00",
 958         "A\\u0301\\u0301",
 959         "A\\u0301\\u0323",
 960         "A\\u0301\\u0323B",
 961         "B\\u0301\\u0323C",
 962         "A\\u0300\\u0323B",
 963         "\\u0301A\\u0301\\u0301",
 964         "abcd\\r\\u0301",
 965         "p\\u00EAche",
 966         "pe\\u0302che",
 967     };
 968
 969     int32_t testCount = ARRAY_SIZE(test);
 970     UErrorCode status = U_ZERO_ERROR;
 971     RuleBasedCollator *col = (RuleBasedCollator *) Collator::createInstance(Locale::getEnglish(), status);
 972     if (U_FAILURE(status)) {
 973         errcheckln(status, "Failed to create collator in offsetTest! - %s", u_errorName(status));
 974         return;
 975     }
 976     char buffer[4096];  // A bit of a hack... just happens to be long enough for all the test cases...
 977                         // We could allocate one that's the right size by (CE_count * 10) + 2
 978                         // 10 chars is enough room for 8 hex digits plus ", ". 2 extra chars for "[" and "]"
 979
 980     col->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
 981
 982     for(int32_t i = 0; i < testCount; i += 1) {
 983         if (!isICUVersionAtLeast(50, 0) && i>=4 && i<=6) {
 984             continue; // timebomb until ticket #8080 is resolved
 985         }
 986         UnicodeString ts = CharsToUnicodeString(test[i]);
 987         CollationElementIterator *iter = col->createCollationElementIterator(ts);
 988         OrderList forwardList;
 989         OrderList backwardList;
 990         int32_t order, low, high;
 991
 992         do {
 993             low   = iter->getOffset();
 994             order = iter->next(status);
 995             high  = iter->getOffset();
 996
 997             forwardList.add(order, low, high);
 998         } while (order != CollationElementIterator::NULLORDER);
 999
1000         iter->reset();
1001         iter->setOffset(ts.length(), status);
1002
1003         backwardList.add(CollationElementIterator::NULLORDER, iter->getOffset(), iter->getOffset());
1004
1005         do {
1006             high  = iter->getOffset();
1007             order = iter->previous(status);
1008             low   = iter->getOffset();
1009
1010             if (order == CollationElementIterator::NULLORDER) {
1011                 break;
1012             }
1013
1014             backwardList.add(order, low, high);
1015         } while (TRUE);
1016
1017         backwardList.reverse();
1018
1019         if (forwardList.compare(backwardList)) {
1020             logln("Works with \"%s\"", test[i]);
1021             logln("Forward offsets:  [%s]", printOffsets(buffer, forwardList));
1022 //          logln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
1023
1024             logln("Forward CEs:  [%s]", printOrders(buffer, forwardList));
1025 //          logln("Backward CEs: [%s]", printOrders(buffer, backwardList));
1026
1027             logln();
1028         } else {
1029             errln("Fails with \"%s\"", test[i]);
1030             infoln("Forward offsets:  [%s]", printOffsets(buffer, forwardList));
1031             infoln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
1032
1033             infoln("Forward CEs:  [%s]", printOrders(buffer, forwardList));
1034             infoln("Backward CEs: [%s]", printOrders(buffer, backwardList));
1035
1036             infoln();
1037         }
1038         delete iter;
1039     }
1040     delete col;
1041 }
1042
1043 #if 0
1044 static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer)
1045 {
1046     for(int32_t i = 0; i < string.length(); i += 1) {
1047         UChar32 ch = string.char32At(i);
1048
1049         if (ch >= 0x0020 && ch <= 0x007F) {
1050             if (ch == 0x005C) {
1051                 buffer.append("\\\\");
1052             } else {
1053                 buffer.append(ch);
1054             }
1055         } else {
1056             char cbuffer[12];
1057
1058             if (ch <= 0xFFFFL) {
1059                 sprintf(cbuffer, "\\u%4.4X", ch);
1060             } else {
1061                 sprintf(cbuffer, "\\U%8.8X", ch);
1062             }
1063
1064             buffer.append(cbuffer);
1065         }
1066
1067         if (ch >= 0x10000L) {
1068             i += 1;
1069         }
1070     }
1071
1072     return buffer;
1073 }
1074 #endif
1075
1076 #if 1
1077
// One processed collation element together with the low and high offsets
// stored alongside it.
struct PCE
{
    uint64_t ce;                     // the processed collation element value
    int32_t  lowOffset, highOffset;  // offsets recorded with this element
};
1084
1085 class PCEList
1086 {
1087 public:
1088     PCEList(UCollator *coll, const UnicodeString &string);
1089     ~PCEList();
1090
1091     int32_t size() const;
1092
1093     const PCE *get(int32_t index) const;
1094
1095     int32_t getLowOffset(int32_t index) const;
1096     int32_t getHighOffset(int32_t index) const;
1097     uint64_t getOrder(int32_t index) const;
1098
1099     UBool matchesAt(int32_t offset, const PCEList &other) const;
1100
1101     uint64_t operator[](int32_t index) const;
1102
1103 private:
1104     void add(uint64_t ce, int32_t low, int32_t high);
1105
1106     PCE *list;
1107     int32_t listMax;
1108     int32_t listSize;
1109 };
1110
1111 PCEList::PCEList(UCollator *coll, const UnicodeString &string)
1112 {
1113     UErrorCode status = U_ZERO_ERROR;
1114     UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
1115     uint64_t order;
1116     int32_t low, high;
1117
1118     list = new PCE[listMax];
1119
1120     ucol_setOffset(elems, 0, &status);
1121
1122     do {
1123         order = ucol_nextProcessed(elems, &low, &high, &status);
1124         add(order, low, high);
1125     } while (order != UCOL_PROCESSED_NULLORDER);
1126
1127     ucol_closeElements(elems);
1128 }
1129
// Releases the entry array allocated with new[].
PCEList::~PCEList()
{
    delete[] list;
}
1134
1135 void PCEList::add(uint64_t order, int32_t low, int32_t high)
1136 {
1137     if (listSize >= listMax) {
1138         listMax *= 2;
1139
1140         PCE *newList = new PCE[listMax];
1141
1142         uprv_memcpy(newList, list, listSize * sizeof(Order));
1143         delete[] list;
1144         list = newList;
1145     }
1146
1147     list[listSize].ce         = order;
1148     list[listSize].lowOffset  = low;
1149     list[listSize].highOffset = high;
1150
1151     listSize += 1;
1152 }
1153
1154 const PCE *PCEList::get(int32_t index) const
1155 {
1156     if (index >= listSize) {
1157         return NULL;
1158     }
1159
1160     return &list[index];
1161 }
1162
1163 int32_t PCEList::getLowOffset(int32_t index) const
1164 {
1165     const PCE *pce = get(index);
1166
1167     if (pce != NULL) {
1168         return pce->lowOffset;
1169     }
1170
1171     return -1;
1172 }
1173
1174 int32_t PCEList::getHighOffset(int32_t index) const
1175 {
1176     const PCE *pce = get(index);
1177
1178     if (pce != NULL) {
1179         return pce->highOffset;
1180     }
1181
1182     return -1;
1183 }
1184
1185 uint64_t PCEList::getOrder(int32_t index) const
1186 {
1187     const PCE *pce = get(index);
1188
1189     if (pce != NULL) {
1190         return pce->ce;
1191     }
1192
1193     return UCOL_PROCESSED_NULLORDER;
1194 }
1195
// Returns the number of entries stored in the list.
int32_t PCEList::size() const
{
    return listSize;
}
1200
1201 UBool PCEList::matchesAt(int32_t offset, const PCEList &other) const
1202 {
1203     // NOTE: sizes include the NULLORDER, which we don't want to compare.
1204     int32_t otherSize = other.size() - 1;
1205
1206     if (listSize - 1 - offset < otherSize) {
1207         return FALSE;
1208     }
1209
1210     for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) {
1211         if (getOrder(i) != other.getOrder(j)) {
1212             return FALSE;
1213         }
1214     }
1215
1216     return TRUE;
1217 }
1218
1219 uint64_t PCEList::operator[](int32_t index) const
1220 {
1221     return getOrder(index);
1222 }
1223
1224 void SSearchTest::boyerMooreTest()
1225 {
1226     UErrorCode status = U_ZERO_ERROR;
1227     UCollator *coll = NULL;
1228     CollData *data = NULL;
1229     const CEList* ce = NULL;
1230     const CEList* ce1 = NULL;
1231     UnicodeString lp  = "fuss";
1232     UnicodeString sp = "fu\\u00DF";
1233     BoyerMooreSearch *longPattern = NULL;
1234     BoyerMooreSearch *shortPattern = NULL;
1235     UnicodeString targets[]  = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball",
1236                                 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF",
1237                                 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"};
1238     int32_t start = -1, end = -1;
1239
1240     coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status);
1241     if (U_FAILURE(status)) {
1242         errcheckln(status, "Could not open collator. - %s", u_errorName(status));
1243         return;
1244     }
1245
1246     data = CollData::open(coll, status);
1247     if (U_FAILURE(status)) {
1248         errln("Could not open CollData object.");
1249         goto close_data;
1250     }
1251
1252     data->getDynamicClassID();
1253     if (U_FAILURE(status)) {
1254         errln("Could not get dynamic class ID of CollData.");
1255         goto close_patterns;
1256     }
1257
1258     data->getStaticClassID();
1259     if (U_FAILURE(status)) {
1260         errln("Could not get static class ID of CollData.");
1261         goto close_patterns;
1262     }
1263
1264     longPattern = new BoyerMooreSearch(data, lp.unescape(), NULL, status);
1265     shortPattern = new BoyerMooreSearch(data, sp.unescape(), NULL, status);
1266     if (U_FAILURE(status)) {
1267         errln("Could not create pattern objects.");
1268         goto close_patterns;
1269     }
1270
1271     longPattern->getBadCharacterTable();
1272     shortPattern->getBadCharacterTable();
1273     if (U_FAILURE(status)) {
1274         errln("Could not get bad character table.");
1275         goto close_patterns;
1276     }
1277
1278     longPattern->getGoodSuffixTable();
1279     shortPattern->getGoodSuffixTable();
1280     if (U_FAILURE(status)) {
1281         errln("Could not get good suffix table.");
1282         goto close_patterns;
1283     }
1284
1285     longPattern->getDynamicClassID();
1286     shortPattern->getDynamicClassID();
1287     if (U_FAILURE(status)) {
1288         errln("Could not get dynamic class ID of BoyerMooreSearch.");
1289         goto close_patterns;
1290     }
1291
1292     longPattern->getStaticClassID();
1293     shortPattern->getStaticClassID();
1294     if (U_FAILURE(status)) {
1295         errln("Could not get static class ID of BoyerMooreSearch.");
1296         goto close_patterns;
1297     }
1298
1299     longPattern->getData();
1300     shortPattern->getData();
1301     if (U_FAILURE(status)) {
1302         errln("Could not get collate data.");
1303         goto close_patterns;
1304     }
1305
1306     ce = longPattern->getPatternCEs();
1307     ce1 = shortPattern->getPatternCEs();
1308     if (U_FAILURE(status)) {
1309         errln("Could not get pattern CEs.");
1310         goto close_patterns;
1311     }
1312
1313     ce->getDynamicClassID();
1314     ce1->getDynamicClassID();
1315     if (U_FAILURE(status)) {
1316         errln("Could not get dynamic class ID of CEList.");
1317         goto close_patterns;
1318     }
1319
1320     ce->getStaticClassID();
1321     ce1->getStaticClassID();
1322     if (U_FAILURE(status)) {
1323         errln("Could not get static class ID of CEList.");
1324         goto close_patterns;
1325     }
1326
1327     if(data->minLengthInChars(ce,0) != 3){
1328         errln("Minimal Length in Characters for 'data' with 'ce' was suppose to give 3.");
1329         goto close_patterns;
1330     }
1331
1332     if(data->minLengthInChars(ce1,0) != 3){
1333         errln("Minimal Length in Characters for 'data' with 'ce1' was suppose to give 3.");
1334         goto close_patterns;
1335     }
1336
1337     for (uint32_t t = 0; t < (sizeof(targets)/sizeof(targets[0])); t += 1) {
1338         UnicodeString target = targets[t].unescape();
1339
1340         longPattern->setTargetString(&target, status);
1341         if (longPattern->search(0, start, end)) {
1342             logln("Test %d: found long pattern at [%d, %d].", t, start, end);
1343         } else {
1344             errln("Test %d: did not find long pattern.", t);
1345         }
1346
1347         shortPattern->setTargetString(&target, status);
1348         if (shortPattern->search(0, start, end)) {
1349             logln("Test %d: found short pattern at [%d, %d].", t, start, end);
1350         } else {
1351             errln("Test %d: did not find short pattern.", t);
1352         }
1353
1354         if(longPattern->empty()){
1355             errln("Test %d: Long pattern should not have been empty.");
1356         }
1357
1358         if(shortPattern->empty()){
1359             errln("Test %d: Short pattern should not have been empty.");
1360         }
1361     }
1362
1363 close_patterns:
1364     delete shortPattern;
1365     delete longPattern;
1366
1367 close_data:
1368     CollData::close(data);
1369     ucol_close(coll);
1370 }
1371
1372 void SSearchTest::bmsTest()
1373 {
1374     UErrorCode status = U_ZERO_ERROR;
1375     UCollator *coll = NULL;
1376     UCD *data = NULL;
1377     UnicodeString lp  = "fuss";
1378     UnicodeString lpu = lp.unescape();
1379     UnicodeString sp  = "fu\\u00DF";
1380     UnicodeString spu = sp.unescape();
1381     BMS *longPattern = NULL;
1382     BMS *shortPattern = NULL;
1383     UnicodeString targets[]  = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball",
1384                                 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF",
1385                                 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"};
1386     int32_t start = -1, end = -1;
1387
1388     coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status);
1389     if (U_FAILURE(status)) {
1390         errcheckln(status, "Could not open collator. - %s", u_errorName(status));
1391         return;
1392     }
1393
1394     data = ucd_open(coll, &status);
1395     if (U_FAILURE(status)) {
1396         errln("Could not open CollData object.");
1397         goto close_data;
1398     }
1399
1400     longPattern = bms_open(data, lpu.getBuffer(), lpu.length(), NULL, 0, &status);
1401     shortPattern = bms_open(data, spu.getBuffer(), spu.length(), NULL, 0, &status);
1402     if (U_FAILURE(status)) {
1403         errln("Couldn't open pattern objects.");
1404         goto close_patterns;
1405     }
1406
1407     for (uint32_t t = 0; t < (sizeof(targets)/sizeof(targets[0])); t += 1) {
1408         UnicodeString target = targets[t].unescape();
1409
1410         bms_setTargetString(longPattern, target.getBuffer(), target.length(), &status);
1411         if (bms_search(longPattern, 0, &start, &end)) {
1412             logln("Test %d: found long pattern at [%d, %d].", t, start, end);
1413         } else {
1414             errln("Test %d: did not find long pattern.", t);
1415         }
1416
1417         bms_setTargetString(shortPattern, target.getBuffer(), target.length(), &status);
1418         if (bms_search(shortPattern, 0, &start, &end)) {
1419             logln("Test %d: found short pattern at [%d, %d].", t, start, end);
1420         } else {
1421             errln("Test %d: did not find short pattern.", t);
1422         }
1423     }
1424
1425     /* Add better coverage for bms code. */
1426     if(bms_empty(longPattern)) {
1427         errln("FAIL: longgPattern is empty.");
1428     }
1429
1430     if (!bms_getData(longPattern)) {
1431         errln("FAIL: bms_getData returned NULL.");
1432     }
1433
1434     if (!ucd_getCollator(data)) {
1435         errln("FAIL: ucd_getCollator returned NULL.");
1436     }
1437
1438 close_patterns:
1439     bms_close(shortPattern);
1440     bms_close(longPattern);
1441
1442 close_data:
1443     ucd_close(data);
1444     ucd_freeCache();
1445     ucol_close(coll);
1446 }
1447
1448 void SSearchTest::goodSuffixTest()
1449 {
1450     UErrorCode status = U_ZERO_ERROR;
1451     UCollator *coll = NULL;
1452     CollData *data = NULL;
1453     UnicodeString pat = /*"gcagagag"*/ "fxeld";
1454     UnicodeString target = /*"gcatcgcagagagtatacagtacg"*/ "cloveldfxeld";
1455     BoyerMooreSearch *pattern = NULL;
1456     int32_t start = -1, end = -1;
1457
1458     coll = ucol_open(NULL, &status);
1459     if (U_FAILURE(status)) {
1460         errcheckln(status, "Couldn't open collator. - %s", u_errorName(status));
1461         return;
1462     }
1463
1464     data = CollData::open(coll, status);
1465     if (U_FAILURE(status)) {
1466         errln("Couldn't open CollData object.");
1467         goto close_data;
1468     }
1469
1470     pattern = new BoyerMooreSearch(data, pat, &target, status);
1471     if (U_FAILURE(status)) {
1472         errln("Couldn't open pattern object.");
1473         goto close_pattern;
1474     }
1475
1476     if (pattern->search(0, start, end)) {
1477         logln("Found pattern at [%d, %d].", start, end);
1478     } else {
1479         errln("Did not find pattern.");
1480     }
1481
1482 close_pattern:
1483     delete pattern;
1484
1485 close_data:
1486     CollData::close(data);
1487     ucol_close(coll);
1488 }
1489
1490 //
1491 //  searchTime()    A quick and dirty performance test for string search.
1492 //                  Probably  doesn't really belong as part of intltest, but it
1493 //                  does check that the search succeeds, and gets the right result,
1494 //                  so it serves as a functionality test also.
1495 //
1496 //                  To run as a perf test, up the loop count, select by commenting
1497 //                  and uncommenting in the code the operation to be measured,
1498 //                  rebuild, and measure the running time of this test alone.
1499 //
1500 //                     time LD_LIBRARY_PATH=whatever  ./intltest  collate/SSearchTest/searchTime
1501 //
1502 void SSearchTest::searchTime() {
1503     static const char *longishText =
1504 "Whylom, as olde stories tellen us,\n"
1505 "Ther was a duk that highte Theseus:\n"
1506 "Of Athenes he was lord and governour,\n"
1507 "And in his tyme swich a conquerour,\n"
1508 "That gretter was ther noon under the sonne.\n"
1509 "Ful many a riche contree hadde he wonne;\n"
1510 "What with his wisdom and his chivalrye,\n"
1511 "He conquered al the regne of Femenye,\n"
1512 "That whylom was y-cleped Scithia;\n"
1513 "And weddede the quene Ipolita,\n"
1514 "And broghte hir hoom with him in his contree\n"
1515 "With muchel glorie and greet solempnitee,\n"
1516 "And eek hir yonge suster Emelye.\n"
1517 "And thus with victorie and with melodye\n"
1518 "Lete I this noble duk to Athenes ryde,\n"
1519 "And al his hoost, in armes, him bisyde.\n"
1520 "And certes, if it nere to long to here,\n"
1521 "I wolde han told yow fully the manere,\n"
1522 "How wonnen was the regne of Femenye\n"
1523 "By Theseus, and by his chivalrye;\n"
1524 "And of the grete bataille for the nones\n"
1525 "Bitwixen Athen's and Amazones;\n"
1526 "And how asseged was Ipolita,\n"
1527 "The faire hardy quene of Scithia;\n"
1528 "And of the feste that was at hir weddinge,\n"
1529 "And of the tempest at hir hoom-cominge;\n"
1530 "But al that thing I moot as now forbere.\n"
1531 "I have, God woot, a large feeld to ere,\n"
1532 "And wayke been the oxen in my plough.\n"
1533 "The remenant of the tale is long y-nough.\n"
1534 "I wol nat letten eek noon of this route;\n"
1535 "Lat every felawe telle his tale aboute,\n"
1536 "And lat see now who shal the soper winne;\n"
1537 "And ther I lefte, I wol ageyn biginne.\n"
1538 "This duk, of whom I make mencioun,\n"
1539 "When he was come almost unto the toun,\n"
1540 "In al his wele and in his moste pryde,\n"
1541 "He was war, as he caste his eye asyde,\n"
1542 "Wher that ther kneled in the hye weye\n"
1543 "A companye of ladies, tweye and tweye,\n"
1544 "Ech after other, clad in clothes blake; \n"
1545 "But swich a cry and swich a wo they make,\n"
1546 "That in this world nis creature livinge,\n"
1547 "That herde swich another weymentinge;\n"
1548 "And of this cry they nolde never stenten,\n"
1549 "Til they the reynes of his brydel henten.\n"
1550 "'What folk ben ye, that at myn hoomcominge\n"
1551 "Perturben so my feste with cryinge'?\n"
1552 "Quod Theseus, 'have ye so greet envye\n"
1553 "Of myn honour, that thus compleyne and crye? \n"
1554 "Or who hath yow misboden, or offended?\n"
1555 "And telleth me if it may been amended;\n"
1556 "And why that ye ben clothed thus in blak'?\n"
1557 "The eldest lady of hem alle spak,\n"
1558 "When she hadde swowned with a deedly chere,\n"
1559 "That it was routhe for to seen and here,\n"
1560 "And seyde: 'Lord, to whom Fortune hath yiven\n"
1561 "Victorie, and as a conquerour to liven,\n"
1562 "Noght greveth us your glorie and your honour;\n"
1563 "But we biseken mercy and socour.\n"
1564 "Have mercy on our wo and our distresse.\n"
1565 "Som drope of pitee, thurgh thy gentilesse,\n"
1566 "Up-on us wrecched wommen lat thou falle.\n"
1567 "For certes, lord, ther nis noon of us alle,\n"
1568 "That she nath been a duchesse or a quene;\n"
1569 "Now be we caitifs, as it is wel sene:\n"
1570 "Thanked be Fortune, and hir false wheel,\n"
1571 "That noon estat assureth to be weel.\n"
1572 "And certes, lord, t'abyden your presence,\n"
1573 "Here in the temple of the goddesse Clemence\n"
1574 "We han ben waytinge al this fourtenight;\n"
1575 "Now help us, lord, sith it is in thy might.\n"
1576 "I wrecche, which that wepe and waille thus,\n"
1577 "Was whylom wyf to king Capaneus,\n"
1578 "That starf at Thebes, cursed be that day!\n"
1579 "And alle we, that been in this array,\n"
1580 "And maken al this lamentacioun,\n"
1581 "We losten alle our housbondes at that toun,\n"
1582 "Whyl that the sege ther-aboute lay.\n"
1583 "And yet now th'olde Creon, weylaway!\n"
1584 "The lord is now of Thebes the citee, \n"
1585 "Fulfild of ire and of iniquitee,\n"
1586 "He, for despyt, and for his tirannye,\n"
1587 "To do the dede bodyes vileinye,\n"
1588 "Of alle our lordes, whiche that ben slawe,\n"
1589 "Hath alle the bodyes on an heep y-drawe,\n"
1590 "And wol nat suffren hem, by noon assent,\n"
1591 "Neither to been y-buried nor y-brent,\n"
1592 "But maketh houndes ete hem in despyt. zet'\n";
1593
1594 #define TEST_BOYER_MOORE 1
1595 const char *cPattern = "maketh houndes ete hem";
1596 //const char *cPattern = "Whylom";
1597 //const char *cPattern = "zet";
1598     const char *testId = "searchTime()";   // for error macros.
1599     UnicodeString target = longishText;
1600     UErrorCode status = U_ZERO_ERROR;
1601
1602
1603     LocalUCollatorPointer collator(ucol_open("en", &status));
1604     CollData *data = CollData::open(collator.getAlias(), status);
1605     if (U_FAILURE(status) || collator.isNull() || data == NULL) {
1606         errcheckln(status, "Unable to open UCollator or CollData. - %s", u_errorName(status));
1607         return;
1608     }
1609     //ucol_setStrength(collator.getAlias(), collatorStrength);
1610     //ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status);
1611     UnicodeString uPattern = cPattern;
1612 #ifndef TEST_BOYER_MOORE
1613     LocalUStringSearchPointer uss(usearch_openFromCollator(uPattern.getBuffer(), uPattern.length(),
1614                                                            target.getBuffer(), target.length(),
1615                                                            collator.getAlias(),
1616                                                            NULL,     // the break iterator
1617                                                            &status));
1618     TEST_ASSERT_SUCCESS(status);
1619 #else
1620     BoyerMooreSearch bms(data, uPattern, &target, status);
1621     TEST_ASSERT_SUCCESS(status);
1622 #endif
1623
1624 //  int32_t foundStart;
1625 //  int32_t foundEnd;
1626     UBool   found;
1627
1628     // Find the match position usgin strstr
1629     const char *pm = strstr(longishText, cPattern);
1630     TEST_ASSERT_M(pm!=NULL, "No pattern match with strstr");
1631     int32_t  refMatchPos = (int32_t)(pm - longishText);
1632     int32_t  icuMatchPos;
1633     int32_t  icuMatchEnd;
1634 #ifndef TEST_BOYER_MOORE
1635     usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status);
1636     TEST_ASSERT_SUCCESS(status);
1637 #else
1638     found = bms.search(0, icuMatchPos, icuMatchEnd);
1639 #endif
1640     TEST_ASSERT_M(refMatchPos == icuMatchPos, "strstr and icu give different match positions.");
1641
1642     int32_t i;
1643     // int32_t j=0;
1644
1645     // Try loopcounts around 100000 to some millions, depending on the operation,
1646     //   to get runtimes of at least several seconds.
1647     for (i=0; i<10000; i++) {
1648 #ifndef TEST_BOYER_MOORE
1649         found = usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status);
1650 #else
1651         found = bms.search(0, icuMatchPos, icuMatchEnd);
1652 #endif
1653         //TEST_ASSERT_SUCCESS(status);
1654         //TEST_ASSERT(found);
1655
1656         // usearch_setOffset(uss.getAlias(), 0, &status);
1657         // icuMatchPos = usearch_next(uss.getAlias(), &status);
1658
1659          // The i+j stuff is to confuse the optimizer and get it to actually leave the
1660          //   call to strstr in place.
1661          //pm = strstr(longishText+j, cPattern);
1662          //j = (j + i)%5;
1663     }
1664
1665     //printf("%ld, %d\n", pm-longishText, j);
1666 #ifdef TEST_BOYER_MOORE
1667     CollData::close(data);
1668 #endif
1669 }
1670 #endif
1671
1672 //----------------------------------------------------------------------------------------
1673 //
1674 //   Random Numbers.  Similar to standard lib rand() and srand()
1675 //                    Not using library to
1676 //                      1.  Get same results on all platforms.
1677 //                      2.  Get access to current seed, to more easily reproduce failures.
1678 //
1679 //---------------------------------------------------------------------------------------
// Current generator state; m_rand() reads and updates it.
static uint32_t m_seed = 1;

// Advances the generator by one step and returns a value in [0, 32767].
static uint32_t m_rand()
{
    const uint32_t multiplier = 1103515245;
    const uint32_t increment  = 12345;

    m_seed = m_seed * multiplier + increment;

    // Discard the low 16 bits of the state and keep the next 15.
    return (m_seed >> 16) & 0x7FFF;
}
1687
// Abstract base for the random test-data generators used below.
// Each concrete monkey appends a randomly chosen piece of text to 'test'
// and a corresponding piece to 'alternate'.
class Monkey
{
public:
    // Appends one randomly chosen piece to both strings.
    virtual void append(UnicodeString &test, UnicodeString &alternate) = 0;

protected:
    // Construction and destruction are reserved for subclasses.
    Monkey();
    virtual ~Monkey();
};

// Nothing to initialize.
Monkey::Monkey()
{
    // ook?
}

// Nothing to release.
Monkey::~Monkey()
{
    // ook?
}
1707
// A Monkey that draws single code points from a USet.
class SetMonkey : public Monkey
{
public:
    // 'theSet' is stored as a pointer only; the destructor does not free it.
    SetMonkey(const USet *theSet);
    ~SetMonkey();

    // Appends the same randomly chosen code point to both strings.
    virtual void append(UnicodeString &test, UnicodeString &alternate);

private:
    const USet *set;   // source of code points
};

// Remembers the set to draw from.
SetMonkey::SetMonkey(const USet *theSet)
    : Monkey(), set(theSet)
{
    // ook?
}

// Nothing to release here.
SetMonkey::~SetMonkey()
{
    //ook...
}
1730
1731 void SetMonkey::append(UnicodeString &test, UnicodeString &alternate)
1732 {
1733     int32_t size = uset_size(set);
1734     int32_t index = m_rand() % size;
1735     UChar32 ch = uset_charAt(set, index);
1736     UnicodeString str(ch);
1737
1738     test.append(str);
1739     alternate.append(str); // flip case, or some junk?
1740 }
1741
// A Monkey that draws items (code point ranges or strings) from a USet and,
// for the alternate string, tries to substitute different text that yields
// the same collation elements.
class StringSetMonkey : public Monkey
{
public:
    // The three arguments are stored as pointers only; the destructor does
    // not free them.
    StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData);
    ~StringSetMonkey();

    // Appends a random item of the set to 'testCase' and an alternative
    // spelling of it (or the item itself) to 'alternate'.
    void append(UnicodeString &testCase, UnicodeString &alternate);

private:
    // Appends to 'alternate' either a string built from CollData's string
    // lists whose collation elements match those of 'testCase', or
    // 'testCase' itself when no such string is found.
    UnicodeString &generateAlternative(const UnicodeString &testCase, UnicodeString &alternate);

    const USet *set;        // source of items
    UCollator  *coll;       // collator used to compute CE lists
    CollData   *collData;   // lookup of strings by collation element
};

// Remembers the set, collator and collation data to work with.
StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData)
: Monkey(), set(theSet), coll(theCollator), collData(theCollData)
{
    // ook.
}

// Nothing to release here.
StringSetMonkey::~StringSetMonkey()
{
    // ook?
}
1768
1769 void StringSetMonkey::append(UnicodeString &testCase, UnicodeString &alternate)
1770 {
1771     int32_t itemCount = uset_getItemCount(set), len = 0;
1772     int32_t index = m_rand() % itemCount;
1773     UChar32 rangeStart = 0, rangeEnd = 0;
1774     UChar buffer[16];
1775     UErrorCode err = U_ZERO_ERROR;
1776
1777     len = uset_getItem(set, index, &rangeStart, &rangeEnd, buffer, 16, &err);
1778
1779     if (len == 0) {
1780         int32_t offset = m_rand() % (rangeEnd - rangeStart + 1);
1781         UChar32 ch = rangeStart + offset;
1782         UnicodeString str(ch);
1783
1784         testCase.append(str);
1785         generateAlternative(str, alternate);
1786     } else if (len > 0) {
1787         // should check that len < 16...
1788         UnicodeString str(buffer, len);
1789
1790         testCase.append(str);
1791         generateAlternative(str, alternate);
1792     } else {
1793         // shouldn't happen...
1794     }
1795 }
1796
1797 UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCase, UnicodeString &alternate)
1798 {
1799     // find out shortest string for the longest sequence of ces.
1800     // needs to be refined to use dynamic programming, but will be roughly right
1801     UErrorCode status = U_ZERO_ERROR;
1802     CEList ceList(coll, testCase, status);
1803     UnicodeString alt;
1804     int32_t offset = 0;
1805
1806     if (ceList.size() == 0) {
1807         return alternate.append(testCase);
1808     }
1809
1810     while (offset < ceList.size()) {
1811         int32_t ce = ceList.get(offset);
1812         const StringList *strings = collData->getStringList(ce);
1813
1814         if (strings == NULL) {
1815             return alternate.append(testCase);
1816         }
1817
1818         int32_t stringCount = strings->size();
1819         int32_t tries = 0;
1820
1821         // find random string that generates the same CEList
1822         const CEList *ceList2 = NULL;
1823         const UnicodeString *string = NULL;
1824               UBool matches = FALSE;
1825
1826         do {
1827             int32_t s = m_rand() % stringCount;
1828
1829             if (tries++ > stringCount) {
1830                 alternate.append(testCase);
1831                 return alternate;
1832             }
1833
1834             string = strings->get(s);
1835             ceList2 = collData->getCEList(string);
1836             matches = ceList.matchesAt(offset, ceList2);
1837
1838             if (! matches) {
1839                 collData->freeCEList((CEList *) ceList2);
1840             }
1841         } while (! matches);
1842
1843         alt.append(*string);
1844         offset += ceList2->size();
1845         collData->freeCEList(ceList2);
1846     }
1847
1848     const CEList altCEs(coll, alt, status);
1849
1850     if (ceList.matchesAt(0, &altCEs)) {
1851         return alternate.append(alt);
1852     }
1853
1854     return alternate.append(testCase);
1855 }
1856
1857 static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyCount, UnicodeString &testCase, UnicodeString &alternate)
1858 {
1859     int32_t pieces = (m_rand() % 4) + 1;
1860     UErrorCode status = U_ZERO_ERROR;
1861     UBool matches;
1862
1863     do {
1864         testCase.remove();
1865         alternate.remove();
1866         monkeys[0]->append(testCase, alternate);
1867
1868         for(int32_t piece = 0; piece < pieces; piece += 1) {
1869             int32_t monkey = m_rand() % monkeyCount;
1870
1871             monkeys[monkey]->append(testCase, alternate);
1872         }
1873
1874         const CEList ceTest(coll, testCase, status);
1875         const CEList ceAlt(coll, alternate, status);
1876
1877         matches = ceTest.matchesAt(0, &ceAlt);
1878     } while (! matches);
1879 }
1880
//
//  Returns the next acceptable boundary following startIndex in the target
//  text being searched.  A startIndex at or past the end of the text is
//  returned unchanged.
//      TODO:  refine what is an acceptable boundary.  For the moment,
//             choose the next position not within a combining sequence.
//
//  This function is currently compiled out.
//
#if 0
static int32_t nextBoundaryAfter(const UnicodeString &string, int32_t startIndex) {
    const int32_t textLen = string.length();

    if (startIndex >= textLen) {
        return startIndex;
    }

    const UChar *text = string.getBuffer();
    int32_t      pos  = startIndex;
    UChar32      c;

    U16_NEXT(text, pos, textLen, c);

    // Control characters do not combine, so the boundary is right after one.
    int32_t gcb = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);

    switch (gcb) {
    case U_GCB_CONTROL:
    case U_GCB_LF:
    case U_GCB_CR:
        return pos;

    default:
        break;
    }

    // Anything else can take trailing combining characters: skip over all
    // of them.  'boundary' trails 'pos' by one character while scanning.
    int32_t boundary = pos;

    while (pos < textLen) {
        U16_NEXT(text, pos, textLen, c);
        gcb = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);

        if (gcb != U_GCB_EXTEND && gcb != U_GCB_SPACING_MARK) {
            break;
        }

        boundary = pos;
    }

    return boundary;
}
#endif
1930
1931 #if 0
1932 static UBool isInCombiningSequence(const UnicodeString &string, int32_t index) {
1933     const UChar *text = string.getBuffer();
1934     int32_t textLen   = string.length();
1935
1936     if (index>=textLen || index<=0) {
1937         return FALSE;
1938     }
1939
1940     // If the character at the current index is not a GRAPHEME_EXTEND
1941     //    then we can not be within a combining sequence.
1942     UChar32  c;
1943     U16_GET(text, 0, index, textLen, c);
1944     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1945     if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
1946         return FALSE;
1947     }
1948
1949     // We are at a combining mark.  If the preceding character is anything
1950     //   except a CONTROL, CR or LF, we are in a combining sequence.
1951     U16_PREV(text, 0, index, c);
1952     gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1953
1954     return !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
1955 }
1956 #endif
1957
1958 static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd)
1959 {
1960     UErrorCode      status = U_ZERO_ERROR;
1961     OrderList       targetOrders(coll, target, offset);
1962     OrderList       patternOrders(coll, pattern);
1963     int32_t         targetSize  = targetOrders.size() - 1;
1964     int32_t         patternSize = patternOrders.size() - 1;
1965     UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
1966                                                   target.getBuffer(), target.length(), &status);
1967
1968     if (patternSize == 0) {
1969         // Searching for an empty pattern always fails
1970         matchStart = matchEnd = -1;
1971         ubrk_close(charBreakIterator);
1972         return FALSE;
1973     }
1974
1975     matchStart = matchEnd = -1;
1976
1977     for(int32_t i = 0; i < targetSize; i += 1) {
1978         if (targetOrders.matchesAt(i, patternOrders)) {
1979             int32_t start    = targetOrders.getLowOffset(i);
1980             int32_t maxLimit = targetOrders.getLowOffset(i + patternSize);
1981             int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1);
1982
1983             // if the low and high offsets of the first CE in
1984             // the match are the same, it means that the match
1985             // starts in the middle of an expansion - all but
1986             // the first CE of the expansion will have the offset
1987             // of the following character.
1988             if (start == targetOrders.getHighOffset(i)) {
1989                 continue;
1990             }
1991
1992             // Make sure match starts on a grapheme boundary
1993             if (! ubrk_isBoundary(charBreakIterator, start)) {
1994                 continue;
1995             }
1996
1997             // If the low and high offsets of the CE after the match
1998             // are the same, it means that the match ends in the middle
1999             // of an expansion sequence.
2000             if (maxLimit == targetOrders.getHighOffset(i + patternSize) &&
2001                 targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) {
2002                 continue;
2003             }
2004
2005             int32_t mend = maxLimit;
2006
2007             // Find the first grapheme break after the character index
2008             // of the last CE in the match. If it's after character index
2009             // that's after the last CE in the match, use that index
2010             // as the end of the match.
2011             if (minLimit < maxLimit) {
2012                 // When the last CE's low index is same with its high index, the CE is likely
2013                 // a part of expansion. In this case, the index is located just after the
2014                 // character corresponding to the CEs compared above. If the index is right
2015                 // at the break boundary, move the position to the next boundary will result
2016                 // incorrect match length when there are ignorable characters exist between
2017                 // the position and the next character produces CE(s). See ticket#8482.
2018                 if (minLimit == targetOrders.getHighOffset(i + patternSize - 1) && ubrk_isBoundary(charBreakIterator, minLimit)) {
2019                     mend = minLimit;
2020                 } else {
2021                     int32_t nba = ubrk_following(charBreakIterator, minLimit);
2022
2023                     if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) {
2024                         mend = nba;
2025                     }
2026                 }
2027             }
2028
2029             if (mend > maxLimit) {
2030                 continue;
2031             }
2032
2033             if (! ubrk_isBoundary(charBreakIterator, mend)) {
2034                 continue;
2035             }
2036
2037             matchStart = start;
2038             matchEnd   = mend;
2039
2040             ubrk_close(charBreakIterator);
2041             return TRUE;
2042         }
2043     }
2044
2045     ubrk_close(charBreakIterator);
2046     return FALSE;
2047 }
2048
2049 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2050 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
2051     int32_t val = defaultVal;
2052
2053     name.append(" *= *(-?\\d+)");
2054
2055     UErrorCode status = U_ZERO_ERROR;
2056     RegexMatcher m(name, params, 0, status);
2057
2058     if (m.find()) {
2059         // The param exists.  Convert the string to an int.
2060         char valString[100];
2061         int32_t paramLength = m.end(1, status) - m.start(1, status);
2062
2063         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
2064             paramLength = (int32_t)(sizeof(valString)-2);
2065         }
2066
2067         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
2068         val = strtol(valString,  NULL, 10);
2069
2070         // Delete this parameter from the params string.
2071         m.reset();
2072         params = m.replaceFirst("", status);
2073     }
2074
2075   //U_ASSERT(U_SUCCESS(status));
2076     if (! U_SUCCESS(status)) {
2077         val = defaultVal;
2078     }
2079
2080     return val;
2081 }
2082 #endif
2083
2084 #if !UCONFIG_NO_COLLATION
2085 int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
2086                                     const char *name, const char *strength, uint32_t seed)
2087 {
2088     UErrorCode status = U_ZERO_ERROR;
2089     int32_t actualStart = -1, actualEnd = -1;
2090   //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
2091     int32_t expectedStart = -1, expectedEnd = -1;
2092     int32_t notFoundCount = 0;
2093     LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
2094                                                            testCase.getBuffer(), testCase.length(),
2095                                                            coll,
2096                                                            NULL,     // the break iterator
2097                                                            &status));
2098
2099     // **** TODO: find *all* matches, not just first one ****
2100     simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd);
2101
2102     usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status);
2103
2104     if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
2105         errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
2106               "    strength=%s seed=%d",
2107               name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
2108     }
2109
2110     if (expectedStart == -1 && actualStart == -1) {
2111         notFoundCount += 1;
2112     }
2113
2114     // **** TODO: find *all* matches, not just first one ****
2115     simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd);
2116
2117     usearch_setPattern(uss.getAlias(), altPattern.getBuffer(), altPattern.length(), &status);
2118
2119     usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status);
2120
2121     if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
2122         errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
2123               "    strength=%s seed=%d",
2124               name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
2125     }
2126
2127     if (expectedStart == -1 && actualStart == -1) {
2128         notFoundCount += 1;
2129     }
2130
2131     return notFoundCount;
2132 }
2133
2134 static void hexForUnicodeString(const UnicodeString &ustr, char * cbuf, int32_t cbuflen)
2135 {
2136     int32_t ustri, ustrlen = ustr.length();
2137
2138     for (ustri = 0; ustri < ustrlen; ++ustri) {
2139         if (cbuflen >= 9 /* format width for single code unit(5) + terminating ellipsis(3) + null(1) */) {
2140             int len = sprintf(cbuf, " %04X", ustr.charAt(ustri));
2141             cbuflen -= len;
2142             cbuf += len;
2143         } else {
2144             if (cbuflen >= 4 /* terminating ellipsis(3) + null(1) */) {
2145                 sprintf(cbuf, "...");
2146             } else if (cbuflen >= 1) {
2147                 cbuf = 0;
2148             }
2149             break;
2150         }
2151     }
2152 }
2153
2154 int32_t SSearchTest::bmMonkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
2155                                     BoyerMooreSearch *bms, BoyerMooreSearch *abms,
2156                                     const char *name, const char *strength, uint32_t seed)
2157 {
2158     UErrorCode status = U_ZERO_ERROR;
2159     int32_t actualStart = -1, actualEnd = -1;
2160   //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
2161     int32_t expectedStart = -1, expectedEnd = -1;
2162     int32_t notFoundCount = 0;
2163     char    hexbuf[128];
2164
2165     // **** TODO: find *all* matches, not just first one ****
2166     simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd);
2167
2168     bms->setTargetString(&testCase, status);
2169     bms->search(0, actualStart, actualEnd);
2170
2171     if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
2172         hexForUnicodeString(pattern, hexbuf, sizeof(hexbuf));
2173         errln("Boyer-Moore Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
2174               "    strength=%s seed=%d <pattern>: %s",
2175               name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed, hexbuf);
2176     }
2177
2178     if (expectedStart == -1 && actualStart == -1) {
2179         notFoundCount += 1;
2180     }
2181
2182     // **** TODO: find *all* matches, not just first one ****
2183     simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd);
2184
2185     abms->setTargetString(&testCase, status);
2186     abms->search(0, actualStart, actualEnd);
2187
2188     if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
2189         hexForUnicodeString(altPattern, hexbuf, sizeof(hexbuf));
2190         errln("Boyer-Moore Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
2191               "    strength=%s seed=%d <pattern>: %s",
2192               name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed, hexbuf);
2193     }
2194
2195     if (expectedStart == -1 && actualStart == -1) {
2196         notFoundCount += 1;
2197     }
2198
2199
2200     return notFoundCount;
2201 }
2202 #endif
2203
2204 void SSearchTest::monkeyTest(char *params)
2205 {
2206     // ook!
2207     UErrorCode status = U_ZERO_ERROR;
2208   //UCollator *coll = ucol_open(NULL, &status);
2209     UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status);
2210
2211     if (U_FAILURE(status)) {
2212         errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status));
2213         return;
2214     }
2215
2216     CollData  *monkeyData = CollData::open(coll, status);
2217
2218     USet *expansions   = uset_openEmpty();
2219     USet *contractions = uset_openEmpty();
2220
2221     ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
2222
2223     U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
2224     U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
2225     USet *letters = uset_openPattern(letter_pattern, 39, &status);
2226     SetMonkey letterMonkey(letters);
2227     StringSetMonkey contractionMonkey(contractions, coll, monkeyData);
2228     StringSetMonkey expansionMonkey(expansions, coll, monkeyData);
2229     UnicodeString testCase;
2230     UnicodeString alternate;
2231     UnicodeString pattern, altPattern;
2232     UnicodeString prefix, altPrefix;
2233     UnicodeString suffix, altSuffix;
2234
2235     Monkey *monkeys[] = {
2236         &letterMonkey,
2237         &contractionMonkey,
2238         &expansionMonkey,
2239         &contractionMonkey,
2240         &expansionMonkey,
2241         &contractionMonkey,
2242         &expansionMonkey,
2243         &contractionMonkey,
2244         &expansionMonkey};
2245     int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]);
2246     // int32_t nonMatchCount = 0;
2247
2248     UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY};
2249     const char *strengthNames[] = {"primary", "secondary", "tertiary"};
2250     int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]);
2251     int32_t loopCount = quick? 1000 : 10000;
2252     int32_t firstStrength = 0;
2253     int32_t lastStrength  = strengthCount - 1; //*/ 0;
2254
2255     if (params != NULL) {
2256 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2257         UnicodeString p(params);
2258
2259         loopCount = getIntParam("loop", p, loopCount);
2260         m_seed    = getIntParam("seed", p, m_seed);
2261
2262         RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status);
2263         if (m.find()) {
2264             UnicodeString breakType = m.group(1, status);
2265
2266             for (int32_t s = 0; s < strengthCount; s += 1) {
2267                 if (breakType == strengthNames[s]) {
2268                     firstStrength = lastStrength = s;
2269                     break;
2270                 }
2271             }
2272
2273             m.reset();
2274             p = m.replaceFirst("", status);
2275         }
2276
2277         if (RegexMatcher("\\S", p, 0, status).find()) {
2278             // Each option is stripped out of the option string as it is processed.
2279             // All options have been checked.  The option string should have been completely emptied..
2280             char buf[100];
2281             p.extract(buf, sizeof(buf), NULL, status);
2282             buf[sizeof(buf)-1] = 0;
2283             errln("Unrecognized or extra parameter:  %s\n", buf);
2284             return;
2285         }
2286 #else
2287         infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
2288 #endif
2289     }
2290
2291     for(int32_t s = firstStrength; s <= lastStrength; s += 1) {
2292         int32_t notFoundCount = 0;
2293
2294         logln("Setting strength to %s.", strengthNames[s]);
2295         ucol_setStrength(coll, strengths[s]);
2296
2297         // TODO: try alternate prefix and suffix too?
2298         // TODO: alterntaes are only equal at primary strength. Is this OK?
2299         for(int32_t t = 0; t < loopCount; t += 1) {
2300             uint32_t seed = m_seed;
2301             // int32_t  nmc = 0;
2302
2303             generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern);
2304             generateTestCase(coll, monkeys, monkeyCount, prefix,  altPrefix);
2305             generateTestCase(coll, monkeys, monkeyCount, suffix,  altSuffix);
2306
2307             // pattern
2308             notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern, "pattern", strengthNames[s], seed);
2309
2310             testCase.remove();
2311             testCase.append(prefix);
2312             testCase.append(/*alt*/pattern);
2313
2314             // prefix + pattern
2315             notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern", strengthNames[s], seed);
2316
2317             testCase.append(suffix);
2318
2319             // prefix + pattern + suffix
2320             notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern + suffix", strengthNames[s], seed);
2321
2322             testCase.remove();
2323             testCase.append(pattern);
2324             testCase.append(suffix);
2325
2326             // pattern + suffix
2327             notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed);
2328         }
2329
2330        logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount);
2331     }
2332
2333     uset_close(contractions);
2334     uset_close(expansions);
2335     uset_close(letters);
2336
2337     CollData::close(monkeyData);
2338
2339     ucol_close(coll);
2340 }
2341
2342 void SSearchTest::bmMonkeyTest(char *params)
2343 {
2344     static const UChar skipChars[] = { 0x0E40, 0x0E41, 0x0E42, 0x0E43, 0x0E44, 0xAAB5, 0xAAB6, 0xAAB9, 0xAABB, 0xAABC, 0 }; // for timebomb
2345     // ook!
2346     UErrorCode status = U_ZERO_ERROR;
2347     UCollator *coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status);
2348
2349     if (U_FAILURE(status)) {
2350         errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status));
2351         return;
2352     }
2353
2354     CollData  *monkeyData = CollData::open(coll, status);
2355
2356     USet *expansions   = uset_openEmpty();
2357     USet *contractions = uset_openEmpty();
2358
2359     ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
2360
2361     U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
2362     U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
2363     USet *letters = uset_openPattern(letter_pattern, 39, &status);
2364     SetMonkey letterMonkey(letters);
2365     StringSetMonkey contractionMonkey(contractions, coll, monkeyData);
2366     StringSetMonkey expansionMonkey(expansions, coll, monkeyData);
2367     UnicodeString testCase;
2368     UnicodeString alternate;
2369     UnicodeString pattern, altPattern;
2370     UnicodeString prefix, altPrefix;
2371     UnicodeString suffix, altSuffix;
2372
2373     Monkey *monkeys[] = {
2374         &letterMonkey,
2375         &contractionMonkey,
2376         &expansionMonkey,
2377         &contractionMonkey,
2378         &expansionMonkey,
2379         &contractionMonkey,
2380         &expansionMonkey,
2381         &contractionMonkey,
2382         &expansionMonkey};
2383     int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]);
2384     // int32_t nonMatchCount = 0;
2385
2386     UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY};
2387     const char *strengthNames[] = {"primary", "secondary", "tertiary"};
2388     int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]);
2389     int32_t loopCount = quick? 1000 : 10000;
2390     int32_t firstStrength = 0;
2391     int32_t lastStrength  = strengthCount - 1; //*/ 0;
2392
2393     if (params != NULL) {
2394 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2395         UnicodeString p(params);
2396
2397         loopCount = getIntParam("loop", p, loopCount);
2398         m_seed    = getIntParam("seed", p, m_seed);
2399
2400         RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status);
2401         if (m.find()) {
2402             UnicodeString breakType = m.group(1, status);
2403
2404             for (int32_t s = 0; s < strengthCount; s += 1) {
2405                 if (breakType == strengthNames[s]) {
2406                     firstStrength = lastStrength = s;
2407                     break;
2408                 }
2409             }
2410
2411             m.reset();
2412             p = m.replaceFirst("", status);
2413         }
2414
2415         if (RegexMatcher("\\S", p, 0, status).find()) {
2416             // Each option is stripped out of the option string as it is processed.
2417             // All options have been checked.  The option string should have been completely emptied..
2418             char buf[100];
2419             p.extract(buf, sizeof(buf), NULL, status);
2420             buf[sizeof(buf)-1] = 0;
2421             errln("Unrecognized or extra parameter:  %s\n", buf);
2422             return;
2423         }
2424 #else
2425         infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
2426 #endif
2427     }
2428
2429     for(int32_t s = firstStrength; s <= lastStrength; s += 1) {
2430         int32_t notFoundCount = 0;
2431
2432         logln("Setting strength to %s.", strengthNames[s]);
2433         ucol_setStrength(coll, strengths[s]);
2434
2435         CollData *data = CollData::open(coll, status);
2436
2437         UnicodeString skipString(skipChars); // for timebomb
2438         UnicodeSet* skipSet = UnicodeSet::createFromAll(skipString); // for timebomb
2439         // TODO: try alternate prefix and suffix too?
2440         // TODO: alterntaes are only equal at primary strength. Is this OK?
2441         for(int32_t t = 0; t < loopCount; t += 1) {
2442             uint32_t seed = m_seed;
2443             // int32_t  nmc = 0;
2444
2445             generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern);
2446             generateTestCase(coll, monkeys, monkeyCount, prefix,  altPrefix);
2447             generateTestCase(coll, monkeys, monkeyCount, suffix,  altSuffix);
2448
2449             if (!isICUVersionAtLeast(50, 0) && skipSet->containsSome(pattern)) {
2450                 continue; // timebomb until ticket #8080 is resolved
2451             }
2452
2453             BoyerMooreSearch pat(data, pattern, NULL, status);
2454             BoyerMooreSearch alt(data, altPattern, NULL, status);
2455
2456             // **** need a better way to deal with this ****
2457 #if 0
2458             if (pat.empty() ||
2459                 alt.empty()) {
2460                     continue;
2461             }
2462 #endif
2463
2464             // pattern
2465             notFoundCount += bmMonkeyTestCase(coll, pattern, pattern, altPattern, &pat, &alt, "pattern", strengthNames[s], seed);
2466
2467             testCase.remove();
2468             testCase.append(prefix);
2469             testCase.append(/*alt*/pattern);
2470
2471             // prefix + pattern
2472             notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "prefix + pattern", strengthNames[s], seed);
2473
2474             testCase.append(suffix);
2475
2476             // prefix + pattern + suffix
2477             notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "prefix + pattern + suffix", strengthNames[s], seed);
2478
2479             testCase.remove();
2480             testCase.append(pattern);
2481             testCase.append(suffix);
2482
2483             // pattern + suffix
2484             notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "pattern + suffix", strengthNames[s], seed);
2485         }
2486         delete skipSet; // for timebomb
2487
2488         CollData::close(data);
2489
2490         logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount);
2491     }
2492
2493     uset_close(contractions);
2494     uset_close(expansions);
2495     uset_close(letters);
2496
2497     CollData::close(monkeyData);
2498
2499     ucol_close(coll);
2500 }
2501
2502 void SSearchTest::stringListTest(){
2503     UErrorCode status = U_ZERO_ERROR;
2504     StringList *sl = new StringList(status);
2505     if(U_FAILURE(status)){
2506         errln("ERROR: stringListTest: Could not start StringList");
2507     }
2508
2509     const UChar chars[] = {
2510             0x0000
2511     };
2512     sl->add(chars, (int32_t) 0, status);
2513     if(U_FAILURE(status)){
2514         errln("ERROR: stringListTest: StringList::add");
2515     }
2516
2517     if(sl->getDynamicClassID() != StringList::getStaticClassID()){
2518         errln("ERROR: stringListTest: getDynamicClassID and getStaticClassID does not match");
2519     }
2520     delete sl;
2521 }
2522
2523 #endif
2524
2525 #endif