icuSources/i18n/bmsearch.cpp

   1 /*
   2  ******************************************************************************
   3  *   Copyright (C) 1996-2011, International Business Machines                 *
   4  *   Corporation and others.  All Rights Reserved.                            *
   5  ******************************************************************************
   6  */
   7
   8 #include "unicode/utypes.h"
   9
  10 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
  11
  12 #include "unicode/unistr.h"
  13 #include "unicode/putil.h"
  14 #include "unicode/usearch.h"
  15
  16 #include "cmemory.h"
  17 #include "unicode/coll.h"
  18 #include "unicode/tblcoll.h"
  19 #include "unicode/coleitr.h"
  20 #include "unicode/ucoleitr.h"
  21
  22 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
  23
  24 #include "unicode/uniset.h"
  25 #include "unicode/uset.h"
  26 #include "unicode/ustring.h"
  27 #include "hash.h"
  28 #include "uhash.h"
  29 #include "ucol_imp.h"
  30 #include "normalizer2impl.h"
  31
  32 #include "unicode/colldata.h"
  33 #include "unicode/bmsearch.h"
  34
  35 U_NAMESPACE_BEGIN
  36
  37 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
  38 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
  39 #define DELETE_ARRAY(array) uprv_free((void *) (array))
  40
  41
  42 struct CEI
  43 {
  44     uint32_t order;
  45     int32_t  lowOffset;
  46     int32_t  highOffset;
  47 };
  48
  49 class Target : public UMemory
  50 {
  51 public:
  52     Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status);
  53     ~Target();
  54
  55     void setTargetString(const UnicodeString *target);
  56
  57     const CEI *nextCE(int32_t offset);
  58     const CEI *prevCE(int32_t offset);
  59
  60     int32_t stringLength();
  61     UChar charAt(int32_t offset);
  62
  63     UBool isBreakBoundary(int32_t offset);
  64     int32_t nextBreakBoundary(int32_t offset);
  65     int32_t nextSafeBoundary(int32_t offset);
  66
  67     UBool isIdentical(UnicodeString &pattern, int32_t start, int32_t end);
  68
  69     void setOffset(int32_t offset);
  70     void setLast(int32_t last);
  71     int32_t getOffset();
  72
  73 private:
  74     CEI *ceb;
  75     int32_t bufferSize;
  76     int32_t bufferMin;
  77     int32_t bufferMax;
  78
  79     uint32_t strengthMask;
  80     UCollationStrength strength;
  81     uint32_t variableTop;
  82     UBool toShift;
  83     UCollator *coll;
  84     const Normalizer2 &nfd;
  85
  86     const UnicodeString *targetString;
  87     const UChar *targetBuffer;
  88     int32_t targetLength;
  89
  90     UCollationElements *elements;
  91     UBreakIterator *charBreakIterator;
  92 };
  93
  94 Target::Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status)
  95     : bufferSize(0), bufferMin(0), bufferMax(0),
  96       strengthMask(0), strength(UCOL_PRIMARY), variableTop(0), toShift(FALSE), coll(theCollator),
  97       nfd(*Normalizer2Factory::getNFDInstance(status)),
  98       targetString(NULL), targetBuffer(NULL), targetLength(0), elements(NULL), charBreakIterator(NULL)
  99 {
 100     strength = ucol_getStrength(coll);
 101     toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) ==  UCOL_SHIFTED;
 102     variableTop = ucol_getVariableTop(coll, &status);
 103
 104     // find the largest expansion
 105     uint8_t maxExpansion = 0;
 106     for (const uint8_t *expansion = coll->expansionCESize; *expansion != 0; expansion += 1) {
 107         if (*expansion > maxExpansion) {
 108             maxExpansion = *expansion;
 109         }
 110     }
 111
 112     // room for an extra character on each end, plus 4 for safety
 113     bufferSize = patternLength + (2 * maxExpansion) + 4;
 114
 115     ceb = NEW_ARRAY(CEI, bufferSize);
 116
 117     if (ceb == NULL) {
 118         status = U_MEMORY_ALLOCATION_ERROR;
 119         return;
 120     }
 121
 122     if (target != NULL) {
 123         setTargetString(target);
 124     }
 125
 126     switch (strength)
 127     {
 128     default:
 129         strengthMask |= UCOL_TERTIARYORDERMASK;
 130         /* fall through */
 131
 132     case UCOL_SECONDARY:
 133         strengthMask |= UCOL_SECONDARYORDERMASK;
 134         /* fall through */
 135
 136     case UCOL_PRIMARY:
 137         strengthMask |= UCOL_PRIMARYORDERMASK;
 138     }
 139 }
 140
 141 Target::~Target()
 142 {
 143     ubrk_close(charBreakIterator);
 144     ucol_closeElements(elements);
 145
 146     DELETE_ARRAY(ceb);
 147 }
 148
 149 void Target::setTargetString(const UnicodeString *target)
 150 {
 151     if (charBreakIterator != NULL) {
 152         ubrk_close(charBreakIterator);
 153         ucol_closeElements(elements);
 154     }
 155
 156     targetString = target;
 157
 158     if (targetString != NULL) {
 159         UErrorCode status = U_ZERO_ERROR;
 160
 161         targetBuffer = targetString->getBuffer();
 162         targetLength = targetString->length();
 163
 164         elements = ucol_openElements(coll, target->getBuffer(), target->length(), &status);
 165         ucol_forceHanImplicit(elements, &status);
 166
 167         charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
 168                                       targetBuffer, targetLength, &status);
 169     } else {
 170         targetBuffer = NULL;
 171         targetLength = 0;
 172     }
 173 }
 174
 175 const CEI *Target::nextCE(int32_t offset)
 176 {
 177     UErrorCode status = U_ZERO_ERROR;
 178     int32_t low = -1, high = -1;
 179     uint32_t order;
 180     UBool cont = FALSE;
 181
 182     if (offset >= bufferMin && offset < bufferMax) {
 183         return &ceb[offset];
 184     }
 185
 186     if (bufferMax >= bufferSize || offset != bufferMax) {
 187         return NULL;
 188     }
 189
 190     do {
 191         low   = ucol_getOffset(elements);
 192         order = ucol_next(elements, &status);
 193         high  = ucol_getOffset(elements);
 194
 195         if (order == (uint32_t)UCOL_NULLORDER) {
 196           //high = low = -1;
 197             break;
 198         }
 199
 200         cont = isContinuation(order);
 201         order &= strengthMask;
 202
 203         if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) {
 204             if (strength >= UCOL_QUATERNARY) {
 205                 order &= UCOL_PRIMARYORDERMASK;
 206             } else {
 207                 order = UCOL_IGNORABLE;
 208             }
 209         }
 210     } while (order == UCOL_IGNORABLE);
 211
 212     if (cont) {
 213         order |= UCOL_CONTINUATION_MARKER;
 214     }
 215
 216     ceb[offset].order = order;
 217     ceb[offset].lowOffset = low;
 218     ceb[offset].highOffset = high;
 219
 220     bufferMax += 1;
 221
 222     return &ceb[offset];
 223 }
 224
 225 const CEI *Target::prevCE(int32_t offset)
 226 {
 227     UErrorCode status = U_ZERO_ERROR;
 228     int32_t low = -1, high = -1;
 229     uint32_t order;
 230     UBool cont = FALSE;
 231
 232     if (offset >= bufferMin && offset < bufferMax) {
 233         return &ceb[offset];
 234     }
 235
 236     if (bufferMax >= bufferSize || offset != bufferMax) {
 237         return NULL;
 238     }
 239
 240     do {
 241         high  = ucol_getOffset(elements);
 242         order = ucol_previous(elements, &status);
 243         low   = ucol_getOffset(elements);
 244
 245         if (order == (uint32_t)UCOL_NULLORDER) {
 246             break;
 247         }
 248
 249         cont = isContinuation(order);
 250         order &= strengthMask;
 251
 252         if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) {
 253             if (strength >= UCOL_QUATERNARY) {
 254                 order &= UCOL_PRIMARYORDERMASK;
 255             } else {
 256                 order = UCOL_IGNORABLE;
 257             }
 258         }
 259     } while (order == UCOL_IGNORABLE);
 260
 261     bufferMax += 1;
 262
 263     if (cont) {
 264         order |= UCOL_CONTINUATION_MARKER;
 265     }
 266
 267     ceb[offset].order       = order;
 268     ceb[offset].lowOffset   = low;
 269     ceb[offset].highOffset = high;
 270
 271     return &ceb[offset];
 272 }
 273
 274 int32_t Target::stringLength()
 275 {
 276     if (targetString != NULL) {
 277         return targetLength;
 278     }
 279
 280     return 0;
 281 }
 282
 283 UChar Target::charAt(int32_t offset)
 284 {
 285     if (targetString != NULL) {
 286         return targetBuffer[offset];
 287     }
 288
 289     return 0x0000;
 290 }
 291
 292 void Target::setOffset(int32_t offset)
 293 {
 294     UErrorCode status = U_ZERO_ERROR;
 295
 296     bufferMin = 0;
 297     bufferMax = 0;
 298
 299     ucol_setOffset(elements, offset, &status);
 300 }
 301
 302 void Target::setLast(int32_t last)
 303 {
 304     UErrorCode status = U_ZERO_ERROR;
 305
 306     bufferMin = 0;
 307     bufferMax = 1;
 308
 309     ceb[0].order      = UCOL_NULLORDER;
 310     ceb[0].lowOffset  = last;
 311     ceb[0].highOffset = last;
 312
 313     ucol_setOffset(elements, last, &status);
 314 }
 315
 316 int32_t Target::getOffset()
 317 {
 318     return ucol_getOffset(elements);
 319 }
 320
 321 UBool Target::isBreakBoundary(int32_t offset)
 322 {
 323     return ubrk_isBoundary(charBreakIterator, offset);
 324 }
 325
 326 int32_t Target::nextBreakBoundary(int32_t offset)
 327 {
 328     return ubrk_following(charBreakIterator, offset);
 329 }
 330
 331 int32_t Target::nextSafeBoundary(int32_t offset)
 332 {
 333     while (offset < targetLength) {
 334       //UChar ch = charAt(offset);
 335         UChar ch = targetBuffer[offset];
 336
 337         if (U_IS_LEAD(ch) || ! ucol_unsafeCP(ch, coll)) {
 338             return offset;
 339         }
 340
 341         offset += 1;
 342     }
 343
 344     return targetLength;
 345 }
 346
 347 UBool Target::isIdentical(UnicodeString &pattern, int32_t start, int32_t end)
 348 {
 349     if (strength < UCOL_IDENTICAL) {
 350         return TRUE;
 351     }
 352
 353     // Note: We could use Normalizer::compare() or similar, but for short strings
 354     // which may not be in FCD it might be faster to just NFD them.
 355     UErrorCode status = U_ZERO_ERROR;
 356     UnicodeString t2, p2;
 357     nfd.normalize(UnicodeString(FALSE, targetBuffer + start, end - start), t2, status);
 358     nfd.normalize(pattern, p2, status);
 359     // return FALSE if NFD failed
 360     return U_SUCCESS(status) && t2 == p2;
 361 }
 362
 363 #define HASH_TABLE_SIZE 257
 364
 365 class BadCharacterTable : public UMemory
 366 {
 367 public:
 368     BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status);
 369     ~BadCharacterTable();
 370
 371     int32_t operator[](uint32_t ce) const;
 372     int32_t getMaxSkip() const;
 373     int32_t minLengthInChars(int32_t index);
 374
 375 private:
 376     static int32_t hash(uint32_t ce);
 377
 378     int32_t maxSkip;
 379     int32_t badCharacterTable[HASH_TABLE_SIZE];
 380
 381     int32_t *minLengthCache;
 382 };
 383
 384 BadCharacterTable::BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status)
 385     : minLengthCache(NULL)
 386 {
 387     int32_t plen = patternCEs.size();
 388
 389     // **** need a better way to deal with this ****
 390     if (U_FAILURE(status) || plen == 0) {
 391         return;
 392     }
 393
 394     int32_t *history = NEW_ARRAY(int32_t, plen);
 395
 396     if (history == NULL) {
 397         status = U_MEMORY_ALLOCATION_ERROR;
 398         return;
 399     }
 400
 401     for (int32_t i = 0; i < plen; i += 1) {
 402         history[i] = -1;
 403     }
 404
 405     minLengthCache = NEW_ARRAY(int32_t, plen + 1);
 406
 407     if (minLengthCache == NULL) {
 408         DELETE_ARRAY(history);
 409         status = U_MEMORY_ALLOCATION_ERROR;
 410         return;
 411     }
 412
 413     maxSkip = minLengthCache[0] = data->minLengthInChars(&patternCEs, 0, history);
 414
 415     for(int32_t j = 0; j < HASH_TABLE_SIZE; j += 1) {
 416         badCharacterTable[j] = maxSkip;
 417     }
 418
 419     for(int32_t p = 1; p < plen; p += 1) {
 420         minLengthCache[p] = data->minLengthInChars(&patternCEs, p, history);
 421
 422         // Make sure this entry is not bigger than the previous one.
 423         // Otherwise, we might skip too far in some cases.
 424         if (minLengthCache[p] < 0 || minLengthCache[p] > minLengthCache[p - 1]) {
 425             minLengthCache[p] = minLengthCache[p - 1];
 426         }
 427     }
 428
 429     minLengthCache[plen] = 0;
 430
 431     for(int32_t p = 0; p < plen - 1; p += 1) {
 432         badCharacterTable[hash(patternCEs[p])] = minLengthCache[p + 1];
 433     }
 434
 435     DELETE_ARRAY(history);
 436 }
 437
 438 BadCharacterTable::~BadCharacterTable()
 439 {
 440     DELETE_ARRAY(minLengthCache);
 441 }
 442
 443 int32_t BadCharacterTable::operator[](uint32_t ce) const
 444 {
 445     return badCharacterTable[hash(ce)];
 446 }
 447
 448 int32_t BadCharacterTable::getMaxSkip() const
 449 {
 450     return maxSkip;
 451 }
 452
 453 int32_t BadCharacterTable::minLengthInChars(int32_t index)
 454 {
 455     return minLengthCache[index];
 456 }
 457
 458 int32_t BadCharacterTable::hash(uint32_t ce)
 459 {
 460     return UCOL_PRIMARYORDER(ce) % HASH_TABLE_SIZE;
 461 }
 462
 463 class GoodSuffixTable : public UMemory
 464 {
 465 public:
 466     GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status);
 467     ~GoodSuffixTable();
 468
 469     int32_t operator[](int32_t offset) const;
 470
 471 private:
 472     int32_t *goodSuffixTable;
 473 };
 474
 475 GoodSuffixTable::GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status)
 476     : goodSuffixTable(NULL)
 477 {
 478     int32_t patlen = patternCEs.size();
 479
 480     // **** need a better way to deal with this ****
 481     if (U_FAILURE(status) || patlen <= 0) {
 482         return;
 483     }
 484
 485     int32_t *suff  = NEW_ARRAY(int32_t, patlen);
 486     int32_t start = patlen - 1, end = - 1;
 487     int32_t maxSkip = badCharacterTable.getMaxSkip();
 488
 489     if (suff == NULL) {
 490         status = U_MEMORY_ALLOCATION_ERROR;
 491         return;
 492     }
 493
 494     // initialze suff
 495     suff[patlen - 1] = patlen;
 496
 497     for (int32_t i = patlen - 2; i >= 0; i -= 1) {
 498         // (i > start) means we're inside the last suffix match we found
 499         // ((patlen - 1) - end) is how far the end of that match is from end of pattern
 500         // (i - start) is how far we are from start of that match
 501         // (i + (patlen - 1) - end) is index of same character at end of pattern
 502         // so if any suffix match at that character doesn't extend beyond the last match,
 503         // it's the suffix for this character as well
 504         if (i > start && suff[i + patlen - 1 - end] < i - start) {
 505             suff[i] = suff[i + patlen - 1 - end];
 506         } else {
 507             start = end = i;
 508
 509             int32_t s = patlen;
 510
 511             while (start >= 0 && patternCEs[start] == patternCEs[--s]) {
 512                 start -= 1;
 513             }
 514
 515             suff[i] = end - start;
 516         }
 517     }
 518
 519     // now build goodSuffixTable
 520     goodSuffixTable  = NEW_ARRAY(int32_t, patlen);
 521
 522     if (goodSuffixTable == NULL) {
 523         DELETE_ARRAY(suff);
 524         status = U_MEMORY_ALLOCATION_ERROR;
 525         return;
 526     }
 527
 528
 529     // initialize entries to minLengthInChars of the pattern
 530     for (int32_t i = 0; i < patlen; i += 1) {
 531         goodSuffixTable[i] = maxSkip;
 532     }
 533
 534     int32_t prefix = 0;
 535
 536     for (int32_t i = patlen - /*1*/ 2; i >= 0; i -= 1) {
 537         if (suff[i] == i + 1) {
 538             // this matching suffix is a prefix of the pattern
 539             int32_t prefixSkip = badCharacterTable.minLengthInChars(i + 1);
 540
 541             // for any mis-match before this suffix, we should skip
 542             // so that the front of the pattern (i.e. the prefix)
 543             // lines up with the front of the suffix.
 544             // (patlen - 1 - i) is the start of the suffix
 545             while (prefix < patlen - 1 - i) {
 546                 // value of maxSkip means never set...
 547                 if (goodSuffixTable[prefix] == maxSkip) {
 548                     goodSuffixTable[prefix] = prefixSkip;
 549                 }
 550
 551                 prefix += 1;
 552             }
 553         }
 554     }
 555
 556     for (int32_t i = 0; i < patlen - 1; i += 1) {
 557         goodSuffixTable[patlen - 1 - suff[i]] = badCharacterTable.minLengthInChars(i + 1);
 558     }
 559
 560     DELETE_ARRAY(suff);
 561 }
 562
 563 GoodSuffixTable::~GoodSuffixTable()
 564 {
 565     DELETE_ARRAY(goodSuffixTable);
 566 }
 567
 568 int32_t GoodSuffixTable::operator[](int32_t offset) const
 569 {
 570     return goodSuffixTable[offset];
 571 }
 572
 573 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BoyerMooreSearch)
 574
 575
 576 UBool BoyerMooreSearch::empty()
 577 {
 578     return patCEs->size() <= 0;
 579 }
 580
 581 CollData *BoyerMooreSearch::getData()
 582 {
 583     return data;
 584 }
 585
 586 CEList *BoyerMooreSearch::getPatternCEs()
 587 {
 588     return patCEs;
 589 }
 590
 591 BadCharacterTable *BoyerMooreSearch::getBadCharacterTable()
 592 {
 593     return badCharacterTable;
 594 }
 595
 596 GoodSuffixTable *BoyerMooreSearch::getGoodSuffixTable()
 597 {
 598     return goodSuffixTable;
 599 }
 600
 601 BoyerMooreSearch::BoyerMooreSearch(CollData *theData, const UnicodeString &patternString, const UnicodeString *targetString,
 602                                    UErrorCode &status)
 603     : data(theData), patCEs(NULL), badCharacterTable(NULL), goodSuffixTable(NULL), pattern(patternString), target(NULL)
 604 {
 605
 606     if (U_FAILURE(status)) {
 607         return;
 608     }
 609
 610     UCollator *collator = data->getCollator();
 611
 612     patCEs = new CEList(collator, patternString, status);
 613
 614     if (patCEs == NULL || U_FAILURE(status)) {
 615         return;
 616     }
 617
 618     badCharacterTable = new BadCharacterTable(*patCEs, data, status);
 619
 620     if (badCharacterTable == NULL || U_FAILURE(status)) {
 621         return;
 622     }
 623
 624     goodSuffixTable = new GoodSuffixTable(*patCEs, *badCharacterTable, status);
 625
 626     if (targetString != NULL) {
 627         target = new Target(collator, targetString, patCEs->size(), status);
 628     }
 629 }
 630
 631 BoyerMooreSearch::~BoyerMooreSearch()
 632 {
 633     delete target;
 634     delete goodSuffixTable;
 635     delete badCharacterTable;
 636     delete patCEs;
 637 }
 638
 639 void BoyerMooreSearch::setTargetString(const UnicodeString *targetString, UErrorCode &status)
 640 {
 641     if (U_FAILURE(status)) {
 642         return;
 643     }
 644
 645     if (target == NULL) {
 646         target = new Target(data->getCollator(), targetString, patCEs->size(), status);
 647     } else {
 648         target->setTargetString(targetString);
 649     }
 650 }
 651
 652 // **** main flow of this code from Laura Werner's "Unicode Text Searching in Java" paper. ****
 653 /*
 654  * TODO:
 655  *  * deal with trailing (and leading?) ignorables.
 656  *  * Adding BoyerMooreSearch object slowed it down. How can we speed it up?
 657  */
 658 UBool BoyerMooreSearch::search(int32_t offset, int32_t &start, int32_t &end)
 659 {
 660     /*UCollator *coll =*/ data->getCollator();
 661     int32_t plen = patCEs->size();
 662     int32_t tlen = target->stringLength();
 663     int32_t maxSkip = badCharacterTable->getMaxSkip();
 664     int32_t tOffset = offset + maxSkip;
 665
 666     if (plen <= 0) {
 667         // Searching for a zero length pattern always fails.
 668         start = end = -1;
 669         return FALSE;
 670     }
 671
 672     while (tOffset <= tlen) {
 673         int32_t pIndex = plen - 1;
 674         int32_t tIndex = 0;
 675         int32_t lIndex = 0;
 676
 677         if (tOffset < tlen) {
 678             // **** we really want to skip ahead enough to  ****
 679             // **** be sure we get at least 1 non-ignorable ****
 680             // **** CE after the end of the pattern.        ****
 681             int32_t next = target->nextSafeBoundary(tOffset + 1);
 682
 683             target->setOffset(next);
 684
 685             for (lIndex = 0; ; lIndex += 1) {
 686                 const CEI *cei = target->prevCE(lIndex);
 687                 int32_t low = cei->lowOffset;
 688                 int32_t high = cei->highOffset;
 689
 690                 if (high == 0 || (low < high && low <= tOffset)) {
 691                     if (low < tOffset) {
 692                         while (lIndex >= 0 && target->prevCE(lIndex)->highOffset == high) {
 693                             lIndex -= 1;
 694                         }
 695
 696                         if (high > tOffset) {
 697                             tOffset = high;
 698                         }
 699                     }
 700
 701                     break;
 702                 }
 703             }
 704         } else {
 705             target->setLast(tOffset);
 706             lIndex = 0;
 707         }
 708
 709         tIndex = ++lIndex;
 710
 711         // Iterate backward until we hit the beginning of the pattern
 712         while (pIndex >= 0) {
 713             uint32_t pce = (*patCEs)[pIndex];
 714             const CEI *tcei = target->prevCE(tIndex++);
 715
 716
 717             if (tcei->order != pce) {
 718                 // There is a mismatch at this position.  Decide how far
 719                 // over to shift the pattern, then try again.
 720
 721                 int32_t gsOffset = tOffset + (*goodSuffixTable)[pIndex];
 722 #ifdef EXTRA_CAUTIOUS
 723                 int32_t old = tOffset;
 724 #endif
 725
 726                 tOffset += (*badCharacterTable)[tcei->order] - badCharacterTable->minLengthInChars(pIndex + 1);
 727
 728                 if (gsOffset > tOffset) {
 729                     tOffset = gsOffset;
 730                 }
 731
 732 #ifdef EXTRA_CAUTIOUS
 733                 // Make sure we don't skip backwards...
 734                 if (tOffset <= old) {
 735                     tOffset = old + 1;
 736                 }
 737 #endif
 738
 739                 break;
 740             }
 741
 742             pIndex -= 1;
 743         }
 744
 745         if (pIndex < 0) {
 746             // We made it back to the beginning of the pattern,
 747             // which means we matched it all.  Return the location.
 748             const CEI firstCEI = *target->prevCE(tIndex - 1);
 749             const CEI lastCEI  = *target->prevCE(lIndex);
 750             int32_t mStart   = firstCEI.lowOffset;
 751             int32_t minLimit = lastCEI.lowOffset;
 752             int32_t maxLimit = lastCEI.highOffset;
 753             int32_t mLimit;
 754             UBool found = TRUE;
 755
 756             target->setOffset(/*tOffset*/maxLimit);
 757
 758             const CEI nextCEI = *target->nextCE(0);
 759
 760             if (nextCEI.lowOffset > maxLimit) {
 761                 maxLimit = nextCEI.lowOffset;
 762             }
 763
 764             if (nextCEI.lowOffset == nextCEI.highOffset && nextCEI.order != (uint32_t)UCOL_NULLORDER) {
 765                 found = FALSE;
 766             }
 767
 768             if (! target->isBreakBoundary(mStart)) {
 769                 found = FALSE;
 770             }
 771
 772             if (firstCEI.lowOffset == firstCEI.highOffset) {
 773                 found = FALSE;
 774             }
 775
 776             mLimit = maxLimit;
 777             if (minLimit < maxLimit) {
 778                 // When the last CE's low index is same with its high index, the CE is likely
 779                 // a part of expansion. In this case, the index is located just after the
 780                 // character corresponding to the CEs compared above. If the index is right
 781                 // at the break boundary, move the position to the next boundary will result
 782                 // incorrect match length when there are ignorable characters exist between
 783                 // the position and the next character produces CE(s). See ticket#8482.
 784                 if (minLimit == lastCEI.highOffset && target->isBreakBoundary(minLimit)) {
 785                     mLimit = minLimit;
 786                 } else {
 787                     int32_t nbb = target->nextBreakBoundary(minLimit);
 788
 789                     if (nbb >= lastCEI.highOffset) {
 790                         mLimit = nbb;
 791                     }
 792                 }
 793             }
 794
 795             if (mLimit > maxLimit) {
 796                 found = FALSE;
 797             }
 798
 799             if (! target->isBreakBoundary(mLimit)) {
 800                 found = FALSE;
 801             }
 802
 803             if (! target->isIdentical(pattern, mStart, mLimit)) {
 804                 found = FALSE;
 805             }
 806
 807             if (found) {
 808                 start = mStart;
 809                 end   = mLimit;
 810
 811                 return TRUE;
 812             }
 813
 814             tOffset += (*goodSuffixTable)[0]; // really? Maybe += 1 or += maxSkip?
 815         }
 816         // Otherwise, we're here because of a mismatch, so keep going....
 817     }
 818
 819     // no match
 820    start = -1;
 821    end = -1;
 822    return FALSE;
 823 }
 824
 825 U_NAMESPACE_END
 826
 827 #endif // #if !UCONFIG_NO_COLLATION