icuSources/i18n/usearch.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 **********************************************************************
   5 *   Copyright (C) 2001-2015 IBM and others. All rights reserved.
   6 **********************************************************************
   7 *   Date        Name        Description
   8 *  07/02/2001   synwee      Creation.
   9 **********************************************************************
  10 */
  11
  12 #include "unicode/utypes.h"
  13
  14 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
  15
  16 #include "unicode/usearch.h"
  17 #include "unicode/ustring.h"
  18 #include "unicode/uchar.h"
  19 #include "unicode/utf16.h"
  20 #include "normalizer2impl.h"
  21 #include "usrchimp.h"
  22 #include "cmemory.h"
  23 #include "ucln_in.h"
  24 #include "uassert.h"
  25 #include "ustr_imp.h"
  26
  27 U_NAMESPACE_USE
  28
  29 // don't use Boyer-Moore
  30 // (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
  31 #define BOYER_MOORE 0
  32
  33 // internal definition ---------------------------------------------------
  34
  35 #define LAST_BYTE_MASK_          0xFF
  36 #define SECOND_LAST_BYTE_SHIFT_  8
  37 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
  38
  39 static const Normalizer2Impl *g_nfcImpl = NULL;
  40
  41 // internal methods -------------------------------------------------
  42
  43 /**
  44 * Fast collation element iterator setOffset.
  45 * This function does not check for bounds.
  46 * @param coleiter collation element iterator
  47 * @param offset to set
  48 */
  49 static
  50 inline void setColEIterOffset(UCollationElements *elems,
  51                       int32_t             offset)
  52 {
  53     // Note: Not "fast" any more after the 2013 collation rewrite.
  54     // We do not want to expose more internals than necessary.
  55     UErrorCode status = U_ZERO_ERROR;
  56     ucol_setOffset(elems, offset, &status);
  57 }
  58
  59 /**
  60 * Getting the mask for collation strength
  61 * @param strength collation strength
  62 * @return collation element mask
  63 */
  64 static
  65 inline uint32_t getMask(UCollationStrength strength)
  66 {
  67     switch (strength)
  68     {
  69     case UCOL_PRIMARY:
  70         return UCOL_PRIMARYORDERMASK;
  71     case UCOL_SECONDARY:
  72         return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
  73     default:
  74         return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
  75                UCOL_PRIMARYORDERMASK;
  76     }
  77 }
  78
  79 /**
  80 * @param ce 32-bit collation element
  81 * @return hash code
  82 */
  83 static
  84 inline int hashFromCE32(uint32_t ce)
  85 {
  86     int hc = (int)(
  87             ((((((ce >> 24) * 37) +
  88             (ce >> 16)) * 37) +
  89             (ce >> 8)) * 37) +
  90             ce);
  91     hc %= MAX_TABLE_SIZE_;
  92     if (hc < 0) {
  93         hc += MAX_TABLE_SIZE_;
  94     }
  95     return hc;
  96 }
  97
  98 U_CDECL_BEGIN
  99 static UBool U_CALLCONV
 100 usearch_cleanup(void) {
 101     g_nfcImpl = NULL;
 102     return TRUE;
 103 }
 104 U_CDECL_END
 105
 106 /**
 107 * Initializing the fcd tables.
 108 * Internal method, status assumed to be a success.
 109 * @param status output error if any, caller to check status before calling
 110 *               method, status assumed to be success when passed in.
 111 */
 112 static
 113 inline void initializeFCD(UErrorCode *status)
 114 {
 115     if (g_nfcImpl == NULL) {
 116         g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
 117         ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
 118     }
 119 }
 120
 121 /**
 122 * Gets the fcd value for a character at the argument index.
 123 * This method takes into accounts of the supplementary characters.
 124 * @param str UTF16 string where character for fcd retrieval resides
 125 * @param offset position of the character whose fcd is to be retrieved, to be
 126 *               overwritten with the next character position, taking
 127 *               surrogate characters into consideration.
 128 * @param strlength length of the argument string
 129 * @return fcd value
 130 */
 131 static
 132 uint16_t getFCD(const UChar   *str, int32_t *offset,
 133                              int32_t  strlength)
 134 {
 135     const UChar *temp = str + *offset;
 136     uint16_t    result = g_nfcImpl->nextFCD16(temp, str + strlength);
 137     *offset = (int32_t)(temp - str);
 138     return result;
 139 }
 140
 141 /**
 142 * Getting the modified collation elements taking into account the collation
 143 * attributes
 144 * @param strsrch string search data
 145 * @param sourcece
 146 * @return the modified collation element
 147 */
 148 static
 149 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
 150 {
 151     // note for tertiary we can't use the collator->tertiaryMask, that
 152     // is a preprocessed mask that takes into account case options. since
 153     // we are only concerned with exact matches, we don't need that.
 154     sourcece &= strsrch->ceMask;
 155
 156     if (strsrch->toShift) {
 157         // alternate handling here, since only the 16 most significant digits
 158         // is only used, we can safely do a compare without masking
 159         // if the ce is a variable, we mask and get only the primary values
 160         // no shifting to quartenary is required since all primary values
 161         // less than variabletop will need to be masked off anyway.
 162         if (strsrch->variableTop > sourcece) {
 163             if (strsrch->strength >= UCOL_QUATERNARY) {
 164                 sourcece &= UCOL_PRIMARYORDERMASK;
 165             }
 166             else {
 167                 sourcece = UCOL_IGNORABLE;
 168             }
 169         }
 170     } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
 171         sourcece = 0xFFFF;
 172     }
 173
 174     return sourcece;
 175 }
 176
 177 /**
 178 * Allocate a memory and returns NULL if it failed.
 179 * Internal method, status assumed to be a success.
 180 * @param size to allocate
 181 * @param status output error if any, caller to check status before calling
 182 *               method, status assumed to be success when passed in.
 183 * @return newly allocated array, NULL otherwise
 184 */
 185 static
 186 inline void * allocateMemory(uint32_t size, UErrorCode *status)
 187 {
 188     uint32_t *result = (uint32_t *)uprv_malloc(size);
 189     if (result == NULL) {
 190         *status = U_MEMORY_ALLOCATION_ERROR;
 191     }
 192     return result;
 193 }
 194
 195 /**
 196 * Adds a uint32_t value to a destination array.
 197 * Creates a new array if we run out of space. The caller will have to
 198 * manually deallocate the newly allocated array.
 199 * Internal method, status assumed to be success, caller has to check status
 200 * before calling this method. destination not to be NULL and has at least
 201 * size destinationlength.
 202 * @param destination target array
 203 * @param offset destination offset to add value
 204 * @param destinationlength target array size, return value for the new size
 205 * @param value to be added
 206 * @param increments incremental size expected
 207 * @param status output error if any, caller to check status before calling
 208 *               method, status assumed to be success when passed in.
 209 * @return new destination array, destination if there was no new allocation
 210 */
 211 static
 212 inline int32_t * addTouint32_tArray(int32_t    *destination,
 213                                     uint32_t    offset,
 214                                     uint32_t   *destinationlength,
 215                                     uint32_t    value,
 216                                     uint32_t    increments,
 217                                     UErrorCode *status)
 218 {
 219     uint32_t newlength = *destinationlength;
 220     if (offset + 1 == newlength) {
 221         newlength += increments;
 222         int32_t *temp = (int32_t *)allocateMemory(
 223                                          sizeof(int32_t) * newlength, status);
 224         if (U_FAILURE(*status)) {
 225             return NULL;
 226         }
 227         uprv_memcpy(temp, destination, sizeof(int32_t) * (size_t)offset);
 228         *destinationlength = newlength;
 229         destination        = temp;
 230     }
 231     destination[offset] = value;
 232     return destination;
 233 }
 234
 235 /**
 236 * Adds a uint64_t value to a destination array.
 237 * Creates a new array if we run out of space. The caller will have to
 238 * manually deallocate the newly allocated array.
 239 * Internal method, status assumed to be success, caller has to check status
 240 * before calling this method. destination not to be NULL and has at least
 241 * size destinationlength.
 242 * @param destination target array
 243 * @param offset destination offset to add value
 244 * @param destinationlength target array size, return value for the new size
 245 * @param value to be added
 246 * @param increments incremental size expected
 247 * @param status output error if any, caller to check status before calling
 248 *               method, status assumed to be success when passed in.
 249 * @return new destination array, destination if there was no new allocation
 250 */
 251 static
 252 inline int64_t * addTouint64_tArray(int64_t    *destination,
 253                                     uint32_t    offset,
 254                                     uint32_t   *destinationlength,
 255                                     uint64_t    value,
 256                                     uint32_t    increments,
 257                                     UErrorCode *status)
 258 {
 259     uint32_t newlength = *destinationlength;
 260     if (offset + 1 == newlength) {
 261         newlength += increments;
 262         int64_t *temp = (int64_t *)allocateMemory(
 263                                          sizeof(int64_t) * newlength, status);
 264
 265         if (U_FAILURE(*status)) {
 266             return NULL;
 267         }
 268
 269         uprv_memcpy(temp, destination, sizeof(int64_t) * (size_t)offset);
 270         *destinationlength = newlength;
 271         destination        = temp;
 272     }
 273
 274     destination[offset] = value;
 275
 276     return destination;
 277 }
 278
 279 /**
 280 * Initializing the ce table for a pattern.
 281 * Stores non-ignorable collation keys.
 282 * Table size will be estimated by the size of the pattern text. Table
 283 * expansion will be perform as we go along. Adding 1 to ensure that the table
 284 * size definitely increases.
 285 * Internal method, status assumed to be a success.
 286 * @param strsrch string search data
 287 * @param status output error if any, caller to check status before calling
 288 *               method, status assumed to be success when passed in.
 289 * @return total number of expansions
 290 */
 291 static
 292 inline uint16_t initializePatternCETable(UStringSearch *strsrch,
 293                                          UErrorCode    *status)
 294 {
 295     UPattern *pattern            = &(strsrch->pattern);
 296     uint32_t  cetablesize        = INITIAL_ARRAY_SIZE_;
 297     int32_t  *cetable            = pattern->cesBuffer;
 298     uint32_t  patternlength      = pattern->textLength;
 299     UCollationElements *coleiter = strsrch->utilIter;
 300
 301     if (coleiter == NULL) {
 302         coleiter = ucol_openElements(strsrch->collator, pattern->text,
 303                                      patternlength, status);
 304         // status will be checked in ucol_next(..) later and if it is an
 305         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
 306         // returned.
 307         strsrch->utilIter = coleiter;
 308     }
 309     else {
 310         ucol_setText(coleiter, pattern->text, pattern->textLength, status);
 311     }
 312     if(U_FAILURE(*status)) {
 313         return 0;
 314     }
 315
 316     if (pattern->ces != cetable && pattern->ces) {
 317         uprv_free(pattern->ces);
 318     }
 319
 320     uint16_t  offset      = 0;
 321     uint16_t  result      = 0;
 322     int32_t   ce;
 323
 324     while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
 325            U_SUCCESS(*status)) {
 326         uint32_t newce = getCE(strsrch, ce);
 327         if (newce) {
 328             int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
 329                                   newce,
 330                                   patternlength - ucol_getOffset(coleiter) + 1,
 331                                   status);
 332             if (U_FAILURE(*status)) {
 333                 return 0;
 334             }
 335             offset ++;
 336             if (cetable != temp && cetable != pattern->cesBuffer) {
 337                 uprv_free(cetable);
 338             }
 339             cetable = temp;
 340         }
 341         result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
 342     }
 343
 344     cetable[offset]   = 0;
 345     pattern->ces       = cetable;
 346     pattern->cesLength = offset;
 347
 348     return result;
 349 }
 350
 351 /**
 352 * Initializing the pce table for a pattern.
 353 * Stores non-ignorable collation keys.
 354 * Table size will be estimated by the size of the pattern text. Table
 355 * expansion will be perform as we go along. Adding 1 to ensure that the table
 356 * size definitely increases.
 357 * Internal method, status assumed to be a success.
 358 * @param strsrch string search data
 359 * @param status output error if any, caller to check status before calling
 360 *               method, status assumed to be success when passed in.
 361 * @return total number of expansions
 362 */
 363 static
 364 inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
 365                                           UErrorCode    *status)
 366 {
 367     UPattern *pattern            = &(strsrch->pattern);
 368     uint32_t  pcetablesize       = INITIAL_ARRAY_SIZE_;
 369     int64_t  *pcetable           = pattern->pcesBuffer;
 370     uint32_t  patternlength      = pattern->textLength;
 371     UCollationElements *coleiter = strsrch->utilIter;
 372
 373     if (coleiter == NULL) {
 374         coleiter = ucol_openElements(strsrch->collator, pattern->text,
 375                                      patternlength, status);
 376         // status will be checked in ucol_next(..) later and if it is an
 377         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
 378         // returned.
 379         strsrch->utilIter = coleiter;
 380     } else {
 381         ucol_setText(coleiter, pattern->text, pattern->textLength, status);
 382     }
 383     if(U_FAILURE(*status)) {
 384         return 0;
 385     }
 386
 387     if (pattern->pces != pcetable && pattern->pces != NULL) {
 388         uprv_free(pattern->pces);
 389     }
 390
 391     uint16_t  offset = 0;
 392     uint16_t  result = 0;
 393     int64_t   pce;
 394
 395     icu::UCollationPCE iter(coleiter);
 396
 397     // ** Should processed CEs be signed or unsigned?
 398     // ** (the rest of the code in this file seems to play fast-and-loose with
 399     // **  whether a CE is signed or unsigned. For example, look at routine above this one.)
 400     while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
 401            U_SUCCESS(*status)) {
 402         int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
 403                               pce,
 404                               patternlength - ucol_getOffset(coleiter) + 1,
 405                               status);
 406
 407         if (U_FAILURE(*status)) {
 408             return 0;
 409         }
 410
 411         offset += 1;
 412
 413         if (pcetable != temp && pcetable != pattern->pcesBuffer) {
 414             uprv_free(pcetable);
 415         }
 416
 417         pcetable = temp;
 418         //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
 419     }
 420
 421     pcetable[offset]   = 0;
 422     pattern->pces       = pcetable;
 423     pattern->pcesLength = offset;
 424
 425     return result;
 426 }
 427
 428 /**
 429 * Initializes the pattern struct.
 430 * Internal method, status assumed to be success.
 431 * @param strsrch UStringSearch data storage
 432 * @param status output error if any, caller to check status before calling
 433 *               method, status assumed to be success when passed in.
 434 * @return expansionsize the total expansion size of the pattern
 435 */
 436 static
 437 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
 438 {
 439     if (U_FAILURE(*status)) { return 0; }
 440           UPattern   *pattern     = &(strsrch->pattern);
 441     const UChar      *patterntext = pattern->text;
 442           int32_t     length      = pattern->textLength;
 443           int32_t index       = 0;
 444
 445     // Since the strength is primary, accents are ignored in the pattern.
 446     if (strsrch->strength == UCOL_PRIMARY) {
 447         pattern->hasPrefixAccents = 0;
 448         pattern->hasSuffixAccents = 0;
 449     } else {
 450         pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
 451                                                          SECOND_LAST_BYTE_SHIFT_;
 452         index = length;
 453         U16_BACK_1(patterntext, 0, index);
 454         pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
 455                                                                  LAST_BYTE_MASK_;
 456     }
 457
 458     // ** HACK **
 459     if (strsrch->pattern.pces != NULL) {
 460         if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
 461             uprv_free(strsrch->pattern.pces);
 462         }
 463
 464         strsrch->pattern.pces = NULL;
 465     }
 466
 467     // since intializePattern is an internal method status is a success.
 468     return initializePatternCETable(strsrch, status);
 469 }
 470
 471 /**
 472 * Initializing shift tables, with the default values.
 473 * If a corresponding default value is 0, the shift table is not set.
 474 * @param shift table for forwards shift
 475 * @param backshift table for backwards shift
 476 * @param cetable table containing pattern ce
 477 * @param cesize size of the pattern ces
 478 * @param expansionsize total size of the expansions
 479 * @param defaultforward the default forward value
 480 * @param defaultbackward the default backward value
 481 */
 482 static
 483 inline void setShiftTable(int16_t   shift[], int16_t backshift[],
 484                           int32_t  *cetable, int32_t cesize,
 485                           int16_t   expansionsize,
 486                           int16_t   defaultforward,
 487                           int16_t   defaultbackward)
 488 {
 489     // estimate the value to shift. to do that we estimate the smallest
 490     // number of characters to give the relevant ces, ie approximately
 491     // the number of ces minus their expansion, since expansions can come
 492     // from a character.
 493     int32_t count;
 494     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
 495         shift[count] = defaultforward;
 496     }
 497     cesize --; // down to the last index
 498     for (count = 0; count < cesize; count ++) {
 499         // number of ces from right of array to the count
 500         int temp = defaultforward - count - 1;
 501         shift[hashFromCE32(cetable[count])] = temp > 1 ? temp : 1;
 502     }
 503     shift[hashFromCE32(cetable[cesize])] = 1;
 504     // for ignorables we just shift by one. see test examples.
 505     shift[hashFromCE32(0)] = 1;
 506
 507     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
 508         backshift[count] = defaultbackward;
 509     }
 510     for (count = cesize; count > 0; count --) {
 511         // the original value count does not seem to work
 512         backshift[hashFromCE32(cetable[count])] = count > expansionsize ?
 513                                           (int16_t)(count - expansionsize) : 1;
 514     }
 515     backshift[hashFromCE32(cetable[0])] = 1;
 516     backshift[hashFromCE32(0)] = 1;
 517 }
 518
 519 /**
 520 * Building of the pattern collation element list and the boyer moore strsrch
 521 * table.
 522 * The canonical match will only be performed after the default match fails.
 523 * For both cases we need to remember the size of the composed and decomposed
 524 * versions of the string. Since the Boyer-Moore shift calculations shifts by
 525 * a number of characters in the text and tries to match the pattern from that
 526 * offset, the shift value can not be too large in case we miss some
 527 * characters. To choose a right shift size, we estimate the NFC form of the
 528 * and use its size as a shift guide. The NFC form should be the small
 529 * possible representation of the pattern. Anyways, we'll err on the smaller
 530 * shift size. Hence the calculation for minlength.
 531 * Canonical match will be performed slightly differently. We'll split the
 532 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
 533 * the first and last base character (MS), the ending accents (EA). Matches
 534 * will be done on MS first, and only when we match MS then some processing
 535 * will be required for the prefix and end accents in order to determine if
 536 * they match PA and EA. Hence the default shift values
 537 * for the canonical match will take the size of either end's accent into
 538 * consideration. Forwards search will take the end accents into consideration
 539 * for the default shift values and the backwards search will take the prefix
 540 * accents into consideration.
 541 * If pattern has no non-ignorable ce, we return a illegal argument error.
 542 * Internal method, status assumed to be success.
 543 * @param strsrch UStringSearch data storage
 544 * @param status  for output errors if it occurs, status is assumed to be a
 545 *                success when it is passed in.
 546 */
 547 static
 548 inline void initialize(UStringSearch *strsrch, UErrorCode *status)
 549 {
 550     int16_t expandlength  = initializePattern(strsrch, status);
 551     if (U_SUCCESS(*status) && strsrch->pattern.cesLength > 0) {
 552         UPattern *pattern = &strsrch->pattern;
 553         int32_t   cesize  = pattern->cesLength;
 554
 555         int16_t minlength = cesize > expandlength
 556                             ? (int16_t)cesize - expandlength : 1;
 557         pattern->defaultShiftSize    = minlength;
 558         setShiftTable(pattern->shift, pattern->backShift, pattern->ces,
 559                       cesize, expandlength, minlength, minlength);
 560         return;
 561     }
 562     strsrch->pattern.defaultShiftSize = 0;
 563 }
 564
 565 #if BOYER_MOORE
 566 /**
 567 * Check to make sure that the match length is at the end of the character by
 568 * using the breakiterator.
 569 * @param strsrch string search data
 570 * @param start target text start offset
 571 * @param end target text end offset
 572 */
 573 static
 574 void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
 575                                int32_t *end)
 576 {
 577 #if !UCONFIG_NO_BREAK_ITERATION
 578     UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
 579     if (breakiterator) {
 580         int32_t matchend = *end;
 581         //int32_t matchstart = *start;
 582
 583         if (!ubrk_isBoundary(breakiterator, matchend)) {
 584             *end = ubrk_following(breakiterator, matchend);
 585         }
 586
 587         /* Check the start of the matched text to make sure it doesn't have any accents
 588          * before it.  This code may not be necessary and so it is commented out */
 589         /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
 590             *start = ubrk_preceding(breakiterator, matchstart);
 591         }*/
 592     }
 593 #endif
 594 }
 595
 596 /**
 597 * Determine whether the target text in UStringSearch bounded by the offset
 598 * start and end is one or more whole units of text as
 599 * determined by the breakiterator in UStringSearch.
 600 * @param strsrch string search data
 601 * @param start target text start offset
 602 * @param end target text end offset
 603 */
 604 static
 605 UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
 606                                int32_t    end)
 607 {
 608 #if !UCONFIG_NO_BREAK_ITERATION
 609     UBreakIterator *breakiterator = strsrch->search->breakIter;
 610     //TODO: Add here.
 611     if (breakiterator) {
 612         int32_t startindex = ubrk_first(breakiterator);
 613         int32_t endindex   = ubrk_last(breakiterator);
 614
 615         // out-of-range indexes are never boundary positions
 616         if (start < startindex || start > endindex ||
 617             end < startindex || end > endindex) {
 618             return FALSE;
 619         }
 620         // otherwise, we can use following() on the position before the
 621         // specified one and return true of the position we get back is the
 622         // one the user specified
 623         UBool result = (start == startindex ||
 624                 ubrk_following(breakiterator, start - 1) == start) &&
 625                (end == endindex ||
 626                 ubrk_following(breakiterator, end - 1) == end);
 627         if (result) {
 628             // iterates the individual ces
 629                   UCollationElements *coleiter  = strsrch->utilIter;
 630             const UChar              *text      = strsrch->search->text +
 631                                                                       start;
 632                   UErrorCode          status    = U_ZERO_ERROR;
 633             ucol_setText(coleiter, text, end - start, &status);
 634             for (int32_t count = 0; count < strsrch->pattern.cesLength;
 635                  count ++) {
 636                 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
 637                 if (ce == UCOL_IGNORABLE) {
 638                     count --;
 639                     continue;
 640                 }
 641                 if (U_FAILURE(status) || ce != strsrch->pattern.ces[count]) {
 642                     return FALSE;
 643                 }
 644             }
 645             int32_t nextce = ucol_next(coleiter, &status);
 646             while (ucol_getOffset(coleiter) == (end - start)
 647                    && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
 648                 nextce = ucol_next(coleiter, &status);
 649             }
 650             if (ucol_getOffset(coleiter) == (end - start)
 651                 && nextce != UCOL_NULLORDER) {
 652                 // extra collation elements at the end of the match
 653                 return FALSE;
 654             }
 655         }
 656         return result;
 657     }
 658 #endif
 659     return TRUE;
 660 }
 661
 662 /**
 663 * Getting the next base character offset if current offset is an accent,
 664 * or the current offset if the current character contains a base character.
 665 * accents the following base character will be returned
 666 * @param text string
 667 * @param textoffset current offset
 668 * @param textlength length of text string
 669 * @return the next base character or the current offset
 670 *         if the current character is contains a base character.
 671 */
 672 static
 673 inline int32_t getNextBaseOffset(const UChar       *text,
 674                                            int32_t  textoffset,
 675                                            int32_t      textlength)
 676 {
 677     if (textoffset < textlength) {
 678         int32_t temp = textoffset;
 679         if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
 680             while (temp < textlength) {
 681                 int32_t result = temp;
 682                 if ((getFCD(text, &temp, textlength) >>
 683                      SECOND_LAST_BYTE_SHIFT_) == 0) {
 684                     return result;
 685                 }
 686             }
 687             return textlength;
 688         }
 689     }
 690     return textoffset;
 691 }
 692
 693 /**
 694 * Gets the next base character offset depending on the string search pattern
 695 * data
 696 * @param strsrch string search data
 697 * @param textoffset current offset, one offset away from the last character
 698 *                   to search for.
 699 * @return start index of the next base character or the current offset
 700 *         if the current character is contains a base character.
 701 */
 702 static
 703 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
 704                                                   int32_t    textoffset)
 705 {
 706     int32_t textlength = strsrch->search->textLength;
 707     if (strsrch->pattern.hasSuffixAccents &&
 708         textoffset < textlength) {
 709               int32_t  temp       = textoffset;
 710         const UChar       *text       = strsrch->search->text;
 711         U16_BACK_1(text, 0, temp);
 712         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
 713             return getNextBaseOffset(text, textoffset, textlength);
 714         }
 715     }
 716     return textoffset;
 717 }
 718
 719 /**
 720 * Shifting the collation element iterator position forward to prepare for
 721 * a following match. If the last character is a unsafe character, we'll only
 722 * shift by 1 to capture contractions, normalization etc.
 723 * Internal method, status assumed to be success.
 724 * @param text strsrch string search data
 725 * @param textoffset start text position to do search
 726 * @param ce the text ce which failed the match.
 727 * @param patternceindex index of the ce within the pattern ce buffer which
 728 *        failed the match
 729 * @return final offset
 730 */
 731 static
 732 inline int32_t shiftForward(UStringSearch *strsrch,
 733                                 int32_t    textoffset,
 734                                 int32_t       ce,
 735                                 int32_t        patternceindex)
 736 {
 737     UPattern *pattern = &(strsrch->pattern);
 738     if (ce != UCOL_NULLORDER) {
 739         int32_t shift = pattern->shift[hashFromCE32(ce)];
 740         // this is to adjust for characters in the middle of the
 741         // substring for matching that failed.
 742         int32_t adjust = pattern->cesLength - patternceindex;
 743         if (adjust > 1 && shift >= adjust) {
 744             shift -= adjust - 1;
 745         }
 746         textoffset += shift;
 747     }
 748     else {
 749         textoffset += pattern->defaultShiftSize;
 750     }
 751
 752     textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
 753     // check for unsafe characters
 754     // * if it is the start or middle of a contraction: to be done after
 755     //   a initial match is found
 756     // * thai or lao base consonant character: similar to contraction
 757     // * high surrogate character: similar to contraction
 758     // * next character is a accent: shift to the next base character
 759     return textoffset;
 760 }
 761 #endif // #if BOYER_MOORE
 762
 763 /**
 764 * sets match not found
 765 * @param strsrch string search data
 766 */
 767 static
 768 inline void setMatchNotFound(UStringSearch *strsrch)
 769 {
 770     // this method resets the match result regardless of the error status.
 771     strsrch->search->matchedIndex = USEARCH_DONE;
 772     strsrch->search->matchedLength = 0;
 773     if (strsrch->search->isForwardSearching) {
 774         setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
 775     }
 776     else {
 777         setColEIterOffset(strsrch->textIter, 0);
 778     }
 779 }
 780
 781 #if BOYER_MOORE
 782 /**
 783 * Gets the offset to the next safe point in text.
 784 * ie. not the middle of a contraction, swappable characters or supplementary
 785 * characters.
 786 * @param collator collation sata
 787 * @param text string to work with
 788 * @param textoffset offset in string
 789 * @param textlength length of text string
 790 * @return offset to the next safe character
 791 */
 792 static
 793 inline int32_t getNextSafeOffset(const UCollator   *collator,
 794                                      const UChar       *text,
 795                                            int32_t  textoffset,
 796                                            int32_t      textlength)
 797 {
 798     int32_t result = textoffset; // first contraction character
 799     while (result != textlength && ucol_unsafeCP(text[result], collator)) {
 800         result ++;
 801     }
 802     return result;
 803 }
 804
 805 /**
 806 * This checks for accents in the potential match started with a .
 807 * composite character.
 808 * This is really painful... we have to check that composite character do not
 809 * have any extra accents. We have to normalize the potential match and find
 810 * the immediate decomposed character before the match.
 811 * The first composite character would have been taken care of by the fcd
 812 * checks in checkForwardExactMatch.
 813 * This is the slow path after the fcd of the first character and
 814 * the last character has been checked by checkForwardExactMatch and we
 815 * determine that the potential match has extra non-ignorable preceding
 816 * ces.
 817 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
 818 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
 819 * Note here that accents checking are slow and cautioned in the API docs.
 820 * Internal method, status assumed to be a success, caller should check status
 821 * before calling this method
 822 * @param strsrch string search data
 823 * @param start index of the potential unfriendly composite character
 824 * @param end index of the potential unfriendly composite character
 825 * @param status output error status if any.
 826 * @return TRUE if there is non-ignorable accents before at the beginning
 827 *              of the match, FALSE otherwise.
 828 */
 829
 830 static
 831 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
 832                                    int32_t    end,
 833                                    UErrorCode    *status)
 834 {
 835     UBool result = FALSE;
 836     if (strsrch->pattern.hasPrefixAccents) {
 837               int32_t  length = end - start;
 838               int32_t  offset = 0;
 839         const UChar       *text   = strsrch->search->text + start;
 840
 841         U16_FWD_1(text, offset, length);
 842         // we are only concerned with the first composite character
 843         if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
 844             int32_t safeoffset = getNextSafeOffset(strsrch->collator,
 845                                                        text, 0, length);
 846             if (safeoffset != length) {
 847                 safeoffset ++;
 848             }
 849             UChar   *norm = NULL;
 850             UChar    buffer[INITIAL_ARRAY_SIZE_];
 851             int32_t  size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
 852                                             buffer, INITIAL_ARRAY_SIZE_,
 853                                             status);
 854             if (U_FAILURE(*status)) {
 855                 return FALSE;
 856             }
 857             if (size >= INITIAL_ARRAY_SIZE_) {
 858                 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
 859                                                status);
 860                 // if allocation failed, status will be set to
 861                 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
 862                 // checks for it.
 863                 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
 864                                        size, status);
 865                 if (U_FAILURE(*status) && norm != NULL) {
 866                     uprv_free(norm);
 867                     return FALSE;
 868                 }
 869             }
 870             else {
 871                 norm = buffer;
 872             }
 873
 874             UCollationElements *coleiter  = strsrch->utilIter;
 875             ucol_setText(coleiter, norm, size, status);
 876             uint32_t            firstce   = strsrch->pattern.ces[0];
 877             UBool               ignorable = TRUE;
 878             uint32_t            ce        = UCOL_IGNORABLE;
 879             while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
 880                 offset = ucol_getOffset(coleiter);
 881                 if (ce != firstce && ce != UCOL_IGNORABLE) {
 882                     ignorable = FALSE;
 883                 }
 884                 ce = ucol_next(coleiter, status);
 885             }
 886             UChar32 codepoint;
 887             U16_PREV(norm, 0, offset, codepoint);
 888             result = !ignorable && (u_getCombiningClass(codepoint) != 0);
 889
 890             if (norm != buffer) {
 891                 uprv_free(norm);
 892             }
 893         }
 894     }
 895
 896     return result;
 897 }
 898
 899 /**
 900 * Used by exact matches, checks if there are accents before the match.
 901 * This is really painful... we have to check that composite characters at
 902 * the start of the matches have to not have any extra accents.
 903 * We check the FCD of the character first, if it starts with an accent and
 904 * the first pattern ce does not match the first ce of the character, we bail.
 905 * Otherwise we try normalizing the first composite
 906 * character and find the immediate decomposed character before the match to
 907 * see if it is an non-ignorable accent.
 908 * Now normalizing the first composite character is enough because we ensure
 909 * that when the match is passed in here with extra beginning ces, the
 910 * first or last ce that match has to occur within the first character.
 911 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
 912 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
 913 * Note here that accents checking are slow and cautioned in the API docs.
 914 * @param strsrch string search data
 915 * @param start offset
 916 * @param end offset
 917 * @return TRUE if there are accents on either side of the match,
 918 *         FALSE otherwise
 919 */
 920 static
 921 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
 922                                   int32_t    end)
 923 {
 924     if (strsrch->pattern.hasPrefixAccents) {
 925         UCollationElements *coleiter  = strsrch->textIter;
 926         UErrorCode          status    = U_ZERO_ERROR;
 927         // we have been iterating forwards previously
 928         uint32_t            ignorable = TRUE;
 929         int32_t             firstce   = strsrch->pattern.ces[0];
 930
 931         setColEIterOffset(coleiter, start);
 932         int32_t ce  = getCE(strsrch, ucol_next(coleiter, &status));
 933         if (U_FAILURE(status)) {
 934             return TRUE;
 935         }
 936         while (ce != firstce) {
 937             if (ce != UCOL_IGNORABLE) {
 938                 ignorable = FALSE;
 939             }
 940             ce = getCE(strsrch, ucol_next(coleiter, &status));
 941             if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
 942                 return TRUE;
 943             }
 944         }
 945         if (!ignorable && inNormBuf(coleiter)) {
 946             // within normalization buffer, discontiguous handled here
 947             return TRUE;
 948         }
 949
 950         // within text
 951         int32_t temp = start;
 952         // original code
 953         // accent = (getFCD(strsrch->search->text, &temp,
 954         //                  strsrch->search->textLength)
 955         //            >> SECOND_LAST_BYTE_SHIFT_);
 956         // however this code does not work well with VC7 .net in release mode.
 957         // maybe the inlines for getFCD combined with shifting has bugs in
 958         // VC7. anyways this is a work around.
 959         UBool accent = getFCD(strsrch->search->text, &temp,
 960                               strsrch->search->textLength) > 0xFF;
 961         if (!accent) {
 962             return checkExtraMatchAccents(strsrch, start, end, &status);
 963         }
 964         if (!ignorable) {
 965             return TRUE;
 966         }
 967         if (start > 0) {
 968             temp = start;
 969             U16_BACK_1(strsrch->search->text, 0, temp);
 970             if (getFCD(strsrch->search->text, &temp,
 971                        strsrch->search->textLength) & LAST_BYTE_MASK_) {
 972                 setColEIterOffset(coleiter, start);
 973                 ce = ucol_previous(coleiter, &status);
 974                 if (U_FAILURE(status) ||
 975                     (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
 976                     return TRUE;
 977                 }
 978             }
 979         }
 980     }
 981
 982     return FALSE;
 983 }
 984
 985 /**
 986 * Used by exact matches, checks if there are accents bounding the match.
 987 * Note this is the initial boundary check. If the potential match
 988 * starts or ends with composite characters, the accents in those
 989 * characters will be determined later.
 990 * Not doing backwards iteration here, since discontiguos contraction for
 991 * backwards collation element iterator, use up too many characters.
 992 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
 993 * should fail since there is a acute at the end of \u01FA
 994 * Note here that accents checking are slow and cautioned in the API docs.
 995 * @param strsrch string search data
 996 * @param start offset of match
 997 * @param end end offset of the match
 998 * @return TRUE if there are accents on either side of the match,
 999 *         FALSE otherwise
1000 */
1001 static
1002 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
1003                                  int32_t    end)
1004 {
1005     if (strsrch->pattern.hasSuffixAccents) {
1006         const UChar       *text       = strsrch->search->text;
1007               int32_t  temp       = end;
1008               int32_t      textlength = strsrch->search->textLength;
1009         U16_BACK_1(text, 0, temp);
1010         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
1011             int32_t             firstce  = strsrch->pattern.ces[0];
1012             UCollationElements *coleiter = strsrch->textIter;
1013             UErrorCode          status   = U_ZERO_ERROR;
1014             int32_t ce;
1015             setColEIterOffset(coleiter, start);
1016             while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
1017                 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
1018                     return TRUE;
1019                 }
1020             }
1021             int32_t count = 1;
1022             while (count < strsrch->pattern.cesLength) {
1023                 if (getCE(strsrch, ucol_next(coleiter, &status))
1024                     == UCOL_IGNORABLE) {
1025                     // Thai can give an ignorable here.
1026                     count --;
1027                 }
1028                 if (U_FAILURE(status)) {
1029                     return TRUE;
1030                 }
1031                 count ++;
1032             }
1033
1034             ce = ucol_next(coleiter, &status);
1035             if (U_FAILURE(status)) {
1036                 return TRUE;
1037             }
1038             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1039                 ce = getCE(strsrch, ce);
1040             }
1041             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1042                 if (ucol_getOffset(coleiter) <= end) {
1043                     return TRUE;
1044                 }
1045                 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
1046                     return TRUE;
1047                 }
1048             }
1049         }
1050     }
1051     return FALSE;
1052 }
1053 #endif // #if BOYER_MOORE
1054
1055 /**
1056 * Checks if the offset runs out of the text string
1057 * @param offset
1058 * @param textlength of the text string
1059 * @return TRUE if offset is out of bounds, FALSE otherwise
1060 */
1061 static
1062 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
1063 {
1064     return offset < 0 || offset > textlength;
1065 }
1066
1067 /**
1068 * Checks for identical match
1069 * @param strsrch string search data
1070 * @param start offset of possible match
1071 * @param end offset of possible match
1072 * @return TRUE if identical match is found
1073 */
1074 static
1075 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
1076                                   int32_t    end)
1077 {
1078     if (strsrch->strength != UCOL_IDENTICAL) {
1079         return TRUE;
1080     }
1081
1082     // Note: We could use Normalizer::compare() or similar, but for short strings
1083     // which may not be in FCD it might be faster to just NFD them.
1084     UErrorCode status = U_ZERO_ERROR;
1085     UnicodeString t2, p2;
1086     strsrch->nfd->normalize(
1087         UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
1088     strsrch->nfd->normalize(
1089         UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
1090     // return FALSE if NFD failed
1091     return U_SUCCESS(status) && t2 == p2;
1092 }
1093
1094 #if BOYER_MOORE
1095 /**
1096 * Checks to see if the match is repeated
1097 * @param strsrch string search data
1098 * @param start new match start index
1099 * @param end new match end index
1100 * @return TRUE if the the match is repeated, FALSE otherwise
1101 */
1102 static
1103 inline UBool checkRepeatedMatch(UStringSearch *strsrch,
1104                                 int32_t    start,
1105                                 int32_t    end)
1106 {
1107     int32_t lastmatchindex = strsrch->search->matchedIndex;
1108     UBool       result;
1109     if (lastmatchindex == USEARCH_DONE) {
1110         return FALSE;
1111     }
1112     if (strsrch->search->isForwardSearching) {
1113         result = start <= lastmatchindex;
1114     }
1115     else {
1116         result = start >= lastmatchindex;
1117     }
1118     if (!result && !strsrch->search->isOverlap) {
1119         if (strsrch->search->isForwardSearching) {
1120             result = start < lastmatchindex + strsrch->search->matchedLength;
1121         }
1122         else {
1123             result = end > lastmatchindex;
1124         }
1125     }
1126     return result;
1127 }
1128
1129 /**
1130 * Gets the collation element iterator's current offset.
1131 * @param coleiter collation element iterator
1132 * @param forwards flag TRUE if we are moving in th forwards direction
1133 * @return current offset
1134 */
1135 static
1136 inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
1137                                               UBool               forwards)
1138 {
1139     int32_t result = ucol_getOffset(coleiter);
1140     // intricacies of the the backwards collation element iterator
1141     if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
1142         result ++;
1143     }
1144     return result;
1145 }
1146
1147 /**
1148 * Checks match for contraction.
1149 * If the match ends with a partial contraction we fail.
1150 * If the match starts too far off (because of backwards iteration) we try to
1151 * chip off the extra characters depending on whether a breakiterator has
1152 * been used.
1153 * Internal method, error assumed to be success, caller has to check status
1154 * before calling this method.
1155 * @param strsrch string search data
1156 * @param start offset of potential match, to be modified if necessary
1157 * @param end offset of potential match, to be modified if necessary
1158 * @param status output error status if any
1159 * @return TRUE if match passes the contraction test, FALSE otherwise
1160 */
1161
1162 static
1163 UBool checkNextExactContractionMatch(UStringSearch *strsrch,
1164                                      int32_t   *start,
1165                                      int32_t   *end, UErrorCode  *status)
1166 {
1167           UCollationElements *coleiter   = strsrch->textIter;
1168           int32_t             textlength = strsrch->search->textLength;
1169           int32_t             temp       = *start;
1170     const UCollator          *collator   = strsrch->collator;
1171     const UChar              *text       = strsrch->search->text;
1172     // This part checks if either ends of the match contains potential
1173     // contraction. If so we'll have to iterate through them
1174     // The start contraction needs to be checked since ucol_previous dumps
1175     // all characters till the first safe character into the buffer.
1176     // *start + 1 is used to test for the unsafe characters instead of *start
1177     // because ucol_prev takes all unsafe characters till the first safe
1178     // character ie *start. so by testing *start + 1, we can estimate if
1179     // excess prefix characters has been included in the potential search
1180     // results.
1181     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1182         (*start + 1 < textlength
1183          && ucol_unsafeCP(text[*start + 1], collator))) {
1184         int32_t expansion  = getExpansionPrefix(coleiter);
1185         UBool   expandflag = expansion > 0;
1186         setColEIterOffset(coleiter, *start);
1187         while (expansion > 0) {
1188             // getting rid of the redundant ce, caused by setOffset.
1189             // since backward contraction/expansion may have extra ces if we
1190             // are in the normalization buffer, hasAccentsBeforeMatch would
1191             // have taken care of it.
1192             // E.g. the character \u01FA will have an expansion of 3, but if
1193             // we are only looking for acute and ring \u030A and \u0301, we'll
1194             // have to skip the first ce in the expansion buffer.
1195             ucol_next(coleiter, status);
1196             if (U_FAILURE(*status)) {
1197                 return FALSE;
1198             }
1199             if (ucol_getOffset(coleiter) != temp) {
1200                 *start = temp;
1201                 temp  = ucol_getOffset(coleiter);
1202             }
1203             expansion --;
1204         }
1205
1206         int32_t  *patternce       = strsrch->pattern.ces;
1207         int32_t   patterncelength = strsrch->pattern.cesLength;
1208         int32_t   count           = 0;
1209         while (count < patterncelength) {
1210             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1211             if (ce == UCOL_IGNORABLE) {
1212                 continue;
1213             }
1214             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1215                 *start = temp;
1216                 temp   = ucol_getOffset(coleiter);
1217             }
1218             if (U_FAILURE(*status) || ce != patternce[count]) {
1219                 (*end) ++;
1220                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1221                 return FALSE;
1222             }
1223             count ++;
1224         }
1225     }
1226     return TRUE;
1227 }
1228
1229 /**
1230 * Checks and sets the match information if found.
1231 * Checks
1232 * <ul>
1233 * <li> the potential match does not repeat the previous match
1234 * <li> boundaries are correct
1235 * <li> exact matches has no extra accents
1236 * <li> identical matchesb
1237 * <li> potential match does not end in the middle of a contraction
1238 * <\ul>
1239 * Otherwise the offset will be shifted to the next character.
1240 * Internal method, status assumed to be success, caller has to check status
1241 * before calling this method.
1242 * @param strsrch string search data
1243 * @param textoffset offset in the collation element text. the returned value
1244 *        will be the truncated end offset of the match or the new start
1245 *        search offset.
1246 * @param status output error status if any
1247 * @return TRUE if the match is valid, FALSE otherwise
1248 */
1249 static
1250 inline UBool checkNextExactMatch(UStringSearch *strsrch,
1251                                  int32_t   *textoffset, UErrorCode *status)
1252 {
1253     UCollationElements *coleiter = strsrch->textIter;
1254     int32_t         start    = getColElemIterOffset(coleiter, FALSE);
1255
1256     if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
1257         return FALSE;
1258     }
1259
1260     // this totally matches, however we need to check if it is repeating
1261     if (!isBreakUnit(strsrch, start, *textoffset) ||
1262         checkRepeatedMatch(strsrch, start, *textoffset) ||
1263         hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
1264         !checkIdentical(strsrch, start, *textoffset) ||
1265         hasAccentsAfterMatch(strsrch, start, *textoffset)) {
1266
1267         (*textoffset) ++;
1268         *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
1269         return FALSE;
1270     }
1271
1272     //Add breakiterator boundary check for primary strength search.
1273     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
1274         checkBreakBoundary(strsrch, &start, textoffset);
1275     }
1276
1277     // totally match, we will get rid of the ending ignorables.
1278     strsrch->search->matchedIndex  = start;
1279     strsrch->search->matchedLength = *textoffset - start;
1280     return TRUE;
1281 }
1282
1283 /**
1284 * Getting the previous base character offset, or the current offset if the
1285 * current character is a base character
1286 * @param text string
1287 * @param textoffset one offset after the current character
1288 * @return the offset of the next character after the base character or the first
1289 *         composed character with accents
1290 */
1291 static
1292 inline int32_t getPreviousBaseOffset(const UChar       *text,
1293                                                int32_t  textoffset)
1294 {
1295     if (textoffset > 0) {
1296         for (;;) {
1297             int32_t result = textoffset;
1298             U16_BACK_1(text, 0, textoffset);
1299             int32_t temp = textoffset;
1300             uint16_t fcd = getFCD(text, &temp, result);
1301             if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1302                 if (fcd & LAST_BYTE_MASK_) {
1303                     return textoffset;
1304                 }
1305                 return result;
1306             }
1307             if (textoffset == 0) {
1308                 return 0;
1309             }
1310         }
1311     }
1312     return textoffset;
1313 }
1314
1315 /**
1316 * Getting the indexes of the accents that are not blocked in the argument
1317 * accent array
1318 * @param accents array of accents in nfd terminated by a 0.
1319 * @param accentsindex array of indexes of the accents that are not blocked
1320 */
1321 static
1322 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
1323 {
1324     int32_t index     = 0;
1325     int32_t     length    = u_strlen(accents);
1326     UChar32     codepoint = 0;
1327     int         cclass    = 0;
1328     int         result    = 0;
1329     int32_t temp;
1330     while (index < length) {
1331         temp = index;
1332         U16_NEXT(accents, index, length, codepoint);
1333         if (u_getCombiningClass(codepoint) != cclass) {
1334             cclass        = u_getCombiningClass(codepoint);
1335             accentsindex[result] = temp;
1336             result ++;
1337         }
1338     }
1339     accentsindex[result] = length;
1340     return result;
1341 }
1342
1343 /**
1344 * Appends 3 UChar arrays to a destination array.
1345 * Creates a new array if we run out of space. The caller will have to
1346 * manually deallocate the newly allocated array.
1347 * Internal method, status assumed to be success, caller has to check status
1348 * before calling this method. destination not to be NULL and has at least
1349 * size destinationlength.
1350 * @param destination target array
1351 * @param destinationlength target array size, returning the appended length
1352 * @param source1 null-terminated first array
1353 * @param source2 second array
1354 * @param source2length length of seond array
1355 * @param source3 null-terminated third array
1356 * @param status error status if any
1357 * @return new destination array, destination if there was no new allocation
1358 */
1359 static
1360 inline UChar * addToUCharArray(      UChar      *destination,
1361                                      int32_t    *destinationlength,
1362                                const UChar      *source1,
1363                                const UChar      *source2,
1364                                      int32_t     source2length,
1365                                const UChar      *source3,
1366                                      UErrorCode *status)
1367 {
1368     int32_t source1length = source1 ? u_strlen(source1) : 0;
1369     int32_t source3length = source3 ? u_strlen(source3) : 0;
1370     if (*destinationlength < source1length + source2length + source3length +
1371                                                                            1)
1372     {
1373         destination = (UChar *)allocateMemory(
1374           (source1length + source2length + source3length + 1) * sizeof(UChar),
1375           status);
1376         // if error allocating memory, status will be
1377         // U_MEMORY_ALLOCATION_ERROR
1378         if (U_FAILURE(*status)) {
1379             *destinationlength = 0;
1380             return NULL;
1381         }
1382     }
1383     if (source1length != 0) {
1384         u_memcpy(destination, source1, source1length);
1385     }
1386     if (source2length != 0) {
1387         uprv_memcpy(destination + source1length, source2,
1388                     sizeof(UChar) * source2length);
1389     }
1390     if (source3length != 0) {
1391         uprv_memcpy(destination + source1length + source2length, source3,
1392                     sizeof(UChar) * source3length);
1393     }
1394     *destinationlength = source1length + source2length + source3length;
1395     return destination;
1396 }
1397
1398 /**
1399 * Running through a collation element iterator to see if the contents matches
1400 * pattern in string search data
1401 * @param strsrch string search data
1402 * @param coleiter collation element iterator
1403 * @return TRUE if a match if found, FALSE otherwise
1404 */
1405 static
1406 inline UBool checkCollationMatch(const UStringSearch      *strsrch,
1407                                        UCollationElements *coleiter)
1408 {
1409     int         patternceindex = strsrch->pattern.cesLength;
1410     int32_t    *patternce      = strsrch->pattern.ces;
1411     UErrorCode  status = U_ZERO_ERROR;
1412     while (patternceindex > 0) {
1413         int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
1414         if (ce == UCOL_IGNORABLE) {
1415             continue;
1416         }
1417         if (U_FAILURE(status) || ce != *patternce) {
1418             return FALSE;
1419         }
1420         patternce ++;
1421         patternceindex --;
1422     }
1423     return TRUE;
1424 }
1425
1426 /**
1427 * Rearranges the front accents to try matching.
1428 * Prefix accents in the text will be grouped according to their combining
1429 * class and the groups will be mixed and matched to try find the perfect
1430 * match with the pattern.
1431 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1432 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1433 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1434 *         "\u0301\u0325".
1435 * step 2: check if any of the generated substrings matches the pattern.
1436 * Internal method, status is assumed to be success, caller has to check status
1437 * before calling this method.
1438 * @param strsrch string search match
1439 * @param start first offset of the accents to start searching
1440 * @param end start of the last accent set
1441 * @param status output error status if any
1442 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1443 *         offset of the match. Note this start includes all preceding accents.
1444 */
1445 static
1446 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
1447                                        int32_t    start,
1448                                        int32_t    end,
1449                                        UErrorCode    *status)
1450 {
1451     const UChar       *text       = strsrch->search->text;
1452           int32_t      textlength = strsrch->search->textLength;
1453           int32_t  tempstart  = start;
1454
1455     if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
1456         // die... failed at a base character
1457         return USEARCH_DONE;
1458     }
1459
1460     int32_t offset = getNextBaseOffset(text, tempstart, textlength);
1461     start = getPreviousBaseOffset(text, tempstart);
1462
1463     UChar       accents[INITIAL_ARRAY_SIZE_];
1464     // normalizing the offensive string
1465     unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
1466                     INITIAL_ARRAY_SIZE_, status);
1467     if (U_FAILURE(*status)) {
1468         return USEARCH_DONE;
1469     }
1470
1471     int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
1472     int32_t         accentsize = getUnblockedAccentIndex(accents,
1473                                                                  accentsindex);
1474     int32_t         count      = (2 << (accentsize - 1)) - 1;
1475     UChar               buffer[INITIAL_ARRAY_SIZE_];
1476     UCollationElements *coleiter   = strsrch->utilIter;
1477     while (U_SUCCESS(*status) && count > 0) {
1478         UChar *rearrange = strsrch->canonicalPrefixAccents;
1479         // copy the base characters
1480         for (int k = 0; k < accentsindex[0]; k ++) {
1481             *rearrange ++ = accents[k];
1482         }
1483         // forming all possible canonical rearrangement by dropping
1484         // sets of accents
1485         for (int i = 0; i <= accentsize - 1; i ++) {
1486             int32_t mask = 1 << (accentsize - i - 1);
1487             if (count & mask) {
1488                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1489                     *rearrange ++ = accents[j];
1490                 }
1491             }
1492         }
1493         *rearrange = 0;
1494         int32_t  matchsize = INITIAL_ARRAY_SIZE_;
1495         UChar   *match     = addToUCharArray(buffer, &matchsize,
1496                                            strsrch->canonicalPrefixAccents,
1497                                            strsrch->search->text + offset,
1498                                            end - offset,
1499                                            strsrch->canonicalSuffixAccents,
1500                                            status);
1501
1502         // if status is a failure, ucol_setText does nothing.
1503         // run the collator iterator through this match
1504         ucol_setText(coleiter, match, matchsize, status);
1505         if (U_SUCCESS(*status)) {
1506             if (checkCollationMatch(strsrch, coleiter)) {
1507                 if (match != buffer) {
1508                     uprv_free(match);
1509                 }
1510                 return start;
1511             }
1512         }
1513         count --;
1514     }
1515     return USEARCH_DONE;
1516 }
1517
1518 /**
1519 * Gets the offset to the safe point in text before textoffset.
1520 * ie. not the middle of a contraction, swappable characters or supplementary
1521 * characters.
1522 * @param collator collation sata
1523 * @param text string to work with
1524 * @param textoffset offset in string
1525 * @param textlength length of text string
1526 * @return offset to the previous safe character
1527 */
1528 static
1529 inline uint32_t getPreviousSafeOffset(const UCollator   *collator,
1530                                       const UChar       *text,
1531                                             int32_t  textoffset)
1532 {
1533     int32_t result = textoffset; // first contraction character
1534     while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
1535         result --;
1536     }
1537     if (result != 0) {
1538         // the first contraction character is consider unsafe here
1539         result --;
1540     }
1541     return result;
1542 }
1543
1544 /**
1545 * Cleaning up after we passed the safe zone
1546 * @param strsrch string search data
1547 * @param safetext safe text array
1548 * @param safebuffer safe text buffer
1549 * @param coleiter collation element iterator for safe text
1550 */
1551 static
1552 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
1553                                   UChar         *safebuffer)
1554 {
1555     if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
1556     {
1557        uprv_free(safetext);
1558     }
1559 }
1560
1561 /**
1562 * Take the rearranged end accents and tries matching. If match failed at
1563 * a seperate preceding set of accents (seperated from the rearranged on by
1564 * at least a base character) then we rearrange the preceding accents and
1565 * tries matching again.
1566 * We allow skipping of the ends of the accent set if the ces do not match.
1567 * However if the failure is found before the accent set, it fails.
1568 * Internal method, status assumed to be success, caller has to check status
1569 * before calling this method.
1570 * @param strsrch string search data
1571 * @param textoffset of the start of the rearranged accent
1572 * @param status output error status if any
1573 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1574 *         offset of the match. Note this start includes all preceding accents.
1575 */
1576 static
1577 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
1578                                        int32_t    textoffset,
1579                                        UErrorCode    *status)
1580 {
1581     const UChar              *text           = strsrch->search->text;
1582     const UCollator          *collator       = strsrch->collator;
1583           int32_t             safelength     = 0;
1584           UChar              *safetext;
1585           int32_t             safetextlength;
1586           UChar               safebuffer[INITIAL_ARRAY_SIZE_];
1587           UCollationElements *coleiter       = strsrch->utilIter;
1588           int32_t         safeoffset     = textoffset;
1589
1590     if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
1591                                          collator)) {
1592         safeoffset     = getPreviousSafeOffset(collator, text, textoffset);
1593         safelength     = textoffset - safeoffset;
1594         safetextlength = INITIAL_ARRAY_SIZE_;
1595         safetext       = addToUCharArray(safebuffer, &safetextlength, NULL,
1596                                          text + safeoffset, safelength,
1597                                          strsrch->canonicalSuffixAccents,
1598                                          status);
1599     }
1600     else {
1601         safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
1602         safetext       = strsrch->canonicalSuffixAccents;
1603     }
1604
1605     // if status is a failure, ucol_setText does nothing
1606     ucol_setText(coleiter, safetext, safetextlength, status);
1607     // status checked in loop below
1608
1609     int32_t  *ce        = strsrch->pattern.ces;
1610     int32_t   celength  = strsrch->pattern.cesLength;
1611     int       ceindex   = celength - 1;
1612     UBool     isSafe    = TRUE; // indication flag for position in safe zone
1613
1614     while (ceindex >= 0) {
1615         int32_t textce = ucol_previous(coleiter, status);
1616         if (U_FAILURE(*status)) {
1617             if (isSafe) {
1618                 cleanUpSafeText(strsrch, safetext, safebuffer);
1619             }
1620             return USEARCH_DONE;
1621         }
1622         if (textce == UCOL_NULLORDER) {
1623             // check if we have passed the safe buffer
1624             if (coleiter == strsrch->textIter) {
1625                 cleanUpSafeText(strsrch, safetext, safebuffer);
1626                 return USEARCH_DONE;
1627             }
1628             cleanUpSafeText(strsrch, safetext, safebuffer);
1629             safetext = safebuffer;
1630             coleiter = strsrch->textIter;
1631             setColEIterOffset(coleiter, safeoffset);
1632             // status checked at the start of the loop
1633             isSafe = FALSE;
1634             continue;
1635         }
1636         textce = getCE(strsrch, textce);
1637         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
1638             // do the beginning stuff
1639             int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
1640             if (isSafe && failedoffset >= safelength) {
1641                 // alas... no hope. failed at rearranged accent set
1642                 cleanUpSafeText(strsrch, safetext, safebuffer);
1643                 return USEARCH_DONE;
1644             }
1645             else {
1646                 if (isSafe) {
1647                     failedoffset += safeoffset;
1648                     cleanUpSafeText(strsrch, safetext, safebuffer);
1649                 }
1650
1651                 // try rearranging the front accents
1652                 int32_t result = doNextCanonicalPrefixMatch(strsrch,
1653                                         failedoffset, textoffset, status);
1654                 if (result != USEARCH_DONE) {
1655                     // if status is a failure, ucol_setOffset does nothing
1656                     setColEIterOffset(strsrch->textIter, result);
1657                 }
1658                 if (U_FAILURE(*status)) {
1659                     return USEARCH_DONE;
1660                 }
1661                 return result;
1662             }
1663         }
1664         if (textce == ce[ceindex]) {
1665             ceindex --;
1666         }
1667     }
1668     // set offset here
1669     if (isSafe) {
1670         int32_t result     = getColElemIterOffset(coleiter, FALSE);
1671         // sets the text iterator here with the correct expansion and offset
1672         int32_t    leftoverces = getExpansionPrefix(coleiter);
1673         cleanUpSafeText(strsrch, safetext, safebuffer);
1674         if (result >= safelength) {
1675             result = textoffset;
1676         }
1677         else {
1678             result += safeoffset;
1679         }
1680         setColEIterOffset(strsrch->textIter, result);
1681         strsrch->textIter->iteratordata_.toReturn =
1682                        setExpansionPrefix(strsrch->textIter, leftoverces);
1683         return result;
1684     }
1685
1686     return ucol_getOffset(coleiter);
1687 }
1688
1689 /**
1690 * Trying out the substring and sees if it can be a canonical match.
1691 * This will try normalizing the end accents and arranging them into canonical
1692 * equivalents and check their corresponding ces with the pattern ce.
1693 * Suffix accents in the text will be grouped according to their combining
1694 * class and the groups will be mixed and matched to try find the perfect
1695 * match with the pattern.
1696 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1697 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1698 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1699 *         "\u0301\u0325".
1700 * step 2: check if any of the generated substrings matches the pattern.
1701 * Internal method, status assumed to be success, caller has to check status
1702 * before calling this method.
1703 * @param strsrch string search data
1704 * @param textoffset end offset in the collation element text that ends with
1705 *                   the accents to be rearranged
1706 * @param status error status if any
1707 * @return TRUE if the match is valid, FALSE otherwise
1708 */
1709 static
1710 UBool doNextCanonicalMatch(UStringSearch *strsrch,
1711                            int32_t    textoffset,
1712                            UErrorCode    *status)
1713 {
1714     const UChar       *text = strsrch->search->text;
1715           int32_t  temp = textoffset;
1716     U16_BACK_1(text, 0, temp);
1717     if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
1718         UCollationElements *coleiter = strsrch->textIter;
1719         int32_t         offset   = getColElemIterOffset(coleiter, FALSE);
1720         if (strsrch->pattern.hasPrefixAccents) {
1721             offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
1722                                                 status);
1723             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
1724                 setColEIterOffset(coleiter, offset);
1725                 return TRUE;
1726             }
1727         }
1728         return FALSE;
1729     }
1730
1731     if (!strsrch->pattern.hasSuffixAccents) {
1732         return FALSE;
1733     }
1734
1735     UChar       accents[INITIAL_ARRAY_SIZE_];
1736     // offset to the last base character in substring to search
1737     int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
1738     // normalizing the offensive string
1739     unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
1740                                0, accents, INITIAL_ARRAY_SIZE_, status);
1741     // status checked in loop below
1742
1743     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1744     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
1745
1746     // 2 power n - 1 plus the full set of accents
1747     int32_t  count = (2 << (size - 1)) - 1;
1748     while (U_SUCCESS(*status) && count > 0) {
1749         UChar *rearrange = strsrch->canonicalSuffixAccents;
1750         // copy the base characters
1751         for (int k = 0; k < accentsindex[0]; k ++) {
1752             *rearrange ++ = accents[k];
1753         }
1754         // forming all possible canonical rearrangement by dropping
1755         // sets of accents
1756         for (int i = 0; i <= size - 1; i ++) {
1757             int32_t mask = 1 << (size - i - 1);
1758             if (count & mask) {
1759                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1760                     *rearrange ++ = accents[j];
1761                 }
1762             }
1763         }
1764         *rearrange = 0;
1765         int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
1766                                                         status);
1767         if (offset != USEARCH_DONE) {
1768             return TRUE; // match found
1769         }
1770         count --;
1771     }
1772     return FALSE;
1773 }
1774
1775 /**
1776 * Gets the previous base character offset depending on the string search
1777 * pattern data
1778 * @param strsrch string search data
1779 * @param textoffset current offset, current character
1780 * @return the offset of the next character after this base character or itself
1781 *         if it is a composed character with accents
1782 */
1783 static
1784 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
1785                                                       int32_t textoffset)
1786 {
1787     if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
1788         const UChar       *text = strsrch->search->text;
1789               int32_t  offset = textoffset;
1790         if (getFCD(text, &offset, strsrch->search->textLength) >>
1791                                                    SECOND_LAST_BYTE_SHIFT_) {
1792             return getPreviousBaseOffset(text, textoffset);
1793         }
1794     }
1795     return textoffset;
1796 }
1797
1798 /**
1799 * Checks match for contraction.
1800 * If the match ends with a partial contraction we fail.
1801 * If the match starts too far off (because of backwards iteration) we try to
1802 * chip off the extra characters
1803 * Internal method, status assumed to be success, caller has to check status
1804 * before calling this method.
1805 * @param strsrch string search data
1806 * @param start offset of potential match, to be modified if necessary
1807 * @param end offset of potential match, to be modified if necessary
1808 * @param status output error status if any
1809 * @return TRUE if match passes the contraction test, FALSE otherwise
1810 */
1811 static
1812 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
1813                                          int32_t   *start,
1814                                          int32_t   *end,
1815                                          UErrorCode    *status)
1816 {
1817           UCollationElements *coleiter   = strsrch->textIter;
1818           int32_t             textlength = strsrch->search->textLength;
1819           int32_t         temp       = *start;
1820     const UCollator          *collator   = strsrch->collator;
1821     const UChar              *text       = strsrch->search->text;
1822     // This part checks if either ends of the match contains potential
1823     // contraction. If so we'll have to iterate through them
1824     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1825         (*start + 1 < textlength
1826          && ucol_unsafeCP(text[*start + 1], collator))) {
1827         int32_t expansion  = getExpansionPrefix(coleiter);
1828         UBool   expandflag = expansion > 0;
1829         setColEIterOffset(coleiter, *start);
1830         while (expansion > 0) {
1831             // getting rid of the redundant ce, caused by setOffset.
1832             // since backward contraction/expansion may have extra ces if we
1833             // are in the normalization buffer, hasAccentsBeforeMatch would
1834             // have taken care of it.
1835             // E.g. the character \u01FA will have an expansion of 3, but if
1836             // we are only looking for acute and ring \u030A and \u0301, we'll
1837             // have to skip the first ce in the expansion buffer.
1838             ucol_next(coleiter, status);
1839             if (U_FAILURE(*status)) {
1840                 return FALSE;
1841             }
1842             if (ucol_getOffset(coleiter) != temp) {
1843                 *start = temp;
1844                 temp  = ucol_getOffset(coleiter);
1845             }
1846             expansion --;
1847         }
1848
1849         int32_t  *patternce       = strsrch->pattern.ces;
1850         int32_t   patterncelength = strsrch->pattern.cesLength;
1851         int32_t   count           = 0;
1852         int32_t   textlength      = strsrch->search->textLength;
1853         while (count < patterncelength) {
1854             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1855             // status checked below, note that if status is a failure
1856             // ucol_next returns UCOL_NULLORDER
1857             if (ce == UCOL_IGNORABLE) {
1858                 continue;
1859             }
1860             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1861                 *start = temp;
1862                 temp   = ucol_getOffset(coleiter);
1863             }
1864
1865             if (count == 0 && ce != patternce[0]) {
1866                 // accents may have extra starting ces, this occurs when a
1867                 // pure accent pattern is matched without rearrangement
1868                 // text \u0325\u0300 and looking for \u0300
1869                 int32_t expected = patternce[0];
1870                 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
1871                     ce = getCE(strsrch, ucol_next(coleiter, status));
1872                     while (U_SUCCESS(*status) && ce != expected &&
1873                            ce != UCOL_NULLORDER &&
1874                            ucol_getOffset(coleiter) <= *end) {
1875                         ce = getCE(strsrch, ucol_next(coleiter, status));
1876                     }
1877                 }
1878             }
1879             if (U_FAILURE(*status) || ce != patternce[count]) {
1880                 (*end) ++;
1881                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1882                 return FALSE;
1883             }
1884             count ++;
1885         }
1886     }
1887     return TRUE;
1888 }
1889
1890 /**
1891 * Checks and sets the match information if found.
1892 * Checks
1893 * <ul>
1894 * <li> the potential match does not repeat the previous match
1895 * <li> boundaries are correct
1896 * <li> potential match does not end in the middle of a contraction
1897 * <li> identical matches
1898 * <\ul>
1899 * Otherwise the offset will be shifted to the next character.
1900 * Internal method, status assumed to be success, caller has to check the
1901 * status before calling this method.
1902 * @param strsrch string search data
1903 * @param textoffset offset in the collation element text. the returned value
1904 *        will be the truncated end offset of the match or the new start
1905 *        search offset.
1906 * @param status output error status if any
1907 * @return TRUE if the match is valid, FALSE otherwise
1908 */
1909 static
1910 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
1911                                      int32_t   *textoffset,
1912                                      UErrorCode    *status)
1913 {
1914     // to ensure that the start and ends are not composite characters
1915     UCollationElements *coleiter = strsrch->textIter;
1916     // if we have a canonical accent match
1917     if ((strsrch->pattern.hasSuffixAccents &&
1918         strsrch->canonicalSuffixAccents[0]) ||
1919         (strsrch->pattern.hasPrefixAccents &&
1920         strsrch->canonicalPrefixAccents[0])) {
1921         strsrch->search->matchedIndex  = getPreviousUStringSearchBaseOffset(
1922                                                     strsrch,
1923                                                     ucol_getOffset(coleiter));
1924         strsrch->search->matchedLength = *textoffset -
1925                                                 strsrch->search->matchedIndex;
1926         return TRUE;
1927     }
1928
1929     int32_t start = getColElemIterOffset(coleiter, FALSE);
1930     if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
1931                                             status) || U_FAILURE(*status)) {
1932         return FALSE;
1933     }
1934
1935     start = getPreviousUStringSearchBaseOffset(strsrch, start);
1936     // this totally matches, however we need to check if it is repeating
1937     if (checkRepeatedMatch(strsrch, start, *textoffset) ||
1938         !isBreakUnit(strsrch, start, *textoffset) ||
1939         !checkIdentical(strsrch, start, *textoffset)) {
1940         (*textoffset) ++;
1941         *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
1942                                         strsrch->search->textLength);
1943         return FALSE;
1944     }
1945
1946     strsrch->search->matchedIndex  = start;
1947     strsrch->search->matchedLength = *textoffset - start;
1948     return TRUE;
1949 }
1950
1951 /**
1952 * Shifting the collation element iterator position forward to prepare for
1953 * a preceding match. If the first character is a unsafe character, we'll only
1954 * shift by 1 to capture contractions, normalization etc.
1955 * Internal method, status assumed to be success, caller has to check status
1956 * before calling this method.
1957 * @param text strsrch string search data
1958 * @param textoffset start text position to do search
1959 * @param ce the text ce which failed the match.
1960 * @param patternceindex index of the ce within the pattern ce buffer which
1961 *        failed the match
1962 * @return final offset
1963 */
1964 static
1965 inline int32_t reverseShift(UStringSearch *strsrch,
1966                                 int32_t    textoffset,
1967                                 int32_t       ce,
1968                                 int32_t        patternceindex)
1969 {
1970     if (strsrch->search->isOverlap) {
1971         if (textoffset != strsrch->search->textLength) {
1972             textoffset --;
1973         }
1974         else {
1975             textoffset -= strsrch->pattern.defaultShiftSize;
1976         }
1977     }
1978     else {
1979         if (ce != UCOL_NULLORDER) {
1980             int32_t shift = strsrch->pattern.backShift[hashFromCE32(ce)];
1981
1982             // this is to adjust for characters in the middle of the substring
1983             // for matching that failed.
1984             int32_t adjust = patternceindex;
1985             if (adjust > 1 && shift > adjust) {
1986                 shift -= adjust - 1;
1987             }
1988             textoffset -= shift;
1989         }
1990         else {
1991             textoffset -= strsrch->pattern.defaultShiftSize;
1992         }
1993     }
1994     textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
1995     return textoffset;
1996 }
1997
1998 /**
1999 * Checks match for contraction.
2000 * If the match starts with a partial contraction we fail.
2001 * Internal method, status assumed to be success, caller has to check status
2002 * before calling this method.
2003 * @param strsrch string search data
2004 * @param start offset of potential match, to be modified if necessary
2005 * @param end offset of potential match, to be modified if necessary
2006 * @param status output error status if any
2007 * @return TRUE if match passes the contraction test, FALSE otherwise
2008 */
2009 static
2010 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
2011                                      int32_t   *start,
2012                                      int32_t   *end, UErrorCode  *status)
2013 {
2014           UCollationElements *coleiter   = strsrch->textIter;
2015           int32_t             textlength = strsrch->search->textLength;
2016           int32_t             temp       = *end;
2017     const UCollator          *collator   = strsrch->collator;
2018     const UChar              *text       = strsrch->search->text;
2019     // This part checks if either if the start of the match contains potential
2020     // contraction. If so we'll have to iterate through them
2021     // Since we used ucol_next while previously looking for the potential
2022     // match, this guarantees that our end will not be a partial contraction,
2023     // or a partial supplementary character.
2024     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2025         int32_t expansion  = getExpansionSuffix(coleiter);
2026         UBool   expandflag = expansion > 0;
2027         setColEIterOffset(coleiter, *end);
2028         while (U_SUCCESS(*status) && expansion > 0) {
2029             // getting rid of the redundant ce
2030             // since forward contraction/expansion may have extra ces
2031             // if we are in the normalization buffer, hasAccentsBeforeMatch
2032             // would have taken care of it.
2033             // E.g. the character \u01FA will have an expansion of 3, but if
2034             // we are only looking for A ring A\u030A, we'll have to skip the
2035             // last ce in the expansion buffer
2036             ucol_previous(coleiter, status);
2037             if (U_FAILURE(*status)) {
2038                 return FALSE;
2039             }
2040             if (ucol_getOffset(coleiter) != temp) {
2041                 *end = temp;
2042                 temp  = ucol_getOffset(coleiter);
2043             }
2044             expansion --;
2045         }
2046
2047         int32_t  *patternce       = strsrch->pattern.ces;
2048         int32_t   patterncelength = strsrch->pattern.cesLength;
2049         int32_t   count           = patterncelength;
2050         while (count > 0) {
2051             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2052             // status checked below, note that if status is a failure
2053             // ucol_previous returns UCOL_NULLORDER
2054             if (ce == UCOL_IGNORABLE) {
2055                 continue;
2056             }
2057             if (expandflag && count == 0 &&
2058                 getColElemIterOffset(coleiter, FALSE) != temp) {
2059                 *end = temp;
2060                 temp  = ucol_getOffset(coleiter);
2061             }
2062             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2063                 (*start) --;
2064                 *start = getPreviousBaseOffset(text, *start);
2065                 return FALSE;
2066             }
2067             count --;
2068         }
2069     }
2070     return TRUE;
2071 }
2072
2073 /**
2074 * Checks and sets the match information if found.
2075 * Checks
2076 * <ul>
2077 * <li> the current match does not repeat the last match
2078 * <li> boundaries are correct
2079 * <li> exact matches has no extra accents
2080 * <li> identical matches
2081 * <\ul>
2082 * Otherwise the offset will be shifted to the preceding character.
2083 * Internal method, status assumed to be success, caller has to check status
2084 * before calling this method.
2085 * @param strsrch string search data
2086 * @param collator
2087 * @param coleiter collation element iterator
2088 * @param text string
2089 * @param textoffset offset in the collation element text. the returned value
2090 *        will be the truncated start offset of the match or the new start
2091 *        search offset.
2092 * @param status output error status if any
2093 * @return TRUE if the match is valid, FALSE otherwise
2094 */
2095 static
2096 inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
2097                                      int32_t   *textoffset,
2098                                      UErrorCode    *status)
2099 {
2100     // to ensure that the start and ends are not composite characters
2101     int32_t end = ucol_getOffset(strsrch->textIter);
2102     if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
2103         || U_FAILURE(*status)) {
2104             return FALSE;
2105     }
2106
2107     // this totally matches, however we need to check if it is repeating
2108     // the old match
2109     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2110         !isBreakUnit(strsrch, *textoffset, end) ||
2111         hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
2112         !checkIdentical(strsrch, *textoffset, end) ||
2113         hasAccentsAfterMatch(strsrch, *textoffset, end)) {
2114         (*textoffset) --;
2115         *textoffset = getPreviousBaseOffset(strsrch->search->text,
2116                                             *textoffset);
2117         return FALSE;
2118     }
2119
2120     //Add breakiterator boundary check for primary strength search.
2121     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
2122         checkBreakBoundary(strsrch, textoffset, &end);
2123     }
2124
2125     strsrch->search->matchedIndex = *textoffset;
2126     strsrch->search->matchedLength = end - *textoffset;
2127     return TRUE;
2128 }
2129
2130 /**
2131 * Rearranges the end accents to try matching.
2132 * Suffix accents in the text will be grouped according to their combining
2133 * class and the groups will be mixed and matched to try find the perfect
2134 * match with the pattern.
2135 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2136 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2137 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2138 *         "\u0301\u0325".
2139 * step 2: check if any of the generated substrings matches the pattern.
2140 * Internal method, status assumed to be success, user has to check status
2141 * before calling this method.
2142 * @param strsrch string search match
2143 * @param start offset of the first base character
2144 * @param end start of the last accent set
2145 * @param status only error status if any
2146 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2147 *         offset of the match. Note this start includes all following accents.
2148 */
2149 static
2150 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
2151                                            int32_t    start,
2152                                            int32_t    end,
2153                                            UErrorCode    *status)
2154 {
2155     const UChar       *text       = strsrch->search->text;
2156           int32_t  tempend    = end;
2157
2158     U16_BACK_1(text, 0, tempend);
2159     if (!(getFCD(text, &tempend, strsrch->search->textLength) &
2160                                                            LAST_BYTE_MASK_)) {
2161         // die... failed at a base character
2162         return USEARCH_DONE;
2163     }
2164     end = getNextBaseOffset(text, end, strsrch->search->textLength);
2165
2166     if (U_SUCCESS(*status)) {
2167         UChar       accents[INITIAL_ARRAY_SIZE_];
2168         int32_t offset = getPreviousBaseOffset(text, end);
2169         // normalizing the offensive string
2170         unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
2171                         INITIAL_ARRAY_SIZE_, status);
2172
2173         int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
2174         int32_t         accentsize = getUnblockedAccentIndex(accents,
2175                                                          accentsindex);
2176         int32_t         count      = (2 << (accentsize - 1)) - 1;
2177         UChar               buffer[INITIAL_ARRAY_SIZE_];
2178         UCollationElements *coleiter = strsrch->utilIter;
2179         while (U_SUCCESS(*status) && count > 0) {
2180             UChar *rearrange = strsrch->canonicalSuffixAccents;
2181             // copy the base characters
2182             for (int k = 0; k < accentsindex[0]; k ++) {
2183                 *rearrange ++ = accents[k];
2184             }
2185             // forming all possible canonical rearrangement by dropping
2186             // sets of accents
2187             for (int i = 0; i <= accentsize - 1; i ++) {
2188                 int32_t mask = 1 << (accentsize - i - 1);
2189                 if (count & mask) {
2190                     for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2191                         *rearrange ++ = accents[j];
2192                     }
2193                 }
2194             }
2195             *rearrange = 0;
2196             int32_t  matchsize = INITIAL_ARRAY_SIZE_;
2197             UChar   *match     = addToUCharArray(buffer, &matchsize,
2198                                            strsrch->canonicalPrefixAccents,
2199                                            strsrch->search->text + start,
2200                                            offset - start,
2201                                            strsrch->canonicalSuffixAccents,
2202                                            status);
2203
2204             // run the collator iterator through this match
2205             // if status is a failure ucol_setText does nothing
2206             ucol_setText(coleiter, match, matchsize, status);
2207             if (U_SUCCESS(*status)) {
2208                 if (checkCollationMatch(strsrch, coleiter)) {
2209                     if (match != buffer) {
2210                         uprv_free(match);
2211                     }
2212                     return end;
2213                 }
2214             }
2215             count --;
2216         }
2217     }
2218     return USEARCH_DONE;
2219 }
2220
2221 /**
2222 * Take the rearranged start accents and tries matching. If match failed at
2223 * a seperate following set of accents (seperated from the rearranged on by
2224 * at least a base character) then we rearrange the preceding accents and
2225 * tries matching again.
2226 * We allow skipping of the ends of the accent set if the ces do not match.
2227 * However if the failure is found before the accent set, it fails.
2228 * Internal method, status assumed to be success, caller has to check status
2229 * before calling this method.
2230 * @param strsrch string search data
2231 * @param textoffset of the ends of the rearranged accent
2232 * @param status output error status if any
2233 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2234 *         offset of the match. Note this start includes all following accents.
2235 */
2236 static
2237 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
2238                                            int32_t    textoffset,
2239                                            UErrorCode    *status)
2240 {
2241     const UChar       *text       = strsrch->search->text;
2242     const UCollator   *collator   = strsrch->collator;
2243           int32_t      safelength = 0;
2244           UChar       *safetext;
2245           int32_t      safetextlength;
2246           UChar        safebuffer[INITIAL_ARRAY_SIZE_];
2247           int32_t  safeoffset = textoffset;
2248
2249     if (textoffset &&
2250         ucol_unsafeCP(strsrch->canonicalPrefixAccents[
2251                                  u_strlen(strsrch->canonicalPrefixAccents) - 1
2252                                          ], collator)) {
2253         safeoffset     = getNextSafeOffset(collator, text, textoffset,
2254                                            strsrch->search->textLength);
2255         safelength     = safeoffset - textoffset;
2256         safetextlength = INITIAL_ARRAY_SIZE_;
2257         safetext       = addToUCharArray(safebuffer, &safetextlength,
2258                                          strsrch->canonicalPrefixAccents,
2259                                          text + textoffset, safelength,
2260                                          NULL, status);
2261     }
2262     else {
2263         safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
2264         safetext       = strsrch->canonicalPrefixAccents;
2265     }
2266
2267     UCollationElements *coleiter = strsrch->utilIter;
2268      // if status is a failure, ucol_setText does nothing
2269     ucol_setText(coleiter, safetext, safetextlength, status);
2270     // status checked in loop below
2271
2272     int32_t  *ce           = strsrch->pattern.ces;
2273     int32_t   celength     = strsrch->pattern.cesLength;
2274     int       ceindex      = 0;
2275     UBool     isSafe       = TRUE; // safe zone indication flag for position
2276     int32_t   prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
2277
2278     while (ceindex < celength) {
2279         int32_t textce = ucol_next(coleiter, status);
2280         if (U_FAILURE(*status)) {
2281             if (isSafe) {
2282                 cleanUpSafeText(strsrch, safetext, safebuffer);
2283             }
2284             return USEARCH_DONE;
2285         }
2286         if (textce == UCOL_NULLORDER) {
2287             // check if we have passed the safe buffer
2288             if (coleiter == strsrch->textIter) {
2289                 cleanUpSafeText(strsrch, safetext, safebuffer);
2290                 return USEARCH_DONE;
2291             }
2292             cleanUpSafeText(strsrch, safetext, safebuffer);
2293             safetext = safebuffer;
2294             coleiter = strsrch->textIter;
2295             setColEIterOffset(coleiter, safeoffset);
2296             // status checked at the start of the loop
2297             isSafe = FALSE;
2298             continue;
2299         }
2300         textce = getCE(strsrch, textce);
2301         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
2302             // do the beginning stuff
2303             int32_t failedoffset = ucol_getOffset(coleiter);
2304             if (isSafe && failedoffset <= prefixlength) {
2305                 // alas... no hope. failed at rearranged accent set
2306                 cleanUpSafeText(strsrch, safetext, safebuffer);
2307                 return USEARCH_DONE;
2308             }
2309             else {
2310                 if (isSafe) {
2311                     failedoffset = safeoffset - failedoffset;
2312                     cleanUpSafeText(strsrch, safetext, safebuffer);
2313                 }
2314
2315                 // try rearranging the end accents
2316                 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
2317                                         textoffset, failedoffset, status);
2318                 if (result != USEARCH_DONE) {
2319                     // if status is a failure, ucol_setOffset does nothing
2320                     setColEIterOffset(strsrch->textIter, result);
2321                 }
2322                 if (U_FAILURE(*status)) {
2323                     return USEARCH_DONE;
2324                 }
2325                 return result;
2326             }
2327         }
2328         if (textce == ce[ceindex]) {
2329             ceindex ++;
2330         }
2331     }
2332     // set offset here
2333     if (isSafe) {
2334         int32_t result      = ucol_getOffset(coleiter);
2335         // sets the text iterator here with the correct expansion and offset
2336         int32_t     leftoverces = getExpansionSuffix(coleiter);
2337         cleanUpSafeText(strsrch, safetext, safebuffer);
2338         if (result <= prefixlength) {
2339             result = textoffset;
2340         }
2341         else {
2342             result = textoffset + (safeoffset - result);
2343         }
2344         setColEIterOffset(strsrch->textIter, result);
2345         setExpansionSuffix(strsrch->textIter, leftoverces);
2346         return result;
2347     }
2348
2349     return ucol_getOffset(coleiter);
2350 }
2351
2352 /**
2353 * Trying out the substring and sees if it can be a canonical match.
2354 * This will try normalizing the starting accents and arranging them into
2355 * canonical equivalents and check their corresponding ces with the pattern ce.
2356 * Prefix accents in the text will be grouped according to their combining
2357 * class and the groups will be mixed and matched to try find the perfect
2358 * match with the pattern.
2359 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2360 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2361 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2362 *         "\u0301\u0325".
2363 * step 2: check if any of the generated substrings matches the pattern.
2364 * Internal method, status assumed to be success, caller has to check status
2365 * before calling this method.
2366 * @param strsrch string search data
2367 * @param textoffset start offset in the collation element text that starts
2368 *                   with the accents to be rearranged
2369 * @param status output error status if any
2370 * @return TRUE if the match is valid, FALSE otherwise
2371 */
2372 static
2373 UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
2374                                int32_t    textoffset,
2375                                UErrorCode    *status)
2376 {
2377     const UChar       *text       = strsrch->search->text;
2378           int32_t  temp       = textoffset;
2379           int32_t      textlength = strsrch->search->textLength;
2380     if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
2381         UCollationElements *coleiter = strsrch->textIter;
2382         int32_t         offset   = ucol_getOffset(coleiter);
2383         if (strsrch->pattern.hasSuffixAccents) {
2384             offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
2385                                                     offset, status);
2386             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
2387                 setColEIterOffset(coleiter, offset);
2388                 return TRUE;
2389             }
2390         }
2391         return FALSE;
2392     }
2393
2394     if (!strsrch->pattern.hasPrefixAccents) {
2395         return FALSE;
2396     }
2397
2398     UChar       accents[INITIAL_ARRAY_SIZE_];
2399     // offset to the last base character in substring to search
2400     int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
2401     // normalizing the offensive string
2402     unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
2403                                0, accents, INITIAL_ARRAY_SIZE_, status);
2404     // status checked in loop
2405
2406     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2407     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
2408
2409     // 2 power n - 1 plus the full set of accents
2410     int32_t  count = (2 << (size - 1)) - 1;
2411     while (U_SUCCESS(*status) && count > 0) {
2412         UChar *rearrange = strsrch->canonicalPrefixAccents;
2413         // copy the base characters
2414         for (int k = 0; k < accentsindex[0]; k ++) {
2415             *rearrange ++ = accents[k];
2416         }
2417         // forming all possible canonical rearrangement by dropping
2418         // sets of accents
2419         for (int i = 0; i <= size - 1; i ++) {
2420             int32_t mask = 1 << (size - i - 1);
2421             if (count & mask) {
2422                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2423                     *rearrange ++ = accents[j];
2424                 }
2425             }
2426         }
2427         *rearrange = 0;
2428         int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
2429                                                           baseoffset, status);
2430         if (offset != USEARCH_DONE) {
2431             return TRUE; // match found
2432         }
2433         count --;
2434     }
2435     return FALSE;
2436 }
2437
2438 /**
2439 * Checks match for contraction.
2440 * If the match starts with a partial contraction we fail.
2441 * Internal method, status assumed to be success, caller has to check status
2442 * before calling this method.
2443 * @param strsrch string search data
2444 * @param start offset of potential match, to be modified if necessary
2445 * @param end offset of potential match, to be modified if necessary
2446 * @param status only error status if any
2447 * @return TRUE if match passes the contraction test, FALSE otherwise
2448 */
2449 static
2450 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
2451                                      int32_t   *start,
2452                                      int32_t   *end, UErrorCode  *status)
2453 {
2454           UCollationElements *coleiter   = strsrch->textIter;
2455           int32_t             textlength = strsrch->search->textLength;
2456           int32_t         temp       = *end;
2457     const UCollator          *collator   = strsrch->collator;
2458     const UChar              *text       = strsrch->search->text;
2459     // This part checks if either if the start of the match contains potential
2460     // contraction. If so we'll have to iterate through them
2461     // Since we used ucol_next while previously looking for the potential
2462     // match, this guarantees that our end will not be a partial contraction,
2463     // or a partial supplementary character.
2464     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2465         int32_t expansion  = getExpansionSuffix(coleiter);
2466         UBool   expandflag = expansion > 0;
2467         setColEIterOffset(coleiter, *end);
2468         while (expansion > 0) {
2469             // getting rid of the redundant ce
2470             // since forward contraction/expansion may have extra ces
2471             // if we are in the normalization buffer, hasAccentsBeforeMatch
2472             // would have taken care of it.
2473             // E.g. the character \u01FA will have an expansion of 3, but if
2474             // we are only looking for A ring A\u030A, we'll have to skip the
2475             // last ce in the expansion buffer
2476             ucol_previous(coleiter, status);
2477             if (U_FAILURE(*status)) {
2478                 return FALSE;
2479             }
2480             if (ucol_getOffset(coleiter) != temp) {
2481                 *end = temp;
2482                 temp  = ucol_getOffset(coleiter);
2483             }
2484             expansion --;
2485         }
2486
2487         int32_t  *patternce       = strsrch->pattern.ces;
2488         int32_t   patterncelength = strsrch->pattern.cesLength;
2489         int32_t   count           = patterncelength;
2490         while (count > 0) {
2491             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2492             // status checked below, note that if status is a failure
2493             // ucol_previous returns UCOL_NULLORDER
2494             if (ce == UCOL_IGNORABLE) {
2495                 continue;
2496             }
2497             if (expandflag && count == 0 &&
2498                 getColElemIterOffset(coleiter, FALSE) != temp) {
2499                 *end = temp;
2500                 temp  = ucol_getOffset(coleiter);
2501             }
2502             if (count == patterncelength &&
2503                 ce != patternce[patterncelength - 1]) {
2504                 // accents may have extra starting ces, this occurs when a
2505                 // pure accent pattern is matched without rearrangement
2506                 int32_t    expected = patternce[patterncelength - 1];
2507                 U16_BACK_1(text, 0, *end);
2508                 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
2509                     ce = getCE(strsrch, ucol_previous(coleiter, status));
2510                     while (U_SUCCESS(*status) && ce != expected &&
2511                            ce != UCOL_NULLORDER &&
2512                            ucol_getOffset(coleiter) <= *start) {
2513                         ce = getCE(strsrch, ucol_previous(coleiter, status));
2514                     }
2515                 }
2516             }
2517             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2518                 (*start) --;
2519                 *start = getPreviousBaseOffset(text, *start);
2520                 return FALSE;
2521             }
2522             count --;
2523         }
2524     }
2525     return TRUE;
2526 }
2527
2528 /**
2529 * Checks and sets the match information if found.
2530 * Checks
2531 * <ul>
2532 * <li> the potential match does not repeat the previous match
2533 * <li> boundaries are correct
2534 * <li> potential match does not end in the middle of a contraction
2535 * <li> identical matches
2536 * <\ul>
2537 * Otherwise the offset will be shifted to the next character.
2538 * Internal method, status assumed to be success, caller has to check status
2539 * before calling this method.
2540 * @param strsrch string search data
2541 * @param textoffset offset in the collation element text. the returned value
2542 *        will be the truncated start offset of the match or the new start
2543 *        search offset.
2544 * @param status only error status if any
2545 * @return TRUE if the match is valid, FALSE otherwise
2546 */
2547 static
2548 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
2549                                          int32_t   *textoffset,
2550                                          UErrorCode    *status)
2551 {
2552     // to ensure that the start and ends are not composite characters
2553     UCollationElements *coleiter = strsrch->textIter;
2554     // if we have a canonical accent match
2555     if ((strsrch->pattern.hasSuffixAccents &&
2556         strsrch->canonicalSuffixAccents[0]) ||
2557         (strsrch->pattern.hasPrefixAccents &&
2558         strsrch->canonicalPrefixAccents[0])) {
2559         strsrch->search->matchedIndex  = *textoffset;
2560         strsrch->search->matchedLength =
2561             getNextUStringSearchBaseOffset(strsrch,
2562                                       getColElemIterOffset(coleiter, FALSE))
2563             - *textoffset;
2564         return TRUE;
2565     }
2566
2567     int32_t end = ucol_getOffset(coleiter);
2568     if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
2569                                                 status) ||
2570          U_FAILURE(*status)) {
2571         return FALSE;
2572     }
2573
2574     end = getNextUStringSearchBaseOffset(strsrch, end);
2575     // this totally matches, however we need to check if it is repeating
2576     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2577         !isBreakUnit(strsrch, *textoffset, end) ||
2578         !checkIdentical(strsrch, *textoffset, end)) {
2579         (*textoffset) --;
2580         *textoffset = getPreviousBaseOffset(strsrch->search->text,
2581                                             *textoffset);
2582         return FALSE;
2583     }
2584
2585     strsrch->search->matchedIndex  = *textoffset;
2586     strsrch->search->matchedLength = end - *textoffset;
2587     return TRUE;
2588 }
2589 #endif // #if BOYER_MOORE
2590
2591 // constructors and destructor -------------------------------------------
2592
2593 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
2594                                           int32_t         patternlength,
2595                                     const UChar          *text,
2596                                           int32_t         textlength,
2597                                     const char           *locale,
2598                                           UBreakIterator *breakiter,
2599                                           UErrorCode     *status)
2600 {
2601     if (U_FAILURE(*status)) {
2602         return NULL;
2603     }
2604 #if UCONFIG_NO_BREAK_ITERATION
2605     if (breakiter != NULL) {
2606         *status = U_UNSUPPORTED_ERROR;
2607         return NULL;
2608     }
2609 #endif
2610     if (locale) {
2611         // ucol_open internally checks for status
2612         UCollator     *collator = ucol_open(locale, status);
2613         // pattern, text checks are done in usearch_openFromCollator
2614         UStringSearch *result   = usearch_openFromCollator(pattern,
2615                                               patternlength, text, textlength,
2616                                               collator, breakiter, status);
2617
2618         if (result == NULL || U_FAILURE(*status)) {
2619             if (collator) {
2620                 ucol_close(collator);
2621             }
2622             return NULL;
2623         }
2624         else {
2625             result->ownCollator = TRUE;
2626         }
2627         return result;
2628     }
2629     *status = U_ILLEGAL_ARGUMENT_ERROR;
2630     return NULL;
2631 }
2632
2633 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
2634                                   const UChar          *pattern,
2635                                         int32_t         patternlength,
2636                                   const UChar          *text,
2637                                         int32_t         textlength,
2638                                   const UCollator      *collator,
2639                                         UBreakIterator *breakiter,
2640                                         UErrorCode     *status)
2641 {
2642     if (U_FAILURE(*status)) {
2643         return NULL;
2644     }
2645 #if UCONFIG_NO_BREAK_ITERATION
2646     if (breakiter != NULL) {
2647         *status = U_UNSUPPORTED_ERROR;
2648         return NULL;
2649     }
2650 #endif
2651     if (pattern == NULL || text == NULL || collator == NULL) {
2652         *status = U_ILLEGAL_ARGUMENT_ERROR;
2653         return NULL;
2654     }
2655
2656     // string search does not really work when numeric collation is turned on
2657     if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
2658         *status = U_UNSUPPORTED_ERROR;
2659         return NULL;
2660     }
2661
2662     if (U_SUCCESS(*status)) {
2663         initializeFCD(status);
2664         if (U_FAILURE(*status)) {
2665             return NULL;
2666         }
2667
2668         UStringSearch *result;
2669         if (textlength == -1) {
2670             textlength = u_strlen(text);
2671         }
2672         if (patternlength == -1) {
2673             patternlength = u_strlen(pattern);
2674         }
2675         if (textlength <= 0 || patternlength <= 0) {
2676             *status = U_ILLEGAL_ARGUMENT_ERROR;
2677             return NULL;
2678         }
2679
2680         result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
2681         if (result == NULL) {
2682             *status = U_MEMORY_ALLOCATION_ERROR;
2683             return NULL;
2684         }
2685
2686         result->collator    = collator;
2687         result->strength    = ucol_getStrength(collator);
2688         result->ceMask      = getMask(result->strength);
2689         result->toShift     =
2690              ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2691                                                             UCOL_SHIFTED;
2692         result->variableTop = ucol_getVariableTop(collator, status);
2693
2694         result->nfd         = Normalizer2::getNFDInstance(*status);
2695
2696         if (U_FAILURE(*status)) {
2697             uprv_free(result);
2698             return NULL;
2699         }
2700
2701         result->search             = (USearch *)uprv_malloc(sizeof(USearch));
2702         if (result->search == NULL) {
2703             *status = U_MEMORY_ALLOCATION_ERROR;
2704             uprv_free(result);
2705             return NULL;
2706         }
2707
2708         result->search->text       = text;
2709         result->search->textLength = textlength;
2710
2711         result->pattern.text       = pattern;
2712         result->pattern.textLength = patternlength;
2713         result->pattern.ces         = NULL;
2714         result->pattern.pces        = NULL;
2715
2716         result->search->breakIter  = breakiter;
2717 #if !UCONFIG_NO_BREAK_ITERATION
2718         result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
2719         if (breakiter) {
2720             ubrk_setText(breakiter, text, textlength, status);
2721         }
2722 #endif
2723
2724         result->ownCollator           = FALSE;
2725         result->search->matchedLength = 0;
2726         result->search->matchedIndex  = USEARCH_DONE;
2727         result->utilIter              = NULL;
2728         result->textIter              = ucol_openElements(collator, text,
2729                                                           textlength, status);
2730         result->textProcessedIter     = NULL;
2731         if (U_FAILURE(*status)) {
2732             usearch_close(result);
2733             return NULL;
2734         }
2735
2736         result->search->isOverlap          = FALSE;
2737         result->search->isCanonicalMatch   = FALSE;
2738         result->search->elementComparisonType = 0;
2739         result->search->isForwardSearching = TRUE;
2740         result->search->reset              = TRUE;
2741
2742         initialize(result, status);
2743
2744         if (U_FAILURE(*status)) {
2745             usearch_close(result);
2746             return NULL;
2747         }
2748
2749         return result;
2750     }
2751     return NULL;
2752 }
2753
2754 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
2755 {
2756     if (strsrch) {
2757         if (strsrch->pattern.ces != strsrch->pattern.cesBuffer &&
2758             strsrch->pattern.ces) {
2759             uprv_free(strsrch->pattern.ces);
2760         }
2761
2762         if (strsrch->pattern.pces != NULL &&
2763             strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
2764             uprv_free(strsrch->pattern.pces);
2765         }
2766
2767         delete strsrch->textProcessedIter;
2768         ucol_closeElements(strsrch->textIter);
2769         ucol_closeElements(strsrch->utilIter);
2770
2771         if (strsrch->ownCollator && strsrch->collator) {
2772             ucol_close((UCollator *)strsrch->collator);
2773         }
2774
2775 #if !UCONFIG_NO_BREAK_ITERATION
2776         if (strsrch->search->internalBreakIter) {
2777             ubrk_close(strsrch->search->internalBreakIter);
2778         }
2779 #endif
2780
2781         uprv_free(strsrch->search);
2782         uprv_free(strsrch);
2783     }
2784 }
2785
2786 namespace {
2787
2788 UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) {
2789     if (U_FAILURE(*status)) { return FALSE; }
2790     if (strsrch->textProcessedIter == NULL) {
2791         strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter);
2792         if (strsrch->textProcessedIter == NULL) {
2793             *status = U_MEMORY_ALLOCATION_ERROR;
2794             return FALSE;
2795         }
2796     } else {
2797         strsrch->textProcessedIter->init(strsrch->textIter);
2798     }
2799     return TRUE;
2800 }
2801
2802 }
2803
2804 // set and get methods --------------------------------------------------
2805
2806 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
2807                                         int32_t    position,
2808                                         UErrorCode    *status)
2809 {
2810     if (U_SUCCESS(*status) && strsrch) {
2811         if (isOutOfBounds(strsrch->search->textLength, position)) {
2812             *status = U_INDEX_OUTOFBOUNDS_ERROR;
2813         }
2814         else {
2815             setColEIterOffset(strsrch->textIter, position);
2816         }
2817         strsrch->search->matchedIndex  = USEARCH_DONE;
2818         strsrch->search->matchedLength = 0;
2819         strsrch->search->reset         = FALSE;
2820     }
2821 }
2822
2823 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
2824 {
2825     if (strsrch) {
2826         int32_t result = ucol_getOffset(strsrch->textIter);
2827         if (isOutOfBounds(strsrch->search->textLength, result)) {
2828             return USEARCH_DONE;
2829         }
2830         return result;
2831     }
2832     return USEARCH_DONE;
2833 }
2834
2835 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
2836                                  USearchAttribute attribute,
2837                                  USearchAttributeValue value,
2838                                  UErrorCode *status)
2839 {
2840     if (U_SUCCESS(*status) && strsrch) {
2841         switch (attribute)
2842         {
2843         case USEARCH_OVERLAP :
2844             strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
2845             break;
2846         case USEARCH_CANONICAL_MATCH :
2847             strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
2848                                                                       FALSE);
2849             break;
2850         case USEARCH_ELEMENT_COMPARISON :
2851             if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2852                 strsrch->search->elementComparisonType = (int16_t)value;
2853             } else {
2854                 strsrch->search->elementComparisonType = 0;
2855             }
2856             break;
2857         case USEARCH_ATTRIBUTE_COUNT :
2858         default:
2859             *status = U_ILLEGAL_ARGUMENT_ERROR;
2860         }
2861     }
2862     if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
2863         *status = U_ILLEGAL_ARGUMENT_ERROR;
2864     }
2865 }
2866
2867 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
2868                                                 const UStringSearch *strsrch,
2869                                                 USearchAttribute attribute)
2870 {
2871     if (strsrch) {
2872         switch (attribute) {
2873         case USEARCH_OVERLAP :
2874             return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
2875                                                         USEARCH_OFF);
2876         case USEARCH_CANONICAL_MATCH :
2877             return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
2878                                                                USEARCH_OFF);
2879         case USEARCH_ELEMENT_COMPARISON :
2880             {
2881                 int16_t value = strsrch->search->elementComparisonType;
2882                 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2883                     return (USearchAttributeValue)value;
2884                 } else {
2885                     return USEARCH_STANDARD_ELEMENT_COMPARISON;
2886                 }
2887             }
2888         case USEARCH_ATTRIBUTE_COUNT :
2889             return USEARCH_DEFAULT;
2890         }
2891     }
2892     return USEARCH_DEFAULT;
2893 }
2894
2895 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
2896                                                 const UStringSearch *strsrch)
2897 {
2898     if (strsrch == NULL) {
2899         return USEARCH_DONE;
2900     }
2901     return strsrch->search->matchedIndex;
2902 }
2903
2904
2905 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
2906                                             UChar         *result,
2907                                             int32_t        resultCapacity,
2908                                             UErrorCode    *status)
2909 {
2910     if (U_FAILURE(*status)) {
2911         return USEARCH_DONE;
2912     }
2913     if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
2914         result == NULL)) {
2915         *status = U_ILLEGAL_ARGUMENT_ERROR;
2916         return USEARCH_DONE;
2917     }
2918
2919     int32_t     copylength = strsrch->search->matchedLength;
2920     int32_t copyindex  = strsrch->search->matchedIndex;
2921     if (copyindex == USEARCH_DONE) {
2922         u_terminateUChars(result, resultCapacity, 0, status);
2923         return USEARCH_DONE;
2924     }
2925
2926     if (resultCapacity < copylength) {
2927         copylength = resultCapacity;
2928     }
2929     if (copylength > 0) {
2930         uprv_memcpy(result, strsrch->search->text + copyindex,
2931                     copylength * sizeof(UChar));
2932     }
2933     return u_terminateUChars(result, resultCapacity,
2934                              strsrch->search->matchedLength, status);
2935 }
2936
2937 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
2938                                               const UStringSearch *strsrch)
2939 {
2940     if (strsrch) {
2941         return strsrch->search->matchedLength;
2942     }
2943     return USEARCH_DONE;
2944 }
2945
2946 #if !UCONFIG_NO_BREAK_ITERATION
2947
2948 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch  *strsrch,
2949                                                UBreakIterator *breakiter,
2950                                                UErrorCode     *status)
2951 {
2952     if (U_SUCCESS(*status) && strsrch) {
2953         strsrch->search->breakIter = breakiter;
2954         if (breakiter) {
2955             ubrk_setText(breakiter, strsrch->search->text,
2956                          strsrch->search->textLength, status);
2957         }
2958     }
2959 }
2960
2961 U_CAPI const UBreakIterator* U_EXPORT2
2962 usearch_getBreakIterator(const UStringSearch *strsrch)
2963 {
2964     if (strsrch) {
2965         return strsrch->search->breakIter;
2966     }
2967     return NULL;
2968 }
2969
2970 #endif
2971
2972 U_CAPI void U_EXPORT2 usearch_setText(      UStringSearch *strsrch,
2973                                       const UChar         *text,
2974                                             int32_t        textlength,
2975                                             UErrorCode    *status)
2976 {
2977     if (U_SUCCESS(*status)) {
2978         if (strsrch == NULL || text == NULL || textlength < -1 ||
2979             textlength == 0) {
2980             *status = U_ILLEGAL_ARGUMENT_ERROR;
2981         }
2982         else {
2983             if (textlength == -1) {
2984                 textlength = u_strlen(text);
2985             }
2986             strsrch->search->text       = text;
2987             strsrch->search->textLength = textlength;
2988             ucol_setText(strsrch->textIter, text, textlength, status);
2989             strsrch->search->matchedIndex  = USEARCH_DONE;
2990             strsrch->search->matchedLength = 0;
2991             strsrch->search->reset         = TRUE;
2992 #if !UCONFIG_NO_BREAK_ITERATION
2993             if (strsrch->search->breakIter != NULL) {
2994                 ubrk_setText(strsrch->search->breakIter, text,
2995                              textlength, status);
2996             }
2997             ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
2998 #endif
2999         }
3000     }
3001 }
3002
3003 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
3004                                                      int32_t       *length)
3005 {
3006     if (strsrch) {
3007         *length = strsrch->search->textLength;
3008         return strsrch->search->text;
3009     }
3010     return NULL;
3011 }
3012
3013 U_CAPI void U_EXPORT2 usearch_setCollator(      UStringSearch *strsrch,
3014                                           const UCollator     *collator,
3015                                                 UErrorCode    *status)
3016 {
3017     if (U_SUCCESS(*status)) {
3018         if (collator == NULL) {
3019             *status = U_ILLEGAL_ARGUMENT_ERROR;
3020             return;
3021         }
3022
3023         if (strsrch) {
3024             delete strsrch->textProcessedIter;
3025             strsrch->textProcessedIter = NULL;
3026             ucol_closeElements(strsrch->textIter);
3027             ucol_closeElements(strsrch->utilIter);
3028             strsrch->textIter = strsrch->utilIter = NULL;
3029             if (strsrch->ownCollator && (strsrch->collator != collator)) {
3030                 ucol_close((UCollator *)strsrch->collator);
3031                 strsrch->ownCollator = FALSE;
3032             }
3033             strsrch->collator    = collator;
3034             strsrch->strength    = ucol_getStrength(collator);
3035             strsrch->ceMask      = getMask(strsrch->strength);
3036 #if !UCONFIG_NO_BREAK_ITERATION
3037             ubrk_close(strsrch->search->internalBreakIter);
3038             strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
3039                                                      strsrch->search->text, strsrch->search->textLength, status);
3040 #endif
3041             // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3042             strsrch->toShift     =
3043                ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
3044                                                                 UCOL_SHIFTED;
3045             // if status is a failure, ucol_getVariableTop returns 0
3046             strsrch->variableTop = ucol_getVariableTop(collator, status);
3047             strsrch->textIter = ucol_openElements(collator,
3048                                       strsrch->search->text,
3049                                       strsrch->search->textLength,
3050                                       status);
3051             strsrch->utilIter = ucol_openElements(
3052                     collator, strsrch->pattern.text, strsrch->pattern.textLength, status);
3053             // initialize() _after_ setting the iterators for the new collator.
3054             initialize(strsrch, status);
3055         }
3056
3057         // **** are these calls needed?
3058         // **** we call uprv_init_pce in initializePatternPCETable
3059         // **** and the CEIBuffer constructor...
3060 #if 0
3061         uprv_init_pce(strsrch->textIter);
3062         uprv_init_pce(strsrch->utilIter);
3063 #endif
3064     }
3065 }
3066
3067 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
3068 {
3069     if (strsrch) {
3070         return (UCollator *)strsrch->collator;
3071     }
3072     return NULL;
3073 }
3074
3075 U_CAPI void U_EXPORT2 usearch_setPattern(      UStringSearch *strsrch,
3076                                          const UChar         *pattern,
3077                                                int32_t        patternlength,
3078                                                UErrorCode    *status)
3079 {
3080     if (U_SUCCESS(*status)) {
3081         if (strsrch == NULL || pattern == NULL) {
3082             *status = U_ILLEGAL_ARGUMENT_ERROR;
3083         }
3084         else {
3085             if (patternlength == -1) {
3086                 patternlength = u_strlen(pattern);
3087             }
3088             if (patternlength == 0) {
3089                 *status = U_ILLEGAL_ARGUMENT_ERROR;
3090                 return;
3091             }
3092             strsrch->pattern.text       = pattern;
3093             strsrch->pattern.textLength = patternlength;
3094             initialize(strsrch, status);
3095         }
3096     }
3097 }
3098
3099 U_CAPI const UChar* U_EXPORT2
3100 usearch_getPattern(const UStringSearch *strsrch,
3101                    int32_t       *length)
3102 {
3103     if (strsrch) {
3104         *length = strsrch->pattern.textLength;
3105         return strsrch->pattern.text;
3106     }
3107     return NULL;
3108 }
3109
3110 // miscellanous methods --------------------------------------------------
3111
3112 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
3113                                            UErrorCode    *status)
3114 {
3115     if (strsrch && U_SUCCESS(*status)) {
3116         strsrch->search->isForwardSearching = TRUE;
3117         usearch_setOffset(strsrch, 0, status);
3118         if (U_SUCCESS(*status)) {
3119             return usearch_next(strsrch, status);
3120         }
3121     }
3122     return USEARCH_DONE;
3123 }
3124
3125 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
3126                                                int32_t    position,
3127                                                UErrorCode    *status)
3128 {
3129     if (strsrch && U_SUCCESS(*status)) {
3130         strsrch->search->isForwardSearching = TRUE;
3131         // position checked in usearch_setOffset
3132         usearch_setOffset(strsrch, position, status);
3133         if (U_SUCCESS(*status)) {
3134             return usearch_next(strsrch, status);
3135         }
3136     }
3137     return USEARCH_DONE;
3138 }
3139
3140 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
3141                                           UErrorCode    *status)
3142 {
3143     if (strsrch && U_SUCCESS(*status)) {
3144         strsrch->search->isForwardSearching = FALSE;
3145         usearch_setOffset(strsrch, strsrch->search->textLength, status);
3146         if (U_SUCCESS(*status)) {
3147             return usearch_previous(strsrch, status);
3148         }
3149     }
3150     return USEARCH_DONE;
3151 }
3152
3153 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
3154                                                int32_t    position,
3155                                                UErrorCode    *status)
3156 {
3157     if (strsrch && U_SUCCESS(*status)) {
3158         strsrch->search->isForwardSearching = FALSE;
3159         // position checked in usearch_setOffset
3160         usearch_setOffset(strsrch, position, status);
3161         if (U_SUCCESS(*status)) {
3162             return usearch_previous(strsrch, status);
3163         }
3164     }
3165     return USEARCH_DONE;
3166 }
3167
3168 /**
3169 * If a direction switch is required, we'll count the number of ces till the
3170 * beginning of the collation element iterator and iterate forwards that
3171 * number of times. This is so that we get to the correct point within the
3172 * string to continue the search in. Imagine when we are in the middle of the
3173 * normalization buffer when the change in direction is request. arrrgghh....
3174 * After searching the offset within the collation element iterator will be
3175 * shifted to the start of the match. If a match is not found, the offset would
3176 * have been set to the end of the text string in the collation element
3177 * iterator.
3178 * Okay, here's my take on normalization buffer. The only time when there can
3179 * be 2 matches within the same normalization is when the pattern is consists
3180 * of all accents. But since the offset returned is from the text string, we
3181 * should not confuse the caller by returning the second match within the
3182 * same normalization buffer. If we do, the 2 results will have the same match
3183 * offsets, and that'll be confusing. I'll return the next match that doesn't
3184 * fall within the same normalization buffer. Note this does not affect the
3185 * results of matches spanning the text and the normalization buffer.
3186 * The position to start searching is taken from the collation element
3187 * iterator. Callers of this API would have to set the offset in the collation
3188 * element iterator before using this method.
3189 */
3190 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
3191                                           UErrorCode    *status)
3192 {
3193     if (U_SUCCESS(*status) && strsrch) {
3194         // note offset is either equivalent to the start of the previous match
3195         // or is set by the user
3196         int32_t      offset       = usearch_getOffset(strsrch);
3197         USearch     *search       = strsrch->search;
3198         search->reset             = FALSE;
3199         int32_t      textlength   = search->textLength;
3200         if (search->isForwardSearching) {
3201 #if BOYER_MOORE
3202             if (offset == textlength
3203                 || (!search->isOverlap &&
3204                     (offset + strsrch->pattern.defaultShiftSize > textlength ||
3205                     (search->matchedIndex != USEARCH_DONE &&
3206                      offset + search->matchedLength >= textlength)))) {
3207                 // not enough characters to match
3208                 setMatchNotFound(strsrch);
3209                 return USEARCH_DONE;
3210             }
3211 #else
3212             if (offset == textlength ||
3213                 (! search->isOverlap &&
3214                 (search->matchedIndex != USEARCH_DONE &&
3215                 offset + search->matchedLength > textlength))) {
3216                     // not enough characters to match
3217                     setMatchNotFound(strsrch);
3218                     return USEARCH_DONE;
3219             }
3220 #endif
3221         }
3222         else {
3223             // switching direction.
3224             // if matchedIndex == USEARCH_DONE, it means that either a
3225             // setOffset has been called or that previous ran off the text
3226             // string. the iterator would have been set to offset 0 if a
3227             // match is not found.
3228             search->isForwardSearching = TRUE;
3229             if (search->matchedIndex != USEARCH_DONE) {
3230                 // there's no need to set the collation element iterator
3231                 // the next call to next will set the offset.
3232                 return search->matchedIndex;
3233             }
3234         }
3235
3236         if (U_SUCCESS(*status)) {
3237             if (strsrch->pattern.cesLength == 0) {
3238                 if (search->matchedIndex == USEARCH_DONE) {
3239                     search->matchedIndex = offset;
3240                 }
3241                 else { // moves by codepoints
3242                     U16_FWD_1(search->text, search->matchedIndex, textlength);
3243                 }
3244
3245                 search->matchedLength = 0;
3246                 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3247                 // status checked below
3248                 if (search->matchedIndex == textlength) {
3249                     search->matchedIndex = USEARCH_DONE;
3250                 }
3251             }
3252             else {
3253                 if (search->matchedLength > 0) {
3254                     // if matchlength is 0 we are at the start of the iteration
3255                     if (search->isOverlap) {
3256                         ucol_setOffset(strsrch->textIter, offset + 1, status);
3257                     }
3258                     else {
3259                         ucol_setOffset(strsrch->textIter,
3260                                        offset + search->matchedLength, status);
3261                     }
3262                 }
3263                 else {
3264                     // for boundary check purposes. this will ensure that the
3265                     // next match will not preceed the current offset
3266                     // note search->matchedIndex will always be set to something
3267                     // in the code
3268                     search->matchedIndex = offset - 1;
3269                 }
3270
3271                 if (search->isCanonicalMatch) {
3272                     // can't use exact here since extra accents are allowed.
3273                     usearch_handleNextCanonical(strsrch, status);
3274                 }
3275                 else {
3276                     usearch_handleNextExact(strsrch, status);
3277                 }
3278             }
3279
3280             if (U_FAILURE(*status)) {
3281                 return USEARCH_DONE;
3282             }
3283
3284 #if !BOYER_MOORE
3285             if (search->matchedIndex == USEARCH_DONE) {
3286                 ucol_setOffset(strsrch->textIter, search->textLength, status);
3287             } else {
3288                 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
3289             }
3290 #endif
3291
3292             return search->matchedIndex;
3293         }
3294     }
3295     return USEARCH_DONE;
3296 }
3297
3298 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
3299                                               UErrorCode *status)
3300 {
3301     if (U_SUCCESS(*status) && strsrch) {
3302         int32_t offset;
3303         USearch *search = strsrch->search;
3304         if (search->reset) {
3305             offset                     = search->textLength;
3306             search->isForwardSearching = FALSE;
3307             search->reset              = FALSE;
3308             setColEIterOffset(strsrch->textIter, offset);
3309         }
3310         else {
3311             offset = usearch_getOffset(strsrch);
3312         }
3313
3314         int32_t matchedindex = search->matchedIndex;
3315         if (search->isForwardSearching == TRUE) {
3316             // switching direction.
3317             // if matchedIndex == USEARCH_DONE, it means that either a
3318             // setOffset has been called or that next ran off the text
3319             // string. the iterator would have been set to offset textLength if
3320             // a match is not found.
3321             search->isForwardSearching = FALSE;
3322             if (matchedindex != USEARCH_DONE) {
3323                 return matchedindex;
3324             }
3325         }
3326         else {
3327 #if BOYER_MOORE
3328             if (offset == 0 || matchedindex == 0 ||
3329                 (!search->isOverlap &&
3330                     (offset < strsrch->pattern.defaultShiftSize ||
3331                     (matchedindex != USEARCH_DONE &&
3332                     matchedindex < strsrch->pattern.defaultShiftSize)))) {
3333                 // not enough characters to match
3334                 setMatchNotFound(strsrch);
3335                 return USEARCH_DONE;
3336             }
3337 #else
3338             // Could check pattern length, but the
3339             // linear search will do the right thing
3340             if (offset == 0 || matchedindex == 0) {
3341                 setMatchNotFound(strsrch);
3342                 return USEARCH_DONE;
3343             }
3344 #endif
3345         }
3346
3347         if (U_SUCCESS(*status)) {
3348             if (strsrch->pattern.cesLength == 0) {
3349                 search->matchedIndex =
3350                       (matchedindex == USEARCH_DONE ? offset : matchedindex);
3351                 if (search->matchedIndex == 0) {
3352                     setMatchNotFound(strsrch);
3353                     // status checked below
3354                 }
3355                 else { // move by codepoints
3356                     U16_BACK_1(search->text, 0, search->matchedIndex);
3357                     setColEIterOffset(strsrch->textIter, search->matchedIndex);
3358                     // status checked below
3359                     search->matchedLength = 0;
3360                 }
3361             }
3362             else {
3363                 if (strsrch->search->isCanonicalMatch) {
3364                     // can't use exact here since extra accents are allowed.
3365                     usearch_handlePreviousCanonical(strsrch, status);
3366                     // status checked below
3367                 }
3368                 else {
3369                     usearch_handlePreviousExact(strsrch, status);
3370                     // status checked below
3371                 }
3372             }
3373
3374             if (U_FAILURE(*status)) {
3375                 return USEARCH_DONE;
3376             }
3377
3378             return search->matchedIndex;
3379         }
3380     }
3381     return USEARCH_DONE;
3382 }
3383
3384
3385
3386 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
3387 {
3388     /*
3389     reset is setting the attributes that are already in
3390     string search, hence all attributes in the collator should
3391     be retrieved without any problems
3392     */
3393     if (strsrch) {
3394         UErrorCode status            = U_ZERO_ERROR;
3395         UBool      sameCollAttribute = TRUE;
3396         uint32_t   ceMask;
3397         UBool      shift;
3398         uint32_t   varTop;
3399
3400         // **** hack to deal w/ how processed CEs encode quaternary ****
3401         UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
3402         if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
3403             (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
3404                 sameCollAttribute = FALSE;
3405         }
3406
3407         strsrch->strength    = ucol_getStrength(strsrch->collator);
3408         ceMask = getMask(strsrch->strength);
3409         if (strsrch->ceMask != ceMask) {
3410             strsrch->ceMask = ceMask;
3411             sameCollAttribute = FALSE;
3412         }
3413
3414         // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3415         shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
3416                                   &status) == UCOL_SHIFTED;
3417         if (strsrch->toShift != shift) {
3418             strsrch->toShift  = shift;
3419             sameCollAttribute = FALSE;
3420         }
3421
3422         // if status is a failure, ucol_getVariableTop returns 0
3423         varTop = ucol_getVariableTop(strsrch->collator, &status);
3424         if (strsrch->variableTop != varTop) {
3425             strsrch->variableTop = varTop;
3426             sameCollAttribute    = FALSE;
3427         }
3428         if (!sameCollAttribute) {
3429             initialize(strsrch, &status);
3430         }
3431         ucol_setText(strsrch->textIter, strsrch->search->text,
3432                               strsrch->search->textLength,
3433                               &status);
3434         strsrch->search->matchedLength      = 0;
3435         strsrch->search->matchedIndex       = USEARCH_DONE;
3436         strsrch->search->isOverlap          = FALSE;
3437         strsrch->search->isCanonicalMatch   = FALSE;
3438         strsrch->search->elementComparisonType = 0;
3439         strsrch->search->isForwardSearching = TRUE;
3440         strsrch->search->reset              = TRUE;
3441     }
3442 }
3443
3444 //
3445 //  CEI  Collation Element + source text index.
3446 //       These structs are kept in the circular buffer.
3447 //
3448 struct  CEI {
3449     int64_t ce;
3450     int32_t lowIndex;
3451     int32_t highIndex;
3452 };
3453
3454 U_NAMESPACE_BEGIN
3455
3456 namespace {
3457 //
3458 //  CEIBuffer   A circular buffer of CEs-with-index from the text being searched.
3459 //
3460 #define   DEFAULT_CEBUFFER_SIZE 96
3461 #define   CEBUFFER_EXTRA 32
3462 // Some typical max values to make buffer size more reasonable for asymmetric search.
3463 // #8694 is for a better long-term solution to allocation of this buffer.
3464 #define   MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
3465 #define   MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
3466 #define   MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
3467 struct CEIBuffer {
3468     CEI                  defBuf[DEFAULT_CEBUFFER_SIZE];
3469     CEI                 *buf;
3470     int32_t              bufSize;
3471     int32_t              firstIx;
3472     int32_t              limitIx;
3473     UCollationElements  *ceIter;
3474     UStringSearch       *strSearch;
3475
3476
3477
3478                CEIBuffer(UStringSearch *ss, UErrorCode *status);
3479                ~CEIBuffer();
3480    const CEI   *get(int32_t index);
3481    const CEI   *getPrevious(int32_t index);
3482 };
3483
3484
3485 CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) {
3486     buf = defBuf;
3487     strSearch = ss;
3488     bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA;
3489     if (ss->search->elementComparisonType != 0) {
3490         const UChar * patText = ss->pattern.text;
3491         if (patText) {
3492             const UChar * patTextLimit = patText + ss->pattern.textLength;
3493             while ( patText < patTextLimit ) {
3494                 UChar c = *patText++;
3495                 if (MIGHT_BE_JAMO_L(c)) {
3496                     bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
3497                 } else {
3498                     // No check for surrogates, we might allocate slightly more buffer than necessary.
3499                     bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
3500                 }
3501             }
3502         }
3503     }
3504     ceIter    = ss->textIter;
3505     firstIx = 0;
3506     limitIx = 0;
3507
3508     if (!initTextProcessedIter(ss, status)) { return; }
3509
3510     if (bufSize>DEFAULT_CEBUFFER_SIZE) {
3511         buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
3512         if (buf == NULL) {
3513             *status = U_MEMORY_ALLOCATION_ERROR;
3514         }
3515     }
3516 }
3517
3518 // TODO: add a reset or init function so that allocated
3519 //       buffers can be retained & reused.
3520
3521 CEIBuffer::~CEIBuffer() {
3522     if (buf != defBuf) {
3523         uprv_free(buf);
3524     }
3525 }
3526
3527
3528 // Get the CE with the specified index.
3529 //   Index must be in the range
3530 //          n-history_size < index < n+1
3531 //   where n is the largest index to have been fetched by some previous call to this function.
3532 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3533 //
3534 const CEI *CEIBuffer::get(int32_t index) {
3535     int i = index % bufSize;
3536
3537     if (index>=firstIx && index<limitIx) {
3538         // The request was for an entry already in our buffer.
3539         //  Just return it.
3540         return &buf[i];
3541     }
3542
3543     // Caller is requesting a new, never accessed before, CE.
3544     //   Verify that it is the next one in sequence, which is all
3545     //   that is allowed.
3546     if (index != limitIx) {
3547         U_ASSERT(FALSE);
3548
3549         return NULL;
3550     }
3551
3552     // Manage the circular CE buffer indexing
3553     limitIx++;
3554
3555     if (limitIx - firstIx >= bufSize) {
3556         // The buffer is full, knock out the lowest-indexed entry.
3557         firstIx++;
3558     }
3559
3560     UErrorCode status = U_ZERO_ERROR;
3561
3562     buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3563
3564     return &buf[i];
3565 }
3566
3567 // Get the CE with the specified index.
3568 //   Index must be in the range
3569 //          n-history_size < index < n+1
3570 //   where n is the largest index to have been fetched by some previous call to this function.
3571 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3572 //
3573 const CEI *CEIBuffer::getPrevious(int32_t index) {
3574     int i = index % bufSize;
3575
3576     if (index>=firstIx && index<limitIx) {
3577         // The request was for an entry already in our buffer.
3578         //  Just return it.
3579         return &buf[i];
3580     }
3581
3582     // Caller is requesting a new, never accessed before, CE.
3583     //   Verify that it is the next one in sequence, which is all
3584     //   that is allowed.
3585     if (index != limitIx) {
3586         U_ASSERT(FALSE);
3587
3588         return NULL;
3589     }
3590
3591     // Manage the circular CE buffer indexing
3592     limitIx++;
3593
3594     if (limitIx - firstIx >= bufSize) {
3595         // The buffer is full, knock out the lowest-indexed entry.
3596         firstIx++;
3597     }
3598
3599     UErrorCode status = U_ZERO_ERROR;
3600
3601     buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3602
3603     return &buf[i];
3604 }
3605
3606 }
3607
3608 U_NAMESPACE_END
3609
3610
3611 // #define USEARCH_DEBUG
3612
3613 #ifdef USEARCH_DEBUG
3614 #include <stdio.h>
3615 #include <stdlib.h>
3616 #endif
3617
3618 /*
3619  * Find the next break boundary after startIndex. If the UStringSearch object
3620  * has an external break iterator, use that. Otherwise use the internal character
3621  * break iterator.
3622  */
3623 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
3624 #if 0
3625     const UChar *text = strsrch->search->text;
3626     int32_t textLen   = strsrch->search->textLength;
3627
3628     U_ASSERT(startIndex>=0);
3629     U_ASSERT(startIndex<=textLen);
3630
3631     if (startIndex >= textLen) {
3632         return startIndex;
3633     }
3634
3635     UChar32  c;
3636     int32_t  i = startIndex;
3637     U16_NEXT(text, i, textLen, c);
3638
3639     // If we are on a control character, stop without looking for combining marks.
3640     //    Control characters do not combine.
3641     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3642     if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
3643         return i;
3644     }
3645
3646     // The initial character was not a control, and can thus accept trailing
3647     //   combining characters.  Advance over however many of them there are.
3648     int32_t  indexOfLastCharChecked;
3649     for (;;) {
3650         indexOfLastCharChecked = i;
3651         if (i>=textLen) {
3652             break;
3653         }
3654         U16_NEXT(text, i, textLen, c);
3655         gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3656         if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3657             break;
3658         }
3659     }
3660     return indexOfLastCharChecked;
3661 #elif !UCONFIG_NO_BREAK_ITERATION
3662     UBreakIterator *breakiterator = strsrch->search->breakIter;
3663
3664     if (breakiterator == NULL) {
3665         breakiterator = strsrch->search->internalBreakIter;
3666     }
3667
3668     if (breakiterator != NULL) {
3669         return ubrk_following(breakiterator, startIndex);
3670     }
3671
3672     return startIndex;
3673 #else
3674     // **** or should we use the original code? ****
3675     return startIndex;
3676 #endif
3677
3678 }
3679
3680 /*
3681  * Returns TRUE if index is on a break boundary. If the UStringSearch
3682  * has an external break iterator, test using that, otherwise test
3683  * using the internal character break iterator.
3684  */
3685 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
3686 #if 0
3687     const UChar *text = strsrch->search->text;
3688     int32_t textLen   = strsrch->search->textLength;
3689
3690     U_ASSERT(index>=0);
3691     U_ASSERT(index<=textLen);
3692
3693     if (index>=textLen || index<=0) {
3694         return TRUE;
3695     }
3696
3697     // If the character at the current index is not a GRAPHEME_EXTEND
3698     //    then we can not be within a combining sequence.
3699     UChar32  c;
3700     U16_GET(text, 0, index, textLen, c);
3701     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3702     if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3703         return TRUE;
3704     }
3705
3706     // We are at a combining mark.  If the preceding character is anything
3707     //   except a CONTROL, CR or LF, we are in a combining sequence.
3708     U16_PREV(text, 0, index, c);
3709     gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3710     UBool combining =  !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
3711     return !combining;
3712 #elif !UCONFIG_NO_BREAK_ITERATION
3713     UBreakIterator *breakiterator = strsrch->search->breakIter;
3714
3715     if (breakiterator == NULL) {
3716         breakiterator = strsrch->search->internalBreakIter;
3717     }
3718
3719     return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));
3720 #else
3721     // **** or use the original code? ****
3722     return TRUE;
3723 #endif
3724 }
3725
3726 #if 0
3727 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
3728 {
3729 #if !UCONFIG_NO_BREAK_ITERATION
3730     UBreakIterator *breakiterator = strsrch->search->breakIter;
3731
3732     if (breakiterator != NULL) {
3733         int32_t startindex = ubrk_first(breakiterator);
3734         int32_t endindex   = ubrk_last(breakiterator);
3735
3736         // out-of-range indexes are never boundary positions
3737         if (start < startindex || start > endindex ||
3738             end < startindex || end > endindex) {
3739             return FALSE;
3740         }
3741
3742         return ubrk_isBoundary(breakiterator, start) &&
3743                ubrk_isBoundary(breakiterator, end);
3744     }
3745 #endif
3746
3747     return TRUE;
3748 }
3749 #endif
3750
3751 typedef enum {
3752     U_CE_MATCH = -1,
3753     U_CE_NO_MATCH = 0,
3754     U_CE_SKIP_TARG,
3755     U_CE_SKIP_PATN
3756 } UCompareCEsResult;
3757 #define U_CE_LEVEL2_BASE 0x00000005
3758 #define U_CE_LEVEL3_BASE 0x00050000
3759
3760 static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
3761     if (targCE == patCE) {
3762         return U_CE_MATCH;
3763     }
3764     if (compareType == 0) {
3765         return U_CE_NO_MATCH;
3766     }
3767
3768     int64_t targCEshifted = targCE >> 32;
3769     int64_t patCEshifted = patCE >> 32;
3770     int64_t mask;
3771
3772     mask = 0xFFFF0000;
3773     int32_t targLev1 = (int32_t)(targCEshifted & mask);
3774     int32_t patLev1 = (int32_t)(patCEshifted & mask);
3775     if ( targLev1 != patLev1 ) {
3776         if ( targLev1 == 0 ) {
3777             return U_CE_SKIP_TARG;
3778         }
3779         if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3780             return U_CE_SKIP_PATN;
3781         }
3782         return U_CE_NO_MATCH;
3783     }
3784
3785     mask = 0x0000FFFF;
3786     int32_t targLev2 = (int32_t)(targCEshifted & mask);
3787     int32_t patLev2 = (int32_t)(patCEshifted & mask);
3788     if ( targLev2 != patLev2 ) {
3789         if ( targLev2 == 0 ) {
3790             return U_CE_SKIP_TARG;
3791         }
3792         if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3793             return U_CE_SKIP_PATN;
3794         }
3795         return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
3796             U_CE_MATCH: U_CE_NO_MATCH;
3797     }
3798
3799     mask = 0xFFFF0000;
3800     int32_t targLev3 = (int32_t)(targCE & mask);
3801     int32_t patLev3 = (int32_t)(patCE & mask);
3802     if ( targLev3 != patLev3 ) {
3803         return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
3804             U_CE_MATCH: U_CE_NO_MATCH;
3805    }
3806
3807     return U_CE_MATCH;
3808 }
3809
3810 #if BOYER_MOORE
3811 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
3812 #endif
3813
3814 namespace {
3815
3816 UChar32 codePointAt(const USearch &search, int32_t index) {
3817     if (index < search.textLength) {
3818         UChar32 c;
3819         U16_NEXT(search.text, index, search.textLength, c);
3820         return c;
3821     }
3822     return U_SENTINEL;
3823 }
3824
3825 UChar32 codePointBefore(const USearch &search, int32_t index) {
3826     if (0 < index) {
3827         UChar32 c;
3828         U16_PREV(search.text, 0, index, c);
3829         return c;
3830     }
3831     return U_SENTINEL;
3832 }
3833
3834 }  // namespace
3835
3836 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
3837                                        int32_t        startIdx,
3838                                        int32_t        *matchStart,
3839                                        int32_t        *matchLimit,
3840                                        UErrorCode     *status)
3841 {
3842     if (U_FAILURE(*status)) {
3843         return FALSE;
3844     }
3845
3846     // TODO:  reject search patterns beginning with a combining char.
3847
3848 #ifdef USEARCH_DEBUG
3849     if (getenv("USEARCH_DEBUG") != NULL) {
3850         printf("Pattern CEs\n");
3851         for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
3852             printf(" %8x", strsrch->pattern.ces[ii]);
3853         }
3854         printf("\n");
3855     }
3856
3857 #endif
3858     // Input parameter sanity check.
3859     //  TODO:  should input indicies clip to the text length
3860     //         in the same way that UText does.
3861     if(strsrch->pattern.cesLength == 0         ||
3862        startIdx < 0                           ||
3863        startIdx > strsrch->search->textLength ||
3864        strsrch->pattern.ces == NULL) {
3865            *status = U_ILLEGAL_ARGUMENT_ERROR;
3866            return FALSE;
3867     }
3868
3869     if (strsrch->pattern.pces == NULL) {
3870         initializePatternPCETable(strsrch, status);
3871     }
3872
3873     ucol_setOffset(strsrch->textIter, startIdx, status);
3874     CEIBuffer ceb(strsrch, status);
3875
3876
3877     int32_t    targetIx = 0;
3878     const CEI *targetCEI = NULL;
3879     int32_t    patIx;
3880     UBool      found;
3881
3882     int32_t  mStart = -1;
3883     int32_t  mLimit = -1;
3884     int32_t  minLimit;
3885     int32_t  maxLimit;
3886
3887
3888
3889     // Outer loop moves over match starting positions in the
3890     //      target CE space.
3891     // Here we see the target as a sequence of collation elements, resulting from the following:
3892     // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
3893     //    (for example, digraphs such as IJ may be broken into two characters).
3894     // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
3895     //    16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
3896     //    fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
3897     //    CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
3898     //    then the CE is deleted, so the following code sees only CEs that are relevant.
3899     // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
3900     // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
3901     // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
3902     //
3903     for(targetIx=0; ; targetIx++)
3904     {
3905         found = TRUE;
3906         //  Inner loop checks for a match beginning at each
3907         //  position from the outer loop.
3908         int32_t targetIxOffset = 0;
3909         int64_t patCE = 0;
3910         // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
3911         // (compared to the last CE fetched for the previous targetIx value) as we need to go
3912         // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
3913         const CEI *firstCEI = ceb.get(targetIx);
3914         if (firstCEI == NULL) {
3915             *status = U_INTERNAL_PROGRAM_ERROR;
3916             found = FALSE;
3917             break;
3918         }
3919
3920         for (patIx=0; patIx<strsrch->pattern.pcesLength; patIx++) {
3921             patCE = strsrch->pattern.pces[patIx];
3922             targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
3923             //  Compare CE from target string with CE from the pattern.
3924             //    Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
3925             //    which will fail the compare, below.
3926             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
3927             if ( ceMatch == U_CE_NO_MATCH ) {
3928                 found = FALSE;
3929                 break;
3930             } else if ( ceMatch > U_CE_NO_MATCH ) {
3931                 if ( ceMatch == U_CE_SKIP_TARG ) {
3932                     // redo with same patCE, next targCE
3933                     patIx--;
3934                     targetIxOffset++;
3935                 } else { // ceMatch == U_CE_SKIP_PATN
3936                     // redo with same targCE, next patCE
3937                     targetIxOffset--;
3938                 }
3939             }
3940         }
3941         targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far
3942
3943         if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
3944             // No match at this targetIx.  Try again at the next.
3945             continue;
3946         }
3947
3948         if (!found) {
3949             // No match at all, we have run off the end of the target text.
3950             break;
3951         }
3952
3953
3954         // We have found a match in CE space.
3955         // Now determine the bounds in string index space.
3956         //  There still is a chance of match failure if the CE range not correspond to
3957         //     an acceptable character range.
3958         //
3959         const CEI *lastCEI  = ceb.get(targetIx + targetIxOffset - 1);
3960
3961         mStart   = firstCEI->lowIndex;
3962         minLimit = lastCEI->lowIndex;
3963
3964         // Look at the CE following the match.  If it is UCOL_NULLORDER the match
3965         //   extended to the end of input, and the match is good.
3966
3967         // Look at the high and low indices of the CE following the match. If
3968         // they are the same it means one of two things:
3969         //    1. The match extended to the last CE from the target text, which is OK, or
3970         //    2. The last CE that was part of the match is in an expansion that extends
3971         //       to the first CE after the match. In this case, we reject the match.
3972         const CEI *nextCEI = 0;
3973         if (strsrch->search->elementComparisonType == 0) {
3974             nextCEI  = ceb.get(targetIx + targetIxOffset);
3975             maxLimit = nextCEI->lowIndex;
3976             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
3977                 found = FALSE;
3978             }
3979         } else {
3980             for ( ; ; ++targetIxOffset ) {
3981                 nextCEI = ceb.get(targetIx + targetIxOffset);
3982                 maxLimit = nextCEI->lowIndex;
3983                 // If we are at the end of the target too, match succeeds
3984                 if (  nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
3985                     break;
3986                 }
3987                 // As long as the next CE has primary weight of 0,
3988                 // it is part of the last target element matched by the pattern;
3989                 // make sure it can be part of a match with the last patCE
3990                 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
3991                     UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
3992                     if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {
3993                         found = FALSE;
3994                         break;
3995                     }
3996                 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
3997                 // target element, but it has non-zero primary weight => match fails
3998                 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
3999                     found = false;
4000                     break;
4001                 // Else the target CE is not part of an expansion of the last matched element, match succeeds
4002                 } else {
4003                     break;
4004                 }
4005             }
4006         }
4007
4008
4009         // Check for the start of the match being within a combining sequence.
4010         //   This can happen if the pattern itself begins with a combining char, and
4011         //   the match found combining marks in the target text that were attached
4012         //    to something else.
4013         //   This type of match should be rejected for not completely consuming a
4014         //   combining sequence.
4015         if (!isBreakBoundary(strsrch, mStart)) {
4016             found = FALSE;
4017         }
4018
4019         // Check for the start of the match being within an Collation Element Expansion,
4020         //   meaning that the first char of the match is only partially matched.
4021         //   With exapnsions, the first CE will report the index of the source
4022         //   character, and all subsequent (expansions) CEs will report the source index of the
4023         //    _following_ character.
4024         int32_t secondIx = firstCEI->highIndex;
4025         if (mStart == secondIx) {
4026             found = FALSE;
4027         }
4028
4029         // Allow matches to end in the middle of a grapheme cluster if the following
4030         // conditions are met; this is needed to make prefix search work properly in
4031         // Indic, see #11750
4032         // * the default breakIter is being used
4033         // * the next collation element after this combining sequence
4034         //   - has non-zero primary weight
4035         //   - corresponds to a separate character following the one at end of the current match
4036         //   (the second of these conditions, and perhaps both, may be redundant given the
4037         //   subsequent check for normalization boundary; however they are likely much faster
4038         //   tests in any case)
4039         // * the match limit is a normalization boundary
4040         UBool allowMidclusterMatch = FALSE;
4041         if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
4042             allowMidclusterMatch =
4043                     strsrch->search->breakIter == NULL &&
4044                     nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4045                     maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
4046                     (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
4047                         strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
4048         }
4049         // If those conditions are met, then:
4050         // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
4051         //   the match limit may be backed off to a previous break boundary. This handles
4052         //   cases in which mLimit includes target characters that are ignorable with current
4053         //   settings (such as space) and which extend beyond the pattern match.
4054         // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4055         // * do NOT require that match limit be on a breakIter boundary
4056
4057         //  Advance the match end position to the first acceptable match boundary.
4058         //    This advances the index over any combining charcters.
4059         mLimit = maxLimit;
4060         if (minLimit < maxLimit) {
4061             // When the last CE's low index is same with its high index, the CE is likely
4062             // a part of expansion. In this case, the index is located just after the
4063             // character corresponding to the CEs compared above. If the index is right
4064             // at the break boundary, move the position to the next boundary will result
4065             // incorrect match length when there are ignorable characters exist between
4066             // the position and the next character produces CE(s). See ticket#8482.
4067             if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {
4068                 mLimit = minLimit;
4069             } else {
4070                 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4071                 // Note that we can have nba < maxLimit && nba >= minLImit, in which
4072                 // case we want to set mLimit to nba regardless of allowMidclusterMatch
4073                 // (i.e. we back off mLimit to the previous breakIterator boundary).
4074                 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
4075                     mLimit = nba;
4076                 }
4077             }
4078         }
4079
4080     #ifdef USEARCH_DEBUG
4081         if (getenv("USEARCH_DEBUG") != NULL) {
4082             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4083         }
4084     #endif
4085
4086         if (!allowMidclusterMatch) {
4087             // If advancing to the end of a combining sequence in character indexing space
4088             //   advanced us beyond the end of the match in CE space, reject this match.
4089             if (mLimit > maxLimit) {
4090                 found = FALSE;
4091             }
4092
4093             if (!isBreakBoundary(strsrch, mLimit)) {
4094                 found = FALSE;
4095             }
4096         }
4097
4098         if (! checkIdentical(strsrch, mStart, mLimit)) {
4099             found = FALSE;
4100         }
4101
4102         if (found) {
4103             break;
4104         }
4105     }
4106
4107     #ifdef USEARCH_DEBUG
4108     if (getenv("USEARCH_DEBUG") != NULL) {
4109         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4110         int32_t  lastToPrint = ceb.limitIx+2;
4111         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4112             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4113         }
4114         printf("\n%s\n", found? "match found" : "no match");
4115     }
4116     #endif
4117
4118     // All Done.  Store back the match bounds to the caller.
4119     //
4120     if (found==FALSE) {
4121         mLimit = -1;
4122         mStart = -1;
4123     }
4124
4125     if (matchStart != NULL) {
4126         *matchStart= mStart;
4127     }
4128
4129     if (matchLimit != NULL) {
4130         *matchLimit = mLimit;
4131     }
4132
4133     return found;
4134 }
4135
4136 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch  *strsrch,
4137                                                 int32_t        startIdx,
4138                                                 int32_t        *matchStart,
4139                                                 int32_t        *matchLimit,
4140                                                 UErrorCode     *status)
4141 {
4142     if (U_FAILURE(*status)) {
4143         return FALSE;
4144     }
4145
4146     // TODO:  reject search patterns beginning with a combining char.
4147
4148 #ifdef USEARCH_DEBUG
4149     if (getenv("USEARCH_DEBUG") != NULL) {
4150         printf("Pattern CEs\n");
4151         for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
4152             printf(" %8x", strsrch->pattern.ces[ii]);
4153         }
4154         printf("\n");
4155     }
4156
4157 #endif
4158     // Input parameter sanity check.
4159     //  TODO:  should input indicies clip to the text length
4160     //         in the same way that UText does.
4161     if(strsrch->pattern.cesLength == 0         ||
4162        startIdx < 0                           ||
4163        startIdx > strsrch->search->textLength ||
4164        strsrch->pattern.ces == NULL) {
4165            *status = U_ILLEGAL_ARGUMENT_ERROR;
4166            return FALSE;
4167     }
4168
4169     if (strsrch->pattern.pces == NULL) {
4170         initializePatternPCETable(strsrch, status);
4171     }
4172
4173     CEIBuffer ceb(strsrch, status);
4174     int32_t    targetIx = 0;
4175
4176     /*
4177      * Pre-load the buffer with the CE's for the grapheme
4178      * after our starting position so that we're sure that
4179      * we can look at the CE following the match when we
4180      * check the match boundaries.
4181      *
4182      * This will also pre-fetch the first CE that we'll
4183      * consider for the match.
4184      */
4185     if (startIdx < strsrch->search->textLength) {
4186         UBreakIterator *bi = strsrch->search->internalBreakIter;
4187         int32_t next = ubrk_following(bi, startIdx);
4188
4189         ucol_setOffset(strsrch->textIter, next, status);
4190
4191         for (targetIx = 0; ; targetIx += 1) {
4192             if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
4193                 break;
4194             }
4195         }
4196     } else {
4197         ucol_setOffset(strsrch->textIter, startIdx, status);
4198     }
4199
4200
4201     const CEI *targetCEI = NULL;
4202     int32_t    patIx;
4203     UBool      found;
4204
4205     int32_t  limitIx = targetIx;
4206     int32_t  mStart = -1;
4207     int32_t  mLimit = -1;
4208     int32_t  minLimit;
4209     int32_t  maxLimit;
4210
4211
4212
4213     // Outer loop moves over match starting positions in the
4214     //      target CE space.
4215     // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
4216     // But  patIx is 0 at the beginning of the pattern and increases toward the end.
4217     // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
4218     // and the beginning of the base text.
4219     for(targetIx = limitIx; ; targetIx += 1)
4220     {
4221         found = TRUE;
4222         // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
4223         // (compared to the last CE fetched for the previous targetIx value) as we need to go
4224         // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
4225         const CEI *lastCEI  = ceb.getPrevious(targetIx);
4226         if (lastCEI == NULL) {
4227             *status = U_INTERNAL_PROGRAM_ERROR;
4228             found = FALSE;
4229              break;
4230         }
4231         //  Inner loop checks for a match beginning at each
4232         //  position from the outer loop.
4233         int32_t targetIxOffset = 0;
4234         for (patIx = strsrch->pattern.pcesLength - 1; patIx >= 0; patIx -= 1) {
4235             int64_t patCE = strsrch->pattern.pces[patIx];
4236
4237             targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 - patIx + targetIxOffset);
4238             //  Compare CE from target string with CE from the pattern.
4239             //    Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
4240             //    which will fail the compare, below.
4241             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
4242             if ( ceMatch == U_CE_NO_MATCH ) {
4243                 found = FALSE;
4244                 break;
4245             } else if ( ceMatch > U_CE_NO_MATCH ) {
4246                 if ( ceMatch == U_CE_SKIP_TARG ) {
4247                     // redo with same patCE, next targCE
4248                     patIx++;
4249                     targetIxOffset++;
4250                 } else { // ceMatch == U_CE_SKIP_PATN
4251                     // redo with same targCE, next patCE
4252                     targetIxOffset--;
4253                 }
4254             }
4255         }
4256
4257         if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
4258             // No match at this targetIx.  Try again at the next.
4259             continue;
4260         }
4261
4262         if (!found) {
4263             // No match at all, we have run off the end of the target text.
4264             break;
4265         }
4266
4267
4268         // We have found a match in CE space.
4269         // Now determine the bounds in string index space.
4270         //  There still is a chance of match failure if the CE range not correspond to
4271         //     an acceptable character range.
4272         //
4273         const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 + targetIxOffset);
4274         mStart   = firstCEI->lowIndex;
4275
4276         // Check for the start of the match being within a combining sequence.
4277         //   This can happen if the pattern itself begins with a combining char, and
4278         //   the match found combining marks in the target text that were attached
4279         //    to something else.
4280         //   This type of match should be rejected for not completely consuming a
4281         //   combining sequence.
4282         if (!isBreakBoundary(strsrch, mStart)) {
4283             found = FALSE;
4284         }
4285
4286         // Look at the high index of the first CE in the match. If it's the same as the
4287         // low index, the first CE in the match is in the middle of an expansion.
4288         if (mStart == firstCEI->highIndex) {
4289             found = FALSE;
4290         }
4291
4292
4293         minLimit = lastCEI->lowIndex;
4294
4295         if (targetIx > 0) {
4296             // Look at the CE following the match.  If it is UCOL_NULLORDER the match
4297             //   extended to the end of input, and the match is good.
4298
4299             // Look at the high and low indices of the CE following the match. If
4300             // they are the same it means one of two things:
4301             //    1. The match extended to the last CE from the target text, which is OK, or
4302             //    2. The last CE that was part of the match is in an expansion that extends
4303             //       to the first CE after the match. In this case, we reject the match.
4304             const CEI *nextCEI  = ceb.getPrevious(targetIx - 1);
4305
4306             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
4307                 found = FALSE;
4308             }
4309
4310             mLimit = maxLimit = nextCEI->lowIndex;
4311
4312             // Allow matches to end in the middle of a grapheme cluster if the following
4313             // conditions are met; this is needed to make prefix search work properly in
4314             // Indic, see #11750
4315             // * the default breakIter is being used
4316             // * the next collation element after this combining sequence
4317             //   - has non-zero primary weight
4318             //   - corresponds to a separate character following the one at end of the current match
4319             //   (the second of these conditions, and perhaps both, may be redundant given the
4320             //   subsequent check for normalization boundary; however they are likely much faster
4321             //   tests in any case)
4322             // * the match limit is a normalization boundary
4323             UBool allowMidclusterMatch = FALSE;
4324             if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
4325                 allowMidclusterMatch =
4326                         strsrch->search->breakIter == NULL &&
4327                         nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4328                         maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
4329                         (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
4330                             strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
4331             }
4332             // If those conditions are met, then:
4333             // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
4334             //   the match limit may be backed off to a previous break boundary. This handles
4335             //   cases in which mLimit includes target characters that are ignorable with current
4336             //   settings (such as space) and which extend beyond the pattern match.
4337             // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4338             // * do NOT require that match limit be on a breakIter boundary
4339
4340             //  Advance the match end position to the first acceptable match boundary.
4341             //    This advances the index over any combining characters.
4342             if (minLimit < maxLimit) {
4343                 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4344                 // Note that we can have nba < maxLimit && nba >= minLImit, in which
4345                 // case we want to set mLimit to nba regardless of allowMidclusterMatch
4346                 // (i.e. we back off mLimit to the previous breakIterator boundary).
4347                 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
4348                     mLimit = nba;
4349                 }
4350             }
4351
4352             if (!allowMidclusterMatch) {
4353                 // If advancing to the end of a combining sequence in character indexing space
4354                 //   advanced us beyond the end of the match in CE space, reject this match.
4355                 if (mLimit > maxLimit) {
4356                     found = FALSE;
4357                 }
4358
4359                 // Make sure the end of the match is on a break boundary
4360                 if (!isBreakBoundary(strsrch, mLimit)) {
4361                     found = FALSE;
4362                 }
4363             }
4364
4365         } else {
4366             // No non-ignorable CEs after this point.
4367             // The maximum position is detected by boundary after
4368             // the last non-ignorable CE. Combining sequence
4369             // across the start index will be truncated.
4370             int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4371             mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
4372         }
4373
4374     #ifdef USEARCH_DEBUG
4375         if (getenv("USEARCH_DEBUG") != NULL) {
4376             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4377         }
4378     #endif
4379
4380
4381         if (! checkIdentical(strsrch, mStart, mLimit)) {
4382             found = FALSE;
4383         }
4384
4385         if (found) {
4386             break;
4387         }
4388     }
4389
4390     #ifdef USEARCH_DEBUG
4391     if (getenv("USEARCH_DEBUG") != NULL) {
4392         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4393         int32_t  lastToPrint = ceb.limitIx+2;
4394         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4395             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4396         }
4397         printf("\n%s\n", found? "match found" : "no match");
4398     }
4399     #endif
4400
4401     // All Done.  Store back the match bounds to the caller.
4402     //
4403     if (found==FALSE) {
4404         mLimit = -1;
4405         mStart = -1;
4406     }
4407
4408     if (matchStart != NULL) {
4409         *matchStart= mStart;
4410     }
4411
4412     if (matchLimit != NULL) {
4413         *matchLimit = mLimit;
4414     }
4415
4416     return found;
4417 }
4418
4419 // internal use methods declared in usrchimp.h -----------------------------
4420
4421 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
4422 {
4423     if (U_FAILURE(*status)) {
4424         setMatchNotFound(strsrch);
4425         return FALSE;
4426     }
4427
4428 #if BOYER_MOORE
4429     UCollationElements *coleiter        = strsrch->textIter;
4430     int32_t             textlength      = strsrch->search->textLength;
4431     int32_t            *patternce       = strsrch->pattern.ces;
4432     int32_t             patterncelength = strsrch->pattern.cesLength;
4433     int32_t             textoffset      = ucol_getOffset(coleiter);
4434
4435     // status used in setting coleiter offset, since offset is checked in
4436     // shiftForward before setting the coleiter offset, status never
4437     // a failure
4438     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4439                               patterncelength);
4440     while (textoffset <= textlength)
4441     {
4442         uint32_t    patternceindex = patterncelength - 1;
4443         int32_t     targetce;
4444         UBool       found          = FALSE;
4445         int32_t    lastce          = UCOL_NULLORDER;
4446
4447         setColEIterOffset(coleiter, textoffset);
4448
4449         for (;;) {
4450             // finding the last pattern ce match, imagine composite characters
4451             // for example: search for pattern A in text \u00C0
4452             // we'll have to skip \u0300 the grave first before we get to A
4453             targetce = ucol_previous(coleiter, status);
4454             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4455                 found = FALSE;
4456                 break;
4457             }
4458             targetce = getCE(strsrch, targetce);
4459             if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
4460                 // this is for the text \u0315\u0300 that requires
4461                 // normalization and pattern \u0300, where \u0315 is ignorable
4462                 continue;
4463             }
4464             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4465                 lastce = targetce;
4466             }
4467             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4468             if (targetce == patternce[patternceindex]) {
4469                 // the first ce can be a contraction
4470                 found = TRUE;
4471                 break;
4472             }
4473             if (!hasExpansion(coleiter)) {
4474                 found = FALSE;
4475                 break;
4476             }
4477         }
4478
4479         //targetce = lastce;
4480
4481         while (found && patternceindex > 0) {
4482             lastce = targetce;
4483             targetce    = ucol_previous(coleiter, status);
4484             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4485                 found = FALSE;
4486                 break;
4487             }
4488             targetce    = getCE(strsrch, targetce);
4489             if (targetce == UCOL_IGNORABLE) {
4490                 continue;
4491             }
4492
4493             patternceindex --;
4494             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4495             found = found && targetce == patternce[patternceindex];
4496         }
4497
4498         targetce = lastce;
4499
4500         if (!found) {
4501             if (U_FAILURE(*status)) {
4502                 break;
4503             }
4504             textoffset = shiftForward(strsrch, textoffset, lastce,
4505                                       patternceindex);
4506             // status checked at loop.
4507             patternceindex = patterncelength;
4508             continue;
4509         }
4510
4511         if (checkNextExactMatch(strsrch, &textoffset, status)) {
4512             // status checked in ucol_setOffset
4513             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4514             return TRUE;
4515         }
4516     }
4517     setMatchNotFound(strsrch);
4518     return FALSE;
4519 #else
4520     int32_t textOffset = ucol_getOffset(strsrch->textIter);
4521     int32_t start = -1;
4522     int32_t end = -1;
4523
4524     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4525         strsrch->search->matchedIndex  = start;
4526         strsrch->search->matchedLength = end - start;
4527         return TRUE;
4528     } else {
4529         setMatchNotFound(strsrch);
4530         return FALSE;
4531     }
4532 #endif
4533 }
4534
4535 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
4536 {
4537     if (U_FAILURE(*status)) {
4538         setMatchNotFound(strsrch);
4539         return FALSE;
4540     }
4541
4542 #if BOYER_MOORE
4543     UCollationElements *coleiter        = strsrch->textIter;
4544     int32_t             textlength      = strsrch->search->textLength;
4545     int32_t            *patternce       = strsrch->pattern.ces;
4546     int32_t             patterncelength = strsrch->pattern.cesLength;
4547     int32_t             textoffset      = ucol_getOffset(coleiter);
4548     UBool               hasPatternAccents =
4549        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4550
4551     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4552                               patterncelength);
4553     strsrch->canonicalPrefixAccents[0] = 0;
4554     strsrch->canonicalSuffixAccents[0] = 0;
4555
4556     while (textoffset <= textlength)
4557     {
4558         int32_t     patternceindex = patterncelength - 1;
4559         int32_t     targetce;
4560         UBool       found          = FALSE;
4561         int32_t     lastce         = UCOL_NULLORDER;
4562
4563         setColEIterOffset(coleiter, textoffset);
4564
4565         for (;;) {
4566             // finding the last pattern ce match, imagine composite characters
4567             // for example: search for pattern A in text \u00C0
4568             // we'll have to skip \u0300 the grave first before we get to A
4569             targetce = ucol_previous(coleiter, status);
4570             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4571                 found = FALSE;
4572                 break;
4573             }
4574             targetce = getCE(strsrch, targetce);
4575             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4576                 lastce = targetce;
4577             }
4578             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4579             if (targetce == patternce[patternceindex]) {
4580                 // the first ce can be a contraction
4581                 found = TRUE;
4582                 break;
4583             }
4584             if (!hasExpansion(coleiter)) {
4585                 found = FALSE;
4586                 break;
4587             }
4588         }
4589
4590         while (found && patternceindex > 0) {
4591             targetce    = ucol_previous(coleiter, status);
4592             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4593                 found = FALSE;
4594                 break;
4595             }
4596             targetce    = getCE(strsrch, targetce);
4597             if (targetce == UCOL_IGNORABLE) {
4598                 continue;
4599             }
4600
4601             patternceindex --;
4602             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4603             found = found && targetce == patternce[patternceindex];
4604         }
4605
4606         // initializing the rearranged accent array
4607         if (hasPatternAccents && !found) {
4608             strsrch->canonicalPrefixAccents[0] = 0;
4609             strsrch->canonicalSuffixAccents[0] = 0;
4610             if (U_FAILURE(*status)) {
4611                 break;
4612             }
4613             found = doNextCanonicalMatch(strsrch, textoffset, status);
4614         }
4615
4616         if (!found) {
4617             if (U_FAILURE(*status)) {
4618                 break;
4619             }
4620             textoffset = shiftForward(strsrch, textoffset, lastce,
4621                                       patternceindex);
4622             // status checked at loop
4623             patternceindex = patterncelength;
4624             continue;
4625         }
4626
4627         if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
4628             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4629             return TRUE;
4630         }
4631     }
4632     setMatchNotFound(strsrch);
4633     return FALSE;
4634 #else
4635     int32_t textOffset = ucol_getOffset(strsrch->textIter);
4636     int32_t start = -1;
4637     int32_t end = -1;
4638
4639     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4640         strsrch->search->matchedIndex  = start;
4641         strsrch->search->matchedLength = end - start;
4642         return TRUE;
4643     } else {
4644         setMatchNotFound(strsrch);
4645         return FALSE;
4646     }
4647 #endif
4648 }
4649
4650 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
4651 {
4652     if (U_FAILURE(*status)) {
4653         setMatchNotFound(strsrch);
4654         return FALSE;
4655     }
4656
4657 #if BOYER_MOORE
4658     UCollationElements *coleiter        = strsrch->textIter;
4659     int32_t            *patternce       = strsrch->pattern.ces;
4660     int32_t             patterncelength = strsrch->pattern.cesLength;
4661     int32_t             textoffset      = ucol_getOffset(coleiter);
4662
4663     // shifting it check for setting offset
4664     // if setOffset is called previously or there was no previous match, we
4665     // leave the offset as it is.
4666     if (strsrch->search->matchedIndex != USEARCH_DONE) {
4667         textoffset = strsrch->search->matchedIndex;
4668     }
4669
4670     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4671                               patterncelength);
4672
4673     while (textoffset >= 0)
4674     {
4675         int32_t     patternceindex = 1;
4676         int32_t     targetce;
4677         UBool       found          = FALSE;
4678         int32_t     firstce        = UCOL_NULLORDER;
4679
4680         // if status is a failure, ucol_setOffset does nothing
4681         setColEIterOffset(coleiter, textoffset);
4682
4683         for (;;) {
4684             // finding the first pattern ce match, imagine composite
4685             // characters. for example: search for pattern \u0300 in text
4686             // \u00C0, we'll have to skip A first before we get to
4687             // \u0300 the grave accent
4688             targetce = ucol_next(coleiter, status);
4689             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4690                 found = FALSE;
4691                 break;
4692             }
4693             targetce = getCE(strsrch, targetce);
4694             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4695                 firstce = targetce;
4696             }
4697             if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
4698                 continue;
4699             }
4700             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4701             if (targetce == patternce[0]) {
4702                 found = TRUE;
4703                 break;
4704             }
4705             if (!hasExpansion(coleiter)) {
4706                 // checking for accents in composite character
4707                 found = FALSE;
4708                 break;
4709             }
4710         }
4711
4712         //targetce = firstce;
4713
4714         while (found && (patternceindex < patterncelength)) {
4715             firstce = targetce;
4716             targetce    = ucol_next(coleiter, status);
4717             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4718                 found = FALSE;
4719                 break;
4720             }
4721             targetce    = getCE(strsrch, targetce);
4722             if (targetce == UCOL_IGNORABLE) {
4723                 continue;
4724             }
4725
4726             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4727             found = found && targetce == patternce[patternceindex];
4728             patternceindex ++;
4729         }
4730
4731         targetce = firstce;
4732
4733         if (!found) {
4734             if (U_FAILURE(*status)) {
4735                 break;
4736             }
4737
4738             textoffset = reverseShift(strsrch, textoffset, targetce,
4739                                       patternceindex);
4740             patternceindex = 0;
4741             continue;
4742         }
4743
4744         if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
4745             setColEIterOffset(coleiter, textoffset);
4746             return TRUE;
4747         }
4748     }
4749     setMatchNotFound(strsrch);
4750     return FALSE;
4751 #else
4752     int32_t textOffset;
4753
4754     if (strsrch->search->isOverlap) {
4755         if (strsrch->search->matchedIndex != USEARCH_DONE) {
4756             textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4757         } else {
4758             // move the start position at the end of possible match
4759             initializePatternPCETable(strsrch, status);
4760             if (!initTextProcessedIter(strsrch, status)) {
4761                 setMatchNotFound(strsrch);
4762                 return FALSE;
4763             }
4764             for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
4765                 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4766                 if (pce == UCOL_PROCESSED_NULLORDER) {
4767                     // at the end of the text
4768                     break;
4769                 }
4770             }
4771             if (U_FAILURE(*status)) {
4772                 setMatchNotFound(strsrch);
4773                 return FALSE;
4774             }
4775             textOffset = ucol_getOffset(strsrch->textIter);
4776         }
4777     } else {
4778         textOffset = ucol_getOffset(strsrch->textIter);
4779     }
4780
4781     int32_t start = -1;
4782     int32_t end = -1;
4783
4784     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4785         strsrch->search->matchedIndex = start;
4786         strsrch->search->matchedLength = end - start;
4787         return TRUE;
4788     } else {
4789         setMatchNotFound(strsrch);
4790         return FALSE;
4791     }
4792 #endif
4793 }
4794
4795 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
4796                                       UErrorCode    *status)
4797 {
4798     if (U_FAILURE(*status)) {
4799         setMatchNotFound(strsrch);
4800         return FALSE;
4801     }
4802
4803 #if BOYER_MOORE
4804     UCollationElements *coleiter        = strsrch->textIter;
4805     int32_t            *patternce       = strsrch->pattern.ces;
4806     int32_t             patterncelength = strsrch->pattern.cesLength;
4807     int32_t             textoffset      = ucol_getOffset(coleiter);
4808     UBool               hasPatternAccents =
4809        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4810
4811     // shifting it check for setting offset
4812     // if setOffset is called previously or there was no previous match, we
4813     // leave the offset as it is.
4814     if (strsrch->search->matchedIndex != USEARCH_DONE) {
4815         textoffset = strsrch->search->matchedIndex;
4816     }
4817
4818     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4819                               patterncelength);
4820     strsrch->canonicalPrefixAccents[0] = 0;
4821     strsrch->canonicalSuffixAccents[0] = 0;
4822
4823     while (textoffset >= 0)
4824     {
4825         int32_t     patternceindex = 1;
4826         int32_t     targetce;
4827         UBool       found          = FALSE;
4828         int32_t     firstce        = UCOL_NULLORDER;
4829
4830         setColEIterOffset(coleiter, textoffset);
4831         for (;;) {
4832             // finding the first pattern ce match, imagine composite
4833             // characters. for example: search for pattern \u0300 in text
4834             // \u00C0, we'll have to skip A first before we get to
4835             // \u0300 the grave accent
4836             targetce = ucol_next(coleiter, status);
4837             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4838                 found = FALSE;
4839                 break;
4840             }
4841             targetce = getCE(strsrch, targetce);
4842             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4843                 firstce = targetce;
4844             }
4845
4846             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4847             if (targetce == patternce[0]) {
4848                 // the first ce can be a contraction
4849                 found = TRUE;
4850                 break;
4851             }
4852             if (!hasExpansion(coleiter)) {
4853                 // checking for accents in composite character
4854                 found = FALSE;
4855                 break;
4856             }
4857         }
4858
4859         targetce = firstce;
4860
4861         while (found && patternceindex < patterncelength) {
4862             targetce    = ucol_next(coleiter, status);
4863             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4864                 found = FALSE;
4865                 break;
4866             }
4867             targetce = getCE(strsrch, targetce);
4868             if (targetce == UCOL_IGNORABLE) {
4869                 continue;
4870             }
4871
4872             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4873             found = found && targetce == patternce[patternceindex];
4874             patternceindex ++;
4875         }
4876
4877         // initializing the rearranged accent array
4878         if (hasPatternAccents && !found) {
4879             strsrch->canonicalPrefixAccents[0] = 0;
4880             strsrch->canonicalSuffixAccents[0] = 0;
4881             if (U_FAILURE(*status)) {
4882                 break;
4883             }
4884             found = doPreviousCanonicalMatch(strsrch, textoffset, status);
4885         }
4886
4887         if (!found) {
4888             if (U_FAILURE(*status)) {
4889                 break;
4890             }
4891             textoffset = reverseShift(strsrch, textoffset, targetce,
4892                                       patternceindex);
4893             patternceindex = 0;
4894             continue;
4895         }
4896
4897         if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
4898             setColEIterOffset(coleiter, textoffset);
4899             return TRUE;
4900         }
4901     }
4902     setMatchNotFound(strsrch);
4903     return FALSE;
4904 #else
4905     int32_t textOffset;
4906
4907     if (strsrch->search->isOverlap) {
4908         if (strsrch->search->matchedIndex != USEARCH_DONE) {
4909             textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4910         } else {
4911             // move the start position at the end of possible match
4912             initializePatternPCETable(strsrch, status);
4913             if (!initTextProcessedIter(strsrch, status)) {
4914                 setMatchNotFound(strsrch);
4915                 return FALSE;
4916             }
4917             for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
4918                 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4919                 if (pce == UCOL_PROCESSED_NULLORDER) {
4920                     // at the end of the text
4921                     break;
4922                 }
4923             }
4924             if (U_FAILURE(*status)) {
4925                 setMatchNotFound(strsrch);
4926                 return FALSE;
4927             }
4928             textOffset = ucol_getOffset(strsrch->textIter);
4929         }
4930     } else {
4931         textOffset = ucol_getOffset(strsrch->textIter);
4932     }
4933
4934     int32_t start = -1;
4935     int32_t end = -1;
4936
4937     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4938         strsrch->search->matchedIndex = start;
4939         strsrch->search->matchedLength = end - start;
4940         return TRUE;
4941     } else {
4942         setMatchNotFound(strsrch);
4943         return FALSE;
4944     }
4945 #endif
4946 }
4947
4948 #endif /* #if !UCONFIG_NO_COLLATION */