icuSources/i18n/usearch.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2001-2008 IBM and others. All rights reserved.
   4 **********************************************************************
   5 *   Date        Name        Description
   6 *  07/02/2001   synwee      Creation.
   7 **********************************************************************
   8 */
   9
  10 #include "unicode/utypes.h"
  11
  12 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
  13
  14 #include "unicode/usearch.h"
  15 #include "unicode/ustring.h"
  16 #include "unicode/uchar.h"
  17 #include "unormimp.h"
  18 #include "ucol_imp.h"
  19 #include "usrchimp.h"
  20 #include "cmemory.h"
  21 #include "ucln_in.h"
  22 #include "uassert.h"
  23
  24 U_NAMESPACE_USE
  25
  26 // don't use Boyer-Moore
  27 #define BOYER_MOORE 0
  28
  29 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
  30
  31 // internal definition ---------------------------------------------------
  32
  33 #define LAST_BYTE_MASK_          0xFF
  34 #define SECOND_LAST_BYTE_SHIFT_  8
  35 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
  36
  37 static const uint16_t *FCD_ = NULL;
  38
  39 // internal methods -------------------------------------------------
  40
  41 /**
  42 * Fast collation element iterator setOffset.
  43 * This function does not check for bounds.
  44 * @param coleiter collation element iterator
  45 * @param offset to set
  46 */
  47 static
  48 inline void setColEIterOffset(UCollationElements *elems,
  49                       int32_t             offset)
  50 {
  51     collIterate *ci = &(elems->iteratordata_);
  52     ci->pos         = ci->string + offset;
  53     ci->CEpos       = ci->toReturn = ci->extendCEs ? ci->extendCEs : ci->CEs;
  54     if (ci->flags & UCOL_ITER_INNORMBUF) {
  55         ci->flags = ci->origFlags;
  56     }
  57     ci->fcdPosition = NULL;
  58
  59         ci->offsetReturn = NULL;
  60     ci->offsetStore  = ci->offsetBuffer;
  61         ci->offsetRepeatCount = ci->offsetRepeatValue = 0;
  62 }
  63
  64 /**
  65 * Getting the mask for collation strength
  66 * @param strength collation strength
  67 * @return collation element mask
  68 */
  69 static
  70 inline uint32_t getMask(UCollationStrength strength)
  71 {
  72     switch (strength)
  73     {
  74     case UCOL_PRIMARY:
  75         return UCOL_PRIMARYORDERMASK;
  76     case UCOL_SECONDARY:
  77         return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
  78     default:
  79         return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
  80                UCOL_PRIMARYORDERMASK;
  81     }
  82 }
  83
  84 /**
  85 * This is to squeeze the 21bit ces into a 256 table
  86 * @param ce collation element
  87 * @return collapsed version of the collation element
  88 */
  89 static
  90 inline int hash(uint32_t ce)
  91 {
  92     // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
  93     // well with the new collation where most of the latin 1 characters
  94     // are of the value xx000xxx. their hashes will most of the time be 0
  95     // to be discussed on the hash algo.
  96     return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_;
  97 }
  98
  99 U_CDECL_BEGIN
 100 static UBool U_CALLCONV
 101 usearch_cleanup(void) {
 102     FCD_ = NULL;
 103     return TRUE;
 104 }
 105 U_CDECL_END
 106
 107 /**
 108 * Initializing the fcd tables.
 109 * Internal method, status assumed to be a success.
 110 * @param status output error if any, caller to check status before calling
 111 *               method, status assumed to be success when passed in.
 112 */
 113 static
 114 inline void initializeFCD(UErrorCode *status)
 115 {
 116     if (FCD_ == NULL) {
 117         FCD_ = unorm_getFCDTrie(status);
 118         ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
 119     }
 120 }
 121
 122 /**
 123 * Gets the fcd value for a character at the argument index.
 124 * This method takes into accounts of the supplementary characters.
 125 * @param str UTF16 string where character for fcd retrieval resides
 126 * @param offset position of the character whose fcd is to be retrieved, to be
 127 *               overwritten with the next character position, taking
 128 *               surrogate characters into consideration.
 129 * @param strlength length of the argument string
 130 * @return fcd value
 131 */
 132 static
 133 uint16_t getFCD(const UChar   *str, int32_t *offset,
 134                              int32_t  strlength)
 135 {
 136     int32_t temp = *offset;
 137     uint16_t    result;
 138     UChar       ch   = str[temp];
 139     result = unorm_getFCD16(FCD_, ch);
 140     temp ++;
 141
 142     if (result && temp != strlength && UTF_IS_FIRST_SURROGATE(ch)) {
 143         ch = str[temp];
 144         if (UTF_IS_SECOND_SURROGATE(ch)) {
 145             result = unorm_getFCD16FromSurrogatePair(FCD_, result, ch);
 146             temp ++;
 147         } else {
 148             result = 0;
 149         }
 150     }
 151     *offset = temp;
 152     return result;
 153 }
 154
 155 /**
 156 * Getting the modified collation elements taking into account the collation
 157 * attributes
 158 * @param strsrch string search data
 159 * @param sourcece
 160 * @return the modified collation element
 161 */
 162 static
 163 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
 164 {
 165     // note for tertiary we can't use the collator->tertiaryMask, that
 166     // is a preprocessed mask that takes into account case options. since
 167     // we are only concerned with exact matches, we don't need that.
 168     sourcece &= strsrch->ceMask;
 169
 170     if (strsrch->toShift) {
 171         // alternate handling here, since only the 16 most significant digits
 172         // is only used, we can safely do a compare without masking
 173         // if the ce is a variable, we mask and get only the primary values
 174         // no shifting to quartenary is required since all primary values
 175         // less than variabletop will need to be masked off anyway.
 176         if (strsrch->variableTop > sourcece) {
 177             if (strsrch->strength == UCOL_QUATERNARY) {
 178                 sourcece &= UCOL_PRIMARYORDERMASK;
 179             }
 180             else {
 181                 sourcece = UCOL_IGNORABLE;
 182             }
 183         }
 184     }
 185
 186     return sourcece;
 187 }
 188
 189 /**
 190 * Allocate a memory and returns NULL if it failed.
 191 * Internal method, status assumed to be a success.
 192 * @param size to allocate
 193 * @param status output error if any, caller to check status before calling
 194 *               method, status assumed to be success when passed in.
 195 * @return newly allocated array, NULL otherwise
 196 */
 197 static
 198 inline void * allocateMemory(uint32_t size, UErrorCode *status)
 199 {
 200     uint32_t *result = (uint32_t *)uprv_malloc(size);
 201     if (result == NULL) {
 202         *status = U_MEMORY_ALLOCATION_ERROR;
 203     }
 204     return result;
 205 }
 206
 207 /**
 208 * Adds a uint32_t value to a destination array.
 209 * Creates a new array if we run out of space. The caller will have to
 210 * manually deallocate the newly allocated array.
 211 * Internal method, status assumed to be success, caller has to check status
 212 * before calling this method. destination not to be NULL and has at least
 213 * size destinationlength.
 214 * @param destination target array
 215 * @param offset destination offset to add value
 216 * @param destinationlength target array size, return value for the new size
 217 * @param value to be added
 218 * @param increments incremental size expected
 219 * @param status output error if any, caller to check status before calling
 220 *               method, status assumed to be success when passed in.
 221 * @return new destination array, destination if there was no new allocation
 222 */
 223 static
 224 inline int32_t * addTouint32_tArray(int32_t    *destination,
 225                                     uint32_t    offset,
 226                                     uint32_t   *destinationlength,
 227                                     uint32_t    value,
 228                                     uint32_t    increments,
 229                                     UErrorCode *status)
 230 {
 231     uint32_t newlength = *destinationlength;
 232     if (offset + 1 == newlength) {
 233         newlength += increments;
 234         int32_t *temp = (int32_t *)allocateMemory(
 235                                          sizeof(int32_t) * newlength, status);
 236         if (U_FAILURE(*status)) {
 237             return NULL;
 238         }
 239         uprv_memcpy(temp, destination, sizeof(int32_t) * offset);
 240         *destinationlength = newlength;
 241         destination        = temp;
 242     }
 243     destination[offset] = value;
 244     return destination;
 245 }
 246
 247 /**
 248 * Adds a uint64_t value to a destination array.
 249 * Creates a new array if we run out of space. The caller will have to
 250 * manually deallocate the newly allocated array.
 251 * Internal method, status assumed to be success, caller has to check status
 252 * before calling this method. destination not to be NULL and has at least
 253 * size destinationlength.
 254 * @param destination target array
 255 * @param offset destination offset to add value
 256 * @param destinationlength target array size, return value for the new size
 257 * @param value to be added
 258 * @param increments incremental size expected
 259 * @param status output error if any, caller to check status before calling
 260 *               method, status assumed to be success when passed in.
 261 * @return new destination array, destination if there was no new allocation
 262 */
 263 static
 264 inline int64_t * addTouint64_tArray(int64_t    *destination,
 265                                     uint32_t    offset,
 266                                     uint32_t   *destinationlength,
 267                                     uint64_t    value,
 268                                     uint32_t    increments,
 269                                     UErrorCode *status)
 270 {
 271     uint32_t newlength = *destinationlength;
 272     if (offset + 1 == newlength) {
 273         newlength += increments;
 274         int64_t *temp = (int64_t *)allocateMemory(
 275                                          sizeof(int64_t) * newlength, status);
 276
 277         if (U_FAILURE(*status)) {
 278             return NULL;
 279         }
 280
 281         uprv_memcpy(temp, destination, sizeof(int64_t) * offset);
 282         *destinationlength = newlength;
 283         destination        = temp;
 284     }
 285
 286     destination[offset] = value;
 287
 288     return destination;
 289 }
 290
 291 /**
 292 * Initializing the ce table for a pattern.
 293 * Stores non-ignorable collation keys.
 294 * Table size will be estimated by the size of the pattern text. Table
 295 * expansion will be perform as we go along. Adding 1 to ensure that the table
 296 * size definitely increases.
 297 * Internal method, status assumed to be a success.
 298 * @param strsrch string search data
 299 * @param status output error if any, caller to check status before calling
 300 *               method, status assumed to be success when passed in.
 301 * @return total number of expansions
 302 */
 303 static
 304 inline uint16_t initializePatternCETable(UStringSearch *strsrch,
 305                                          UErrorCode    *status)
 306 {
 307     UPattern *pattern            = &(strsrch->pattern);
 308     uint32_t  cetablesize        = INITIAL_ARRAY_SIZE_;
 309     int32_t  *cetable            = pattern->CEBuffer;
 310     uint32_t  patternlength      = pattern->textLength;
 311     UCollationElements *coleiter = strsrch->utilIter;
 312
 313     if (coleiter == NULL) {
 314         coleiter = ucol_openElements(strsrch->collator, pattern->text,
 315                                      patternlength, status);
 316         // status will be checked in ucol_next(..) later and if it is an
 317         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
 318         // returned.
 319         strsrch->utilIter = coleiter;
 320     }
 321     else {
 322         uprv_init_collIterate(strsrch->collator, pattern->text,
 323                          pattern->textLength,
 324                          &coleiter->iteratordata_);
 325     }
 326
 327     if (pattern->CE != cetable && pattern->CE) {
 328         uprv_free(pattern->CE);
 329     }
 330
 331     uint16_t  offset      = 0;
 332     uint16_t  result      = 0;
 333     int32_t   ce;
 334
 335     while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
 336            U_SUCCESS(*status)) {
 337         uint32_t newce = getCE(strsrch, ce);
 338         if (newce) {
 339             int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
 340                                   newce,
 341                                   patternlength - ucol_getOffset(coleiter) + 1,
 342                                   status);
 343             if (U_FAILURE(*status)) {
 344                 return 0;
 345             }
 346             offset ++;
 347             if (cetable != temp && cetable != pattern->CEBuffer) {
 348                 uprv_free(cetable);
 349             }
 350             cetable = temp;
 351         }
 352         result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
 353     }
 354
 355     cetable[offset]   = 0;
 356     pattern->CE       = cetable;
 357     pattern->CELength = offset;
 358
 359     return result;
 360 }
 361
 362 /**
 363 * Initializing the pce table for a pattern.
 364 * Stores non-ignorable collation keys.
 365 * Table size will be estimated by the size of the pattern text. Table
 366 * expansion will be perform as we go along. Adding 1 to ensure that the table
 367 * size definitely increases.
 368 * Internal method, status assumed to be a success.
 369 * @param strsrch string search data
 370 * @param status output error if any, caller to check status before calling
 371 *               method, status assumed to be success when passed in.
 372 * @return total number of expansions
 373 */
 374 static
 375 inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
 376                                           UErrorCode    *status)
 377 {
 378     UPattern *pattern            = &(strsrch->pattern);
 379     uint32_t  pcetablesize       = INITIAL_ARRAY_SIZE_;
 380     int64_t  *pcetable           = pattern->PCEBuffer;
 381     uint32_t  patternlength      = pattern->textLength;
 382     UCollationElements *coleiter = strsrch->utilIter;
 383
 384     if (coleiter == NULL) {
 385         coleiter = ucol_openElements(strsrch->collator, pattern->text,
 386                                      patternlength, status);
 387         // status will be checked in ucol_next(..) later and if it is an
 388         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
 389         // returned.
 390         strsrch->utilIter = coleiter;
 391     } else {
 392         uprv_init_collIterate(strsrch->collator, pattern->text,
 393                               pattern->textLength,
 394                               &coleiter->iteratordata_);
 395     }
 396
 397     if (pattern->PCE != pcetable && pattern->PCE != NULL) {
 398         uprv_free(pattern->PCE);
 399     }
 400
 401     uint16_t  offset = 0;
 402     uint16_t  result = 0;
 403     int64_t   pce;
 404
 405     uprv_init_pce(coleiter);
 406
 407     // ** Should processed CEs be signed or unsigned?
 408     // ** (the rest of the code in this file seems to play fast-and-loose with
 409     // **  whether a CE is signed or unsigned. For example, look at routine above this one.)
 410     while ((pce = ucol_nextProcessed(coleiter, NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
 411            U_SUCCESS(*status)) {
 412         int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
 413                               pce,
 414                               patternlength - ucol_getOffset(coleiter) + 1,
 415                               status);
 416
 417         if (U_FAILURE(*status)) {
 418             return 0;
 419         }
 420
 421         offset += 1;
 422
 423         if (pcetable != temp && pcetable != pattern->PCEBuffer) {
 424             uprv_free(pcetable);
 425         }
 426
 427         pcetable = temp;
 428         //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
 429     }
 430
 431     pcetable[offset]   = 0;
 432     pattern->PCE       = pcetable;
 433     pattern->PCELength = offset;
 434
 435     return result;
 436 }
 437
 438 /**
 439 * Initializes the pattern struct.
 440 * Internal method, status assumed to be success.
 441 * @param strsrch UStringSearch data storage
 442 * @param status output error if any, caller to check status before calling
 443 *               method, status assumed to be success when passed in.
 444 * @return expansionsize the total expansion size of the pattern
 445 */
 446 static
 447 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
 448 {
 449           UPattern   *pattern     = &(strsrch->pattern);
 450     const UChar      *patterntext = pattern->text;
 451           int32_t     length      = pattern->textLength;
 452           int32_t index       = 0;
 453
 454     // Since the strength is primary, accents are ignored in the pattern.
 455     if (strsrch->strength == UCOL_PRIMARY) {
 456         pattern->hasPrefixAccents = 0;
 457         pattern->hasSuffixAccents = 0;
 458     } else {
 459             pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
 460                                                              SECOND_LAST_BYTE_SHIFT_;
 461             index = length;
 462             UTF_BACK_1(patterntext, 0, index);
 463             pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
 464                                                                      LAST_BYTE_MASK_;
 465     }
 466
 467     // ** HACK **
 468     if (strsrch->pattern.PCE != NULL) {
 469         if (strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) {
 470             uprv_free(strsrch->pattern.PCE);
 471         }
 472
 473         strsrch->pattern.PCE = NULL;
 474     }
 475
 476     // since intializePattern is an internal method status is a success.
 477     return initializePatternCETable(strsrch, status);
 478 }
 479
 480 /**
 481 * Initializing shift tables, with the default values.
 482 * If a corresponding default value is 0, the shift table is not set.
 483 * @param shift table for forwards shift
 484 * @param backshift table for backwards shift
 485 * @param cetable table containing pattern ce
 486 * @param cesize size of the pattern ces
 487 * @param expansionsize total size of the expansions
 488 * @param defaultforward the default forward value
 489 * @param defaultbackward the default backward value
 490 */
 491 static
 492 inline void setShiftTable(int16_t   shift[], int16_t backshift[],
 493                           int32_t  *cetable, int32_t cesize,
 494                           int16_t   expansionsize,
 495                           int16_t   defaultforward,
 496                           int16_t   defaultbackward)
 497 {
 498     // estimate the value to shift. to do that we estimate the smallest
 499     // number of characters to give the relevant ces, ie approximately
 500     // the number of ces minus their expansion, since expansions can come
 501     // from a character.
 502     int32_t count;
 503     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
 504         shift[count] = defaultforward;
 505     }
 506     cesize --; // down to the last index
 507     for (count = 0; count < cesize; count ++) {
 508         // number of ces from right of array to the count
 509         int temp = defaultforward - count - 1;
 510         shift[hash(cetable[count])] = temp > 1 ? temp : 1;
 511     }
 512     shift[hash(cetable[cesize])] = 1;
 513     // for ignorables we just shift by one. see test examples.
 514     shift[hash(0)] = 1;
 515
 516     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
 517         backshift[count] = defaultbackward;
 518     }
 519     for (count = cesize; count > 0; count --) {
 520         // the original value count does not seem to work
 521         backshift[hash(cetable[count])] = count > expansionsize ?
 522                                           (int16_t)(count - expansionsize) : 1;
 523     }
 524     backshift[hash(cetable[0])] = 1;
 525     backshift[hash(0)] = 1;
 526 }
 527
 528 /**
 529 * Building of the pattern collation element list and the boyer moore strsrch
 530 * table.
 531 * The canonical match will only be performed after the default match fails.
 532 * For both cases we need to remember the size of the composed and decomposed
 533 * versions of the string. Since the Boyer-Moore shift calculations shifts by
 534 * a number of characters in the text and tries to match the pattern from that
 535 * offset, the shift value can not be too large in case we miss some
 536 * characters. To choose a right shift size, we estimate the NFC form of the
 537 * and use its size as a shift guide. The NFC form should be the small
 538 * possible representation of the pattern. Anyways, we'll err on the smaller
 539 * shift size. Hence the calculation for minlength.
 540 * Canonical match will be performed slightly differently. We'll split the
 541 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
 542 * the first and last base character (MS), the ending accents (EA). Matches
 543 * will be done on MS first, and only when we match MS then some processing
 544 * will be required for the prefix and end accents in order to determine if
 545 * they match PA and EA. Hence the default shift values
 546 * for the canonical match will take the size of either end's accent into
 547 * consideration. Forwards search will take the end accents into consideration
 548 * for the default shift values and the backwards search will take the prefix
 549 * accents into consideration.
 550 * If pattern has no non-ignorable ce, we return a illegal argument error.
 551 * Internal method, status assumed to be success.
 552 * @param strsrch UStringSearch data storage
 553 * @param status  for output errors if it occurs, status is assumed to be a
 554 *                success when it is passed in.
 555 */
 556 static
 557 inline void initialize(UStringSearch *strsrch, UErrorCode *status)
 558 {
 559     int16_t expandlength  = initializePattern(strsrch, status);
 560     if (U_SUCCESS(*status) && strsrch->pattern.CELength > 0) {
 561         UPattern *pattern = &strsrch->pattern;
 562         int32_t   cesize  = pattern->CELength;
 563
 564         int16_t minlength = cesize > expandlength
 565                             ? (int16_t)cesize - expandlength : 1;
 566         pattern->defaultShiftSize    = minlength;
 567         setShiftTable(pattern->shift, pattern->backShift, pattern->CE,
 568                       cesize, expandlength, minlength, minlength);
 569         return;
 570     }
 571     strsrch->pattern.defaultShiftSize = 0;
 572 }
 573
 574 #if BOYER_MOORE
 575 /**
 576 * Check to make sure that the match length is at the end of the character by
 577 * using the breakiterator.
 578 * @param strsrch string search data
 579 * @param start target text start offset
 580 * @param end target text end offset
 581 */
 582 static
 583 void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
 584                                int32_t *end)
 585 {
 586 #if !UCONFIG_NO_BREAK_ITERATION
 587     UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
 588     if (breakiterator) {
 589             int32_t matchend = *end;
 590             //int32_t matchstart = *start;
 591
 592             if (!ubrk_isBoundary(breakiterator, matchend)) {
 593                 *end = ubrk_following(breakiterator, matchend);
 594         }
 595
 596             /* Check the start of the matched text to make sure it doesn't have any accents
 597              * before it.  This code may not be necessary and so it is commented out */
 598             /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
 599                 *start = ubrk_preceding(breakiterator, matchstart);
 600             }*/
 601     }
 602 #endif
 603 }
 604
 605 /**
 606 * Determine whether the target text in UStringSearch bounded by the offset
 607 * start and end is one or more whole units of text as
 608 * determined by the breakiterator in UStringSearch.
 609 * @param strsrch string search data
 610 * @param start target text start offset
 611 * @param end target text end offset
 612 */
 613 static
 614 UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
 615                                int32_t    end)
 616 {
 617 #if !UCONFIG_NO_BREAK_ITERATION
 618     UBreakIterator *breakiterator = strsrch->search->breakIter;
 619     //TODO: Add here.
 620     if (breakiterator) {
 621         int32_t startindex = ubrk_first(breakiterator);
 622         int32_t endindex   = ubrk_last(breakiterator);
 623
 624         // out-of-range indexes are never boundary positions
 625         if (start < startindex || start > endindex ||
 626             end < startindex || end > endindex) {
 627             return FALSE;
 628         }
 629         // otherwise, we can use following() on the position before the
 630         // specified one and return true of the position we get back is the
 631         // one the user specified
 632         UBool result = (start == startindex ||
 633                 ubrk_following(breakiterator, start - 1) == start) &&
 634                (end == endindex ||
 635                 ubrk_following(breakiterator, end - 1) == end);
 636         if (result) {
 637             // iterates the individual ces
 638                   UCollationElements *coleiter  = strsrch->utilIter;
 639             const UChar              *text      = strsrch->search->text +
 640                                                                       start;
 641                   UErrorCode          status    = U_ZERO_ERROR;
 642             ucol_setText(coleiter, text, end - start, &status);
 643             for (int32_t count = 0; count < strsrch->pattern.CELength;
 644                  count ++) {
 645                 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
 646                 if (ce == UCOL_IGNORABLE) {
 647                     count --;
 648                     continue;
 649                 }
 650                 if (U_FAILURE(status) || ce != strsrch->pattern.CE[count]) {
 651                     return FALSE;
 652                 }
 653             }
 654             int32_t nextce = ucol_next(coleiter, &status);
 655             while (ucol_getOffset(coleiter) == (end - start)
 656                    && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
 657                 nextce = ucol_next(coleiter, &status);
 658             }
 659             if (ucol_getOffset(coleiter) == (end - start)
 660                 && nextce != UCOL_NULLORDER) {
 661                 // extra collation elements at the end of the match
 662                 return FALSE;
 663             }
 664         }
 665         return result;
 666     }
 667 #endif
 668     return TRUE;
 669 }
 670
 671 /**
 672 * Getting the next base character offset if current offset is an accent,
 673 * or the current offset if the current character contains a base character.
 674 * accents the following base character will be returned
 675 * @param text string
 676 * @param textoffset current offset
 677 * @param textlength length of text string
 678 * @return the next base character or the current offset
 679 *         if the current character is contains a base character.
 680 */
 681 static
 682 inline int32_t getNextBaseOffset(const UChar       *text,
 683                                            int32_t  textoffset,
 684                                            int32_t      textlength)
 685 {
 686     if (textoffset < textlength) {
 687         int32_t temp = textoffset;
 688         if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
 689             while (temp < textlength) {
 690                 int32_t result = temp;
 691                 if ((getFCD(text, &temp, textlength) >>
 692                      SECOND_LAST_BYTE_SHIFT_) == 0) {
 693                     return result;
 694                 }
 695             }
 696             return textlength;
 697         }
 698     }
 699     return textoffset;
 700 }
 701
 702 /**
 703 * Gets the next base character offset depending on the string search pattern
 704 * data
 705 * @param strsrch string search data
 706 * @param textoffset current offset, one offset away from the last character
 707 *                   to search for.
 708 * @return start index of the next base character or the current offset
 709 *         if the current character is contains a base character.
 710 */
 711 static
 712 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
 713                                                   int32_t    textoffset)
 714 {
 715     int32_t textlength = strsrch->search->textLength;
 716     if (strsrch->pattern.hasSuffixAccents &&
 717         textoffset < textlength) {
 718               int32_t  temp       = textoffset;
 719         const UChar       *text       = strsrch->search->text;
 720         UTF_BACK_1(text, 0, temp);
 721         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
 722             return getNextBaseOffset(text, textoffset, textlength);
 723         }
 724     }
 725     return textoffset;
 726 }
 727
 728 /**
 729 * Shifting the collation element iterator position forward to prepare for
 730 * a following match. If the last character is a unsafe character, we'll only
 731 * shift by 1 to capture contractions, normalization etc.
 732 * Internal method, status assumed to be success.
 733 * @param text strsrch string search data
 734 * @param textoffset start text position to do search
 735 * @param ce the text ce which failed the match.
 736 * @param patternceindex index of the ce within the pattern ce buffer which
 737 *        failed the match
 738 * @return final offset
 739 */
 740 static
 741 inline int32_t shiftForward(UStringSearch *strsrch,
 742                                 int32_t    textoffset,
 743                                 int32_t       ce,
 744                                 int32_t        patternceindex)
 745 {
 746     UPattern *pattern = &(strsrch->pattern);
 747     if (ce != UCOL_NULLORDER) {
 748         int32_t shift = pattern->shift[hash(ce)];
 749         // this is to adjust for characters in the middle of the
 750         // substring for matching that failed.
 751         int32_t adjust = pattern->CELength - patternceindex;
 752         if (adjust > 1 && shift >= adjust) {
 753             shift -= adjust - 1;
 754         }
 755         textoffset += shift;
 756     }
 757     else {
 758         textoffset += pattern->defaultShiftSize;
 759     }
 760
 761     textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
 762     // check for unsafe characters
 763     // * if it is the start or middle of a contraction: to be done after
 764     //   a initial match is found
 765     // * thai or lao base consonant character: similar to contraction
 766     // * high surrogate character: similar to contraction
 767     // * next character is a accent: shift to the next base character
 768     return textoffset;
 769 }
 770 #endif // #if BOYER_MOORE
 771
 772 /**
 773 * sets match not found
 774 * @param strsrch string search data
 775 */
 776 static
 777 inline void setMatchNotFound(UStringSearch *strsrch)
 778 {
 779     // this method resets the match result regardless of the error status.
 780     strsrch->search->matchedIndex = USEARCH_DONE;
 781     strsrch->search->matchedLength = 0;
 782     if (strsrch->search->isForwardSearching) {
 783         setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
 784     }
 785     else {
 786         setColEIterOffset(strsrch->textIter, 0);
 787     }
 788 }
 789
 790 #if BOYER_MOORE
 791 /**
 792 * Gets the offset to the next safe point in text.
 793 * ie. not the middle of a contraction, swappable characters or supplementary
 794 * characters.
 795 * @param collator collation sata
 796 * @param text string to work with
 797 * @param textoffset offset in string
 798 * @param textlength length of text string
 799 * @return offset to the next safe character
 800 */
 801 static
 802 inline int32_t getNextSafeOffset(const UCollator   *collator,
 803                                      const UChar       *text,
 804                                            int32_t  textoffset,
 805                                            int32_t      textlength)
 806 {
 807     int32_t result = textoffset; // first contraction character
 808     while (result != textlength && ucol_unsafeCP(text[result], collator)) {
 809         result ++;
 810     }
 811     return result;
 812 }
 813
 814 /**
 815 * This checks for accents in the potential match started with a .
 816 * composite character.
 817 * This is really painful... we have to check that composite character do not
 818 * have any extra accents. We have to normalize the potential match and find
 819 * the immediate decomposed character before the match.
 820 * The first composite character would have been taken care of by the fcd
 821 * checks in checkForwardExactMatch.
 822 * This is the slow path after the fcd of the first character and
 823 * the last character has been checked by checkForwardExactMatch and we
 824 * determine that the potential match has extra non-ignorable preceding
 825 * ces.
 826 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
 827 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
 828 * Note here that accents checking are slow and cautioned in the API docs.
 829 * Internal method, status assumed to be a success, caller should check status
 830 * before calling this method
 831 * @param strsrch string search data
 832 * @param start index of the potential unfriendly composite character
 833 * @param end index of the potential unfriendly composite character
 834 * @param status output error status if any.
 835 * @return TRUE if there is non-ignorable accents before at the beginning
 836 *              of the match, FALSE otherwise.
 837 */
 838
 839 static
 840 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
 841                                    int32_t    end,
 842                                    UErrorCode    *status)
 843 {
 844     UBool result = FALSE;
 845     if (strsrch->pattern.hasPrefixAccents) {
 846               int32_t  length = end - start;
 847               int32_t  offset = 0;
 848         const UChar       *text   = strsrch->search->text + start;
 849
 850         UTF_FWD_1(text, offset, length);
 851         // we are only concerned with the first composite character
 852         if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
 853             int32_t safeoffset = getNextSafeOffset(strsrch->collator,
 854                                                        text, 0, length);
 855             if (safeoffset != length) {
 856                 safeoffset ++;
 857             }
 858             UChar   *norm = NULL;
 859             UChar    buffer[INITIAL_ARRAY_SIZE_];
 860             int32_t  size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
 861                                             buffer, INITIAL_ARRAY_SIZE_,
 862                                             status);
 863             if (U_FAILURE(*status)) {
 864                 return FALSE;
 865             }
 866             if (size >= INITIAL_ARRAY_SIZE_) {
 867                 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
 868                                                status);
 869                 // if allocation failed, status will be set to
 870                 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
 871                 // checks for it.
 872                 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
 873                                        size, status);
 874                 if (U_FAILURE(*status) && norm != NULL) {
 875                     uprv_free(norm);
 876                     return FALSE;
 877                 }
 878             }
 879             else {
 880                 norm = buffer;
 881             }
 882
 883             UCollationElements *coleiter  = strsrch->utilIter;
 884             ucol_setText(coleiter, norm, size, status);
 885             uint32_t            firstce   = strsrch->pattern.CE[0];
 886             UBool               ignorable = TRUE;
 887             uint32_t            ce        = UCOL_IGNORABLE;
 888             while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
 889                 offset = ucol_getOffset(coleiter);
 890                 if (ce != firstce && ce != UCOL_IGNORABLE) {
 891                     ignorable = FALSE;
 892                 }
 893                 ce = ucol_next(coleiter, status);
 894             }
 895             UChar32 codepoint;
 896             UTF_PREV_CHAR(norm, 0, offset, codepoint);
 897             result = !ignorable && (u_getCombiningClass(codepoint) != 0);
 898
 899             if (norm != buffer) {
 900                 uprv_free(norm);
 901             }
 902         }
 903     }
 904
 905     return result;
 906 }
 907
/**
* Used by exact matches, checks if there are accents before the match.
* This is really painful... we have to check that composite characters at
* the start of the matches have to not have any extra accents.
* We check the FCD of the character first, if it starts with an accent and
* the first pattern ce does not match the first ce of the character, we bail.
* Otherwise we try normalizing the first composite
* character and find the immediate decomposed character before the match to
* see if it is an non-ignorable accent.
* Now normalizing the first composite character is enough because we ensure
* that when the match is passed in here with extra beginning ces, the
* first or last ce that match has to occur within the first character.
* E.g. looking for \u0301 acute in \u01FA A ring above and acute,
* checkExtraMatchAccent should fail since there is a middle ring in \u01FA
* Note here that accents checking are slow and cautioned in the API docs.
* Nothing is checked unless the pattern has prefix accents. The text
* collation element iterator is repositioned by this function.
* @param strsrch string search data
* @param start offset
* @param end offset
* @return TRUE if there are accents before the match (or if iterating the
*         collation elements failed), FALSE otherwise
*/
static
UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
                                  int32_t    end)
{
    if (strsrch->pattern.hasPrefixAccents) {
        UCollationElements *coleiter  = strsrch->textIter;
        UErrorCode          status    = U_ZERO_ERROR;
        // we have been iterating forwards previously
        // ignorable stays TRUE while every ce seen before the first pattern
        // ce is UCOL_IGNORABLE
        uint32_t            ignorable = TRUE;
        int32_t             firstce   = strsrch->pattern.CE[0];

        setColEIterOffset(coleiter, start);
        int32_t ce  = getCE(strsrch, ucol_next(coleiter, &status));
        if (U_FAILURE(status)) {
            return TRUE;
        }
        // advance until the first pattern ce shows up; running out of ces
        // or hitting an error before that is reported as TRUE.
        while (ce != firstce) {
            if (ce != UCOL_IGNORABLE) {
                ignorable = FALSE;
            }
            ce = getCE(strsrch, ucol_next(coleiter, &status));
            if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
                return TRUE;
            }
        }
        if (!ignorable && inNormBuf(coleiter)) {
            // within normalization buffer, discontiguous handled here
            return TRUE;
        }

        // within text
        int32_t temp = start;
        // original code
        // accent = (getFCD(strsrch->search->text, &temp,
        //                  strsrch->search->textLength)
        //            >> SECOND_LAST_BYTE_SHIFT_);
        // however this code does not work well with VC7 .net in release mode.
        // maybe the inlines for getFCD combined with shifting has bugs in
        // VC7. anyways this is a work around.
        // accent is TRUE when the fcd value of the character at start has
        // any bit set above its low byte.
        UBool accent = getFCD(strsrch->search->text, &temp,
                              strsrch->search->textLength) > 0xFF;
        if (!accent) {
            // slow path: normalize the first character and inspect it
            return checkExtraMatchAccents(strsrch, start, end, &status);
        }
        if (!ignorable) {
            return TRUE;
        }
        if (start > 0) {
            // look at the character in front of the match: if the low byte
            // of its fcd value is set and the ce in front of start is
            // neither UCOL_NULLORDER nor ignorable, report an accent.
            temp = start;
            UTF_BACK_1(strsrch->search->text, 0, temp);
            if (getFCD(strsrch->search->text, &temp,
                       strsrch->search->textLength) & LAST_BYTE_MASK_) {
                setColEIterOffset(coleiter, start);
                ce = ucol_previous(coleiter, &status);
                if (U_FAILURE(status) ||
                    (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
                    return TRUE;
                }
            }
        }
    }

    return FALSE;
}
 993
/**
* Used by exact matches, checks if there are accents following the match.
* Note this is the initial boundary check. If the potential match
* starts or ends with composite characters, the accents in those
* characters will be determined later.
* Not doing backwards iteration here, since discontiguous contraction for
* backwards collation element iterator, use up too many characters.
* E.g. looking for \u030A ring in \u01FA A ring above and acute,
* should fail since there is a acute at the end of \u01FA
* Note here that accents checking are slow and cautioned in the API docs.
* Nothing is checked unless the pattern has suffix accents. The text
* collation element iterator is repositioned by this function.
* @param strsrch string search data
* @param start offset of match
* @param end end offset of the match
* @return TRUE if there are accents after the match (or if iterating the
*         collation elements failed), FALSE otherwise
*/
static
UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
                                 int32_t    end)
{
    if (strsrch->pattern.hasSuffixAccents) {
        const UChar       *text       = strsrch->search->text;
              int32_t  temp       = end;
              int32_t      textlength = strsrch->search->textLength;
        // step back to the last character of the match and only continue
        // when the low byte of its fcd value is set
        UTF_BACK_1(text, 0, temp);
        if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
            int32_t             firstce  = strsrch->pattern.CE[0];
            UCollationElements *coleiter = strsrch->textIter;
            UErrorCode          status   = U_ZERO_ERROR;
                        int32_t ce;
            setColEIterOffset(coleiter, start);
            // skip forward to the first pattern ce; failing to find it is
            // reported as TRUE.
            while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
                if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
                    return TRUE;
                }
            }
            // consume the remaining pattern ces; ignorable ces returned by
            // the iterator are not counted.
            int32_t count = 1;
            while (count < strsrch->pattern.CELength) {
                if (getCE(strsrch, ucol_next(coleiter, &status))
                    == UCOL_IGNORABLE) {
                    // Thai can give an ignorable here.
                    count --;
                }
                if (U_FAILURE(status)) {
                    return TRUE;
                }
                count ++;
            }

            // the ce that follows the matched ces decides the outcome
                        ce = ucol_next(coleiter, &status);
            if (U_FAILURE(status)) {
                return TRUE;
            }
            if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
                ce = getCE(strsrch, ce);
            }
            if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
                // a non-ignorable ce that came from inside the match ...
                if (ucol_getOffset(coleiter) <= end) {
                    return TRUE;
                }
                // ... or from a character at end whose fcd value has bits
                // set above the low byte
                if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
                    return TRUE;
                }
            }
        }
    }
    return FALSE;
}
1062 #endif // #if BOYER_MOORE
1063
1064 /**
1065 * Checks if the offset runs out of the text string
1066 * @param offset
1067 * @param textlength of the text string
1068 * @return TRUE if offset is out of bounds, FALSE otherwise
1069 */
1070 static
1071 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
1072 {
1073     return offset < 0 || offset > textlength;
1074 }
1075
1076 #if BOYER_MOORE
/**
* Checks for identical match.
* When the search strength is not UCOL_IDENTICAL the check passes
* immediately. Otherwise the candidate text [start, end) and the pattern
* text are both decomposed with unorm_decompose and compared code unit by
* code unit. Small stack buffers are tried first; if the text decomposition
* overflows them, a single heap buffer holding both decompositions is
* allocated and freed again before returning.
* @param strsrch string search data
* @param start offset of possible match
* @param end offset of possible match
* @return TRUE if identical match is found, FALSE if the decompositions
*         differ or if decomposing or allocating failed
*/
static
inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
                                  int32_t    end)
{
    // stack buffers for the decomposed text (t2) and pattern (p2)
    UChar t2[32], p2[32];
    int32_t length = end - start;
    if (strsrch->strength != UCOL_IDENTICAL) {
        return TRUE;
    }

    UErrorCode status = U_ZERO_ERROR, status2 = U_ZERO_ERROR;
    int32_t decomplength = unorm_decompose(t2, LENGTHOF(t2),
                                       strsrch->search->text + start, length,
                                       FALSE, 0, &status);
    // use separate status2 in case of buffer overflow
    if (decomplength != unorm_decompose(p2, LENGTHOF(p2),
                                        strsrch->pattern.text,
                                        strsrch->pattern.textLength,
                                        FALSE, 0, &status2)) {
        return FALSE; // lengths are different
    }

    // compare contents
    // from here on both decompositions have the same length, decomplength
    UChar *text, *pattern;
    if(U_SUCCESS(status)) {
        text = t2;
        pattern = p2;
    } else if(status==U_BUFFER_OVERFLOW_ERROR) {
        status = U_ZERO_ERROR;
        // allocate one buffer for both decompositions
        text = (UChar *)uprv_malloc(decomplength * 2 * U_SIZEOF_UCHAR);
        // Check for allocation failure.
        if (text == NULL) {
                return FALSE;
        }
        // the pattern decomposition lives in the second half of the buffer
        pattern = text + decomplength;
        unorm_decompose(text, decomplength, strsrch->search->text + start,
                        length, FALSE, 0, &status);
        unorm_decompose(pattern, decomplength, strsrch->pattern.text,
                        strsrch->pattern.textLength, FALSE, 0, &status);
    } else {
        // NFD failed, make sure that u_memcmp() does not overrun t2 & p2
        // and that we don't uprv_free() an undefined text pointer
        text = pattern = t2;
        decomplength = 0;
    }
    UBool result = (UBool)(u_memcmp(pattern, text, decomplength) == 0);
    // text only differs from t2 when the heap buffer was allocated above
    if(text != t2) {
        uprv_free(text);
    }
    // return FALSE if NFD failed
    return U_SUCCESS(status) && result;
}
1137
1138 /**
1139 * Checks to see if the match is repeated
1140 * @param strsrch string search data
1141 * @param start new match start index
1142 * @param end new match end index
1143 * @return TRUE if the the match is repeated, FALSE otherwise
1144 */
1145 static
1146 inline UBool checkRepeatedMatch(UStringSearch *strsrch,
1147                                 int32_t    start,
1148                                 int32_t    end)
1149 {
1150     int32_t lastmatchindex = strsrch->search->matchedIndex;
1151     UBool       result;
1152     if (lastmatchindex == USEARCH_DONE) {
1153         return FALSE;
1154     }
1155     if (strsrch->search->isForwardSearching) {
1156         result = start <= lastmatchindex;
1157     }
1158     else {
1159         result = start >= lastmatchindex;
1160     }
1161     if (!result && !strsrch->search->isOverlap) {
1162         if (strsrch->search->isForwardSearching) {
1163             result = start < lastmatchindex + strsrch->search->matchedLength;
1164         }
1165         else {
1166             result = end > lastmatchindex;
1167         }
1168     }
1169     return result;
1170 }
1171
1172 /**
1173 * Gets the collation element iterator's current offset.
1174 * @param coleiter collation element iterator
1175 * @param forwards flag TRUE if we are moving in th forwards direction
1176 * @return current offset
1177 */
1178 static
1179 inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
1180                                               UBool               forwards)
1181 {
1182     int32_t result = ucol_getOffset(coleiter);
1183     // intricacies of the the backwards collation element iterator
1184     if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
1185         result ++;
1186     }
1187     return result;
1188 }
1189
/**
* Checks match for contraction.
* If the match ends with a partial contraction we fail.
* If the match starts too far off (because of backwards iteration) we try to
* chip off the extra characters depending on whether a breakiterator has
* been used.
* The check is only performed when the code unit at *end, or the one at
* *start + 1, is unsafe according to ucol_unsafeCP; otherwise the match is
* accepted as is. When performed, the text is re-iterated forwards from
* *start and compared against the pattern ces.
* Internal method, error assumed to be success, caller has to check status
* before calling this method.
* @param strsrch string search data
* @param start offset of potential match, to be modified if necessary
* @param end offset of potential match, to be modified if necessary. On a
*        failed comparison it is advanced by one and then adjusted by
*        getNextUStringSearchBaseOffset.
* @param status output error status if any
* @return TRUE if match passes the contraction test, FALSE otherwise
*/

static
UBool checkNextExactContractionMatch(UStringSearch *strsrch,
                                     int32_t   *start,
                                     int32_t   *end, UErrorCode  *status)
{
          UCollationElements *coleiter   = strsrch->textIter;
          int32_t             textlength = strsrch->search->textLength;
          // temp tracks the iterator offset last seen while skipping ces
          int32_t             temp       = *start;
    const UCollator          *collator   = strsrch->collator;
    const UChar              *text       = strsrch->search->text;
    // This part checks if either ends of the match contains potential
    // contraction. If so we'll have to iterate through them
    // The start contraction needs to be checked since ucol_previous dumps
    // all characters till the first safe character into the buffer.
    // *start + 1 is used to test for the unsafe characters instead of *start
    // because ucol_prev takes all unsafe characters till the first safe
    // character ie *start. so by testing *start + 1, we can estimate if
    // excess prefix characters has been included in the potential search
    // results.
    if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
        (*start + 1 < textlength
         && ucol_unsafeCP(text[*start + 1], collator))) {
        // read the expansion prefix before the iterator is repositioned
        int32_t expansion  = getExpansionPrefix(coleiter);
        UBool   expandflag = expansion > 0;
        setColEIterOffset(coleiter, *start);
        while (expansion > 0) {
            // getting rid of the redundant ce, caused by setOffset.
            // since backward contraction/expansion may have extra ces if we
            // are in the normalization buffer, hasAccentsBeforeMatch would
            // have taken care of it.
            // E.g. the character \u01FA will have an expansion of 3, but if
            // we are only looking for acute and ring \u030A and \u0301, we'll
            // have to skip the first ce in the expansion buffer.
            ucol_next(coleiter, status);
            if (U_FAILURE(*status)) {
                return FALSE;
            }
            // whenever the iterator offset moves, the previously seen
            // offset becomes the new match start
            if (ucol_getOffset(coleiter) != temp) {
                *start = temp;
                temp  = ucol_getOffset(coleiter);
            }
            expansion --;
        }

        int32_t  *patternce       = strsrch->pattern.CE;
        int32_t   patterncelength = strsrch->pattern.CELength;
        int32_t   count           = 0;
        // compare the text ces against the pattern ces, skipping ignorables
        while (count < patterncelength) {
            int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
            if (ce == UCOL_IGNORABLE) {
                continue;
            }
            // same start adjustment as above, applied once for the first
            // non-ignorable ce when an expansion prefix was present
            if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
                *start = temp;
                temp   = ucol_getOffset(coleiter);
            }
            if (U_FAILURE(*status) || ce != patternce[count]) {
                // mismatch: hand back the next offset to resume from
                (*end) ++;
                *end = getNextUStringSearchBaseOffset(strsrch, *end);
                return FALSE;
            }
            count ++;
        }
    }
    return TRUE;
}
1271
1272 /**
1273 * Checks and sets the match information if found.
1274 * Checks
1275 * <ul>
1276 * <li> the potential match does not repeat the previous match
1277 * <li> boundaries are correct
1278 * <li> exact matches has no extra accents
1279 * <li> identical matchesb
1280 * <li> potential match does not end in the middle of a contraction
1281 * <\ul>
1282 * Otherwise the offset will be shifted to the next character.
1283 * Internal method, status assumed to be success, caller has to check status
1284 * before calling this method.
1285 * @param strsrch string search data
1286 * @param textoffset offset in the collation element text. the returned value
1287 *        will be the truncated end offset of the match or the new start
1288 *        search offset.
1289 * @param status output error status if any
1290 * @return TRUE if the match is valid, FALSE otherwise
1291 */
1292 static
1293 inline UBool checkNextExactMatch(UStringSearch *strsrch,
1294                                  int32_t   *textoffset, UErrorCode *status)
1295 {
1296     UCollationElements *coleiter = strsrch->textIter;
1297     int32_t         start    = getColElemIterOffset(coleiter, FALSE);
1298
1299     if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
1300         return FALSE;
1301     }
1302
1303     // this totally matches, however we need to check if it is repeating
1304     if (!isBreakUnit(strsrch, start, *textoffset) ||
1305         checkRepeatedMatch(strsrch, start, *textoffset) ||
1306         hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
1307         !checkIdentical(strsrch, start, *textoffset) ||
1308         hasAccentsAfterMatch(strsrch, start, *textoffset)) {
1309
1310         (*textoffset) ++;
1311         *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
1312         return FALSE;
1313     }
1314
1315     //Add breakiterator boundary check for primary strength search.
1316     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
1317         checkBreakBoundary(strsrch, &start, textoffset);
1318     }
1319
1320     // totally match, we will get rid of the ending ignorables.
1321     strsrch->search->matchedIndex  = start;
1322     strsrch->search->matchedLength = *textoffset - start;
1323     return TRUE;
1324 }
1325
1326 /**
1327 * Getting the previous base character offset, or the current offset if the
1328 * current character is a base character
1329 * @param text string
1330 * @param textoffset one offset after the current character
1331 * @return the offset of the next character after the base character or the first
1332 *         composed character with accents
1333 */
1334 static
1335 inline int32_t getPreviousBaseOffset(const UChar       *text,
1336                                                int32_t  textoffset)
1337 {
1338     if (textoffset > 0) {
1339         for (;;) {
1340             int32_t result = textoffset;
1341             UTF_BACK_1(text, 0, textoffset);
1342             int32_t temp = textoffset;
1343             uint16_t fcd = getFCD(text, &temp, result);
1344             if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1345                 if (fcd & LAST_BYTE_MASK_) {
1346                     return textoffset;
1347                 }
1348                 return result;
1349             }
1350             if (textoffset == 0) {
1351                 return 0;
1352             }
1353         }
1354     }
1355     return textoffset;
1356 }
1357
1358 /**
1359 * Getting the indexes of the accents that are not blocked in the argument
1360 * accent array
1361 * @param accents array of accents in nfd terminated by a 0.
1362 * @param accentsindex array of indexes of the accents that are not blocked
1363 */
1364 static
1365 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
1366 {
1367     int32_t index     = 0;
1368     int32_t     length    = u_strlen(accents);
1369     UChar32     codepoint = 0;
1370     int         cclass    = 0;
1371     int         result    = 0;
1372     int32_t temp;
1373     while (index < length) {
1374         temp = index;
1375         UTF_NEXT_CHAR(accents, index, length, codepoint);
1376         if (u_getCombiningClass(codepoint) != cclass) {
1377             cclass        = u_getCombiningClass(codepoint);
1378             accentsindex[result] = temp;
1379             result ++;
1380         }
1381     }
1382     accentsindex[result] = length;
1383     return result;
1384 }
1385
1386 /**
1387 * Appends 3 UChar arrays to a destination array.
1388 * Creates a new array if we run out of space. The caller will have to
1389 * manually deallocate the newly allocated array.
1390 * Internal method, status assumed to be success, caller has to check status
1391 * before calling this method. destination not to be NULL and has at least
1392 * size destinationlength.
1393 * @param destination target array
1394 * @param destinationlength target array size, returning the appended length
1395 * @param source1 null-terminated first array
1396 * @param source2 second array
1397 * @param source2length length of seond array
1398 * @param source3 null-terminated third array
1399 * @param status error status if any
1400 * @return new destination array, destination if there was no new allocation
1401 */
1402 static
1403 inline UChar * addToUCharArray(      UChar      *destination,
1404                                      int32_t    *destinationlength,
1405                                const UChar      *source1,
1406                                const UChar      *source2,
1407                                      int32_t     source2length,
1408                                const UChar      *source3,
1409                                      UErrorCode *status)
1410 {
1411     int32_t source1length = source1 ? u_strlen(source1) : 0;
1412     int32_t source3length = source3 ? u_strlen(source3) : 0;
1413     if (*destinationlength < source1length + source2length + source3length +
1414                                                                            1)
1415     {
1416         destination = (UChar *)allocateMemory(
1417           (source1length + source2length + source3length + 1) * sizeof(UChar),
1418           status);
1419         // if error allocating memory, status will be
1420         // U_MEMORY_ALLOCATION_ERROR
1421         if (U_FAILURE(*status)) {
1422             *destinationlength = 0;
1423             return NULL;
1424         }
1425     }
1426     if (source1length != 0) {
1427         uprv_memcpy(destination, source1, sizeof(UChar) * source1length);
1428     }
1429     if (source2length != 0) {
1430         uprv_memcpy(destination + source1length, source2,
1431                     sizeof(UChar) * source2length);
1432     }
1433     if (source3length != 0) {
1434         uprv_memcpy(destination + source1length + source2length, source3,
1435                     sizeof(UChar) * source3length);
1436     }
1437     *destinationlength = source1length + source2length + source3length;
1438     return destination;
1439 }
1440
1441 /**
1442 * Running through a collation element iterator to see if the contents matches
1443 * pattern in string search data
1444 * @param strsrch string search data
1445 * @param coleiter collation element iterator
1446 * @return TRUE if a match if found, FALSE otherwise
1447 */
1448 static
1449 inline UBool checkCollationMatch(const UStringSearch      *strsrch,
1450                                        UCollationElements *coleiter)
1451 {
1452     int         patternceindex = strsrch->pattern.CELength;
1453     int32_t    *patternce      = strsrch->pattern.CE;
1454     UErrorCode  status = U_ZERO_ERROR;
1455     while (patternceindex > 0) {
1456         int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
1457         if (ce == UCOL_IGNORABLE) {
1458             continue;
1459         }
1460         if (U_FAILURE(status) || ce != *patternce) {
1461             return FALSE;
1462         }
1463         patternce ++;
1464         patternceindex --;
1465     }
1466     return TRUE;
1467 }
1468
1469 /**
1470 * Rearranges the front accents to try matching.
1471 * Prefix accents in the text will be grouped according to their combining
1472 * class and the groups will be mixed and matched to try find the perfect
1473 * match with the pattern.
1474 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1475 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1476 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1477 *         "\u0301\u0325".
1478 * step 2: check if any of the generated substrings matches the pattern.
1479 * Internal method, status is assumed to be success, caller has to check status
1480 * before calling this method.
1481 * @param strsrch string search match
1482 * @param start first offset of the accents to start searching
1483 * @param end start of the last accent set
1484 * @param status output error status if any
1485 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1486 *         offset of the match. Note this start includes all preceding accents.
1487 */
1488 static
1489 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
1490                                        int32_t    start,
1491                                        int32_t    end,
1492                                        UErrorCode    *status)
1493 {
1494     const UChar       *text       = strsrch->search->text;
1495           int32_t      textlength = strsrch->search->textLength;
1496           int32_t  tempstart  = start;
1497
1498     if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
1499         // die... failed at a base character
1500         return USEARCH_DONE;
1501     }
1502
1503     int32_t offset = getNextBaseOffset(text, tempstart, textlength);
1504     start = getPreviousBaseOffset(text, tempstart);
1505
1506     UChar       accents[INITIAL_ARRAY_SIZE_];
1507     // normalizing the offensive string
1508     unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
1509                     INITIAL_ARRAY_SIZE_, status);
1510     if (U_FAILURE(*status)) {
1511         return USEARCH_DONE;
1512     }
1513
1514     int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
1515     int32_t         accentsize = getUnblockedAccentIndex(accents,
1516                                                                  accentsindex);
1517     int32_t         count      = (2 << (accentsize - 1)) - 1;
1518     UChar               buffer[INITIAL_ARRAY_SIZE_];
1519     UCollationElements *coleiter   = strsrch->utilIter;
1520     while (U_SUCCESS(*status) && count > 0) {
1521         UChar *rearrange = strsrch->canonicalPrefixAccents;
1522         // copy the base characters
1523         for (int k = 0; k < accentsindex[0]; k ++) {
1524             *rearrange ++ = accents[k];
1525         }
1526         // forming all possible canonical rearrangement by dropping
1527         // sets of accents
1528         for (int i = 0; i <= accentsize - 1; i ++) {
1529             int32_t mask = 1 << (accentsize - i - 1);
1530             if (count & mask) {
1531                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1532                     *rearrange ++ = accents[j];
1533                 }
1534             }
1535         }
1536         *rearrange = 0;
1537         int32_t  matchsize = INITIAL_ARRAY_SIZE_;
1538         UChar   *match     = addToUCharArray(buffer, &matchsize,
1539                                            strsrch->canonicalPrefixAccents,
1540                                            strsrch->search->text + offset,
1541                                            end - offset,
1542                                            strsrch->canonicalSuffixAccents,
1543                                            status);
1544
1545         // if status is a failure, ucol_setText does nothing.
1546         // run the collator iterator through this match
1547         ucol_setText(coleiter, match, matchsize, status);
1548         if (U_SUCCESS(*status)) {
1549             if (checkCollationMatch(strsrch, coleiter)) {
1550                 if (match != buffer) {
1551                     uprv_free(match);
1552                 }
1553                 return start;
1554             }
1555         }
1556         count --;
1557     }
1558     return USEARCH_DONE;
1559 }
1560
1561 /**
1562 * Gets the offset to the safe point in text before textoffset.
1563 * ie. not the middle of a contraction, swappable characters or supplementary
1564 * characters.
1565 * @param collator collation sata
1566 * @param text string to work with
1567 * @param textoffset offset in string
1568 * @param textlength length of text string
1569 * @return offset to the previous safe character
1570 */
1571 static
1572 inline uint32_t getPreviousSafeOffset(const UCollator   *collator,
1573                                       const UChar       *text,
1574                                             int32_t  textoffset)
1575 {
1576     int32_t result = textoffset; // first contraction character
1577     while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
1578         result --;
1579     }
1580     if (result != 0) {
1581         // the first contraction character is consider unsafe here
1582         result --;
1583     }
1584     return result;
1585 }
1586
1587 /**
1588 * Cleaning up after we passed the safe zone
1589 * @param strsrch string search data
1590 * @param safetext safe text array
1591 * @param safebuffer safe text buffer
1592 * @param coleiter collation element iterator for safe text
1593 */
1594 static
1595 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
1596                                   UChar         *safebuffer)
1597 {
1598     if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
1599     {
1600        uprv_free(safetext);
1601     }
1602 }
1603
1604 /**
1605 * Take the rearranged end accents and tries matching. If match failed at
1606 * a seperate preceding set of accents (seperated from the rearranged on by
1607 * at least a base character) then we rearrange the preceding accents and
1608 * tries matching again.
1609 * We allow skipping of the ends of the accent set if the ces do not match.
1610 * However if the failure is found before the accent set, it fails.
1611 * Internal method, status assumed to be success, caller has to check status
1612 * before calling this method.
1613 * @param strsrch string search data
1614 * @param textoffset of the start of the rearranged accent
1615 * @param status output error status if any
1616 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1617 *         offset of the match. Note this start includes all preceding accents.
1618 */
1619 static
1620 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
1621                                        int32_t    textoffset,
1622                                        UErrorCode    *status)
1623 {
1624     const UChar              *text           = strsrch->search->text;
1625     const UCollator          *collator       = strsrch->collator;
1626           int32_t             safelength     = 0;
1627           UChar              *safetext;
1628           int32_t             safetextlength;
1629           UChar               safebuffer[INITIAL_ARRAY_SIZE_];
1630           UCollationElements *coleiter       = strsrch->utilIter;
1631           int32_t         safeoffset     = textoffset;
1632
1633     if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
1634                                          collator)) {
1635         safeoffset     = getPreviousSafeOffset(collator, text, textoffset);
1636         safelength     = textoffset - safeoffset;
1637         safetextlength = INITIAL_ARRAY_SIZE_;
1638         safetext       = addToUCharArray(safebuffer, &safetextlength, NULL,
1639                                          text + safeoffset, safelength,
1640                                          strsrch->canonicalSuffixAccents,
1641                                          status);
1642     }
1643     else {
1644         safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
1645         safetext       = strsrch->canonicalSuffixAccents;
1646     }
1647
1648     // if status is a failure, ucol_setText does nothing
1649     ucol_setText(coleiter, safetext, safetextlength, status);
1650     // status checked in loop below
1651
1652     int32_t  *ce        = strsrch->pattern.CE;
1653     int32_t   celength  = strsrch->pattern.CELength;
1654     int       ceindex   = celength - 1;
1655     UBool     isSafe    = TRUE; // indication flag for position in safe zone
1656
1657     while (ceindex >= 0) {
1658         int32_t textce = ucol_previous(coleiter, status);
1659         if (U_FAILURE(*status)) {
1660             if (isSafe) {
1661                 cleanUpSafeText(strsrch, safetext, safebuffer);
1662             }
1663             return USEARCH_DONE;
1664         }
1665         if (textce == UCOL_NULLORDER) {
1666             // check if we have passed the safe buffer
1667             if (coleiter == strsrch->textIter) {
1668                 cleanUpSafeText(strsrch, safetext, safebuffer);
1669                 return USEARCH_DONE;
1670             }
1671             cleanUpSafeText(strsrch, safetext, safebuffer);
1672             safetext = safebuffer;
1673             coleiter = strsrch->textIter;
1674             setColEIterOffset(coleiter, safeoffset);
1675             // status checked at the start of the loop
1676             isSafe = FALSE;
1677             continue;
1678         }
1679         textce = getCE(strsrch, textce);
1680         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
1681             // do the beginning stuff
1682             int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
1683             if (isSafe && failedoffset >= safelength) {
1684                 // alas... no hope. failed at rearranged accent set
1685                 cleanUpSafeText(strsrch, safetext, safebuffer);
1686                 return USEARCH_DONE;
1687             }
1688             else {
1689                 if (isSafe) {
1690                     failedoffset += safeoffset;
1691                     cleanUpSafeText(strsrch, safetext, safebuffer);
1692                 }
1693
1694                 // try rearranging the front accents
1695                 int32_t result = doNextCanonicalPrefixMatch(strsrch,
1696                                         failedoffset, textoffset, status);
1697                 if (result != USEARCH_DONE) {
1698                     // if status is a failure, ucol_setOffset does nothing
1699                     setColEIterOffset(strsrch->textIter, result);
1700                 }
1701                 if (U_FAILURE(*status)) {
1702                     return USEARCH_DONE;
1703                 }
1704                 return result;
1705             }
1706         }
1707         if (textce == ce[ceindex]) {
1708             ceindex --;
1709         }
1710     }
1711     // set offset here
1712     if (isSafe) {
1713         int32_t result     = getColElemIterOffset(coleiter, FALSE);
1714         // sets the text iterator here with the correct expansion and offset
1715         int32_t    leftoverces = getExpansionPrefix(coleiter);
1716         cleanUpSafeText(strsrch, safetext, safebuffer);
1717         if (result >= safelength) {
1718             result = textoffset;
1719         }
1720         else {
1721             result += safeoffset;
1722         }
1723         setColEIterOffset(strsrch->textIter, result);
1724         strsrch->textIter->iteratordata_.toReturn =
1725                        setExpansionPrefix(strsrch->textIter, leftoverces);
1726         return result;
1727     }
1728
1729     return ucol_getOffset(coleiter);
1730 }
1731
1732 /**
1733 * Trying out the substring and sees if it can be a canonical match.
1734 * This will try normalizing the end accents and arranging them into canonical
1735 * equivalents and check their corresponding ces with the pattern ce.
1736 * Suffix accents in the text will be grouped according to their combining
1737 * class and the groups will be mixed and matched to try find the perfect
1738 * match with the pattern.
1739 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1740 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1741 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1742 *         "\u0301\u0325".
1743 * step 2: check if any of the generated substrings matches the pattern.
1744 * Internal method, status assumed to be success, caller has to check status
1745 * before calling this method.
1746 * @param strsrch string search data
1747 * @param textoffset end offset in the collation element text that ends with
1748 *                   the accents to be rearranged
1749 * @param status error status if any
1750 * @return TRUE if the match is valid, FALSE otherwise
1751 */
1752 static
1753 UBool doNextCanonicalMatch(UStringSearch *strsrch,
1754                            int32_t    textoffset,
1755                            UErrorCode    *status)
1756 {
1757     const UChar       *text = strsrch->search->text;
1758           int32_t  temp = textoffset;
1759     UTF_BACK_1(text, 0, temp);
1760     if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
1761         UCollationElements *coleiter = strsrch->textIter;
1762         int32_t         offset   = getColElemIterOffset(coleiter, FALSE);
1763         if (strsrch->pattern.hasPrefixAccents) {
1764             offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
1765                                                 status);
1766             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
1767                 setColEIterOffset(coleiter, offset);
1768                 return TRUE;
1769             }
1770         }
1771         return FALSE;
1772     }
1773
1774     if (!strsrch->pattern.hasSuffixAccents) {
1775         return FALSE;
1776     }
1777
1778     UChar       accents[INITIAL_ARRAY_SIZE_];
1779     // offset to the last base character in substring to search
1780     int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
1781     // normalizing the offensive string
1782     unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
1783                                0, accents, INITIAL_ARRAY_SIZE_, status);
1784     // status checked in loop below
1785
1786     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1787     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
1788
1789     // 2 power n - 1 plus the full set of accents
1790     int32_t  count = (2 << (size - 1)) - 1;
1791     while (U_SUCCESS(*status) && count > 0) {
1792         UChar *rearrange = strsrch->canonicalSuffixAccents;
1793         // copy the base characters
1794         for (int k = 0; k < accentsindex[0]; k ++) {
1795             *rearrange ++ = accents[k];
1796         }
1797         // forming all possible canonical rearrangement by dropping
1798         // sets of accents
1799         for (int i = 0; i <= size - 1; i ++) {
1800             int32_t mask = 1 << (size - i - 1);
1801             if (count & mask) {
1802                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1803                     *rearrange ++ = accents[j];
1804                 }
1805             }
1806         }
1807         *rearrange = 0;
1808         int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
1809                                                         status);
1810         if (offset != USEARCH_DONE) {
1811             return TRUE; // match found
1812         }
1813         count --;
1814     }
1815     return FALSE;
1816 }
1817
1818 /**
1819 * Gets the previous base character offset depending on the string search
1820 * pattern data
1821 * @param strsrch string search data
1822 * @param textoffset current offset, current character
1823 * @return the offset of the next character after this base character or itself
1824 *         if it is a composed character with accents
1825 */
1826 static
1827 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
1828                                                       int32_t textoffset)
1829 {
1830     if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
1831         const UChar       *text = strsrch->search->text;
1832               int32_t  offset = textoffset;
1833         if (getFCD(text, &offset, strsrch->search->textLength) >>
1834                                                    SECOND_LAST_BYTE_SHIFT_) {
1835             return getPreviousBaseOffset(text, textoffset);
1836         }
1837     }
1838     return textoffset;
1839 }
1840
1841 /**
1842 * Checks match for contraction.
1843 * If the match ends with a partial contraction we fail.
1844 * If the match starts too far off (because of backwards iteration) we try to
1845 * chip off the extra characters
1846 * Internal method, status assumed to be success, caller has to check status
1847 * before calling this method.
1848 * @param strsrch string search data
1849 * @param start offset of potential match, to be modified if necessary
1850 * @param end offset of potential match, to be modified if necessary
1851 * @param status output error status if any
1852 * @return TRUE if match passes the contraction test, FALSE otherwise
1853 */
1854 static
1855 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
1856                                          int32_t   *start,
1857                                          int32_t   *end,
1858                                          UErrorCode    *status)
1859 {
1860           UCollationElements *coleiter   = strsrch->textIter;
1861           int32_t             textlength = strsrch->search->textLength;
1862           int32_t         temp       = *start;
1863     const UCollator          *collator   = strsrch->collator;
1864     const UChar              *text       = strsrch->search->text;
1865     // This part checks if either ends of the match contains potential
1866     // contraction. If so we'll have to iterate through them
1867     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1868         (*start + 1 < textlength
1869          && ucol_unsafeCP(text[*start + 1], collator))) {
1870         int32_t expansion  = getExpansionPrefix(coleiter);
1871         UBool   expandflag = expansion > 0;
1872         setColEIterOffset(coleiter, *start);
1873         while (expansion > 0) {
1874             // getting rid of the redundant ce, caused by setOffset.
1875             // since backward contraction/expansion may have extra ces if we
1876             // are in the normalization buffer, hasAccentsBeforeMatch would
1877             // have taken care of it.
1878             // E.g. the character \u01FA will have an expansion of 3, but if
1879             // we are only looking for acute and ring \u030A and \u0301, we'll
1880             // have to skip the first ce in the expansion buffer.
1881             ucol_next(coleiter, status);
1882             if (U_FAILURE(*status)) {
1883                 return FALSE;
1884             }
1885             if (ucol_getOffset(coleiter) != temp) {
1886                 *start = temp;
1887                 temp  = ucol_getOffset(coleiter);
1888             }
1889             expansion --;
1890         }
1891
1892         int32_t  *patternce       = strsrch->pattern.CE;
1893         int32_t   patterncelength = strsrch->pattern.CELength;
1894         int32_t   count           = 0;
1895         int32_t   textlength      = strsrch->search->textLength;
1896         while (count < patterncelength) {
1897             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1898             // status checked below, note that if status is a failure
1899             // ucol_next returns UCOL_NULLORDER
1900             if (ce == UCOL_IGNORABLE) {
1901                 continue;
1902             }
1903             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1904                 *start = temp;
1905                 temp   = ucol_getOffset(coleiter);
1906             }
1907
1908             if (count == 0 && ce != patternce[0]) {
1909                 // accents may have extra starting ces, this occurs when a
1910                 // pure accent pattern is matched without rearrangement
1911                 // text \u0325\u0300 and looking for \u0300
1912                 int32_t expected = patternce[0];
1913                 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
1914                     ce = getCE(strsrch, ucol_next(coleiter, status));
1915                     while (U_SUCCESS(*status) && ce != expected &&
1916                            ce != UCOL_NULLORDER &&
1917                            ucol_getOffset(coleiter) <= *end) {
1918                         ce = getCE(strsrch, ucol_next(coleiter, status));
1919                     }
1920                 }
1921             }
1922             if (U_FAILURE(*status) || ce != patternce[count]) {
1923                 (*end) ++;
1924                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1925                 return FALSE;
1926             }
1927             count ++;
1928         }
1929     }
1930     return TRUE;
1931 }
1932
1933 /**
1934 * Checks and sets the match information if found.
1935 * Checks
1936 * <ul>
1937 * <li> the potential match does not repeat the previous match
1938 * <li> boundaries are correct
1939 * <li> potential match does not end in the middle of a contraction
1940 * <li> identical matches
1941 * <\ul>
1942 * Otherwise the offset will be shifted to the next character.
1943 * Internal method, status assumed to be success, caller has to check the
1944 * status before calling this method.
1945 * @param strsrch string search data
1946 * @param textoffset offset in the collation element text. the returned value
1947 *        will be the truncated end offset of the match or the new start
1948 *        search offset.
1949 * @param status output error status if any
1950 * @return TRUE if the match is valid, FALSE otherwise
1951 */
1952 static
1953 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
1954                                      int32_t   *textoffset,
1955                                      UErrorCode    *status)
1956 {
1957     // to ensure that the start and ends are not composite characters
1958     UCollationElements *coleiter = strsrch->textIter;
1959     // if we have a canonical accent match
1960     if ((strsrch->pattern.hasSuffixAccents &&
1961         strsrch->canonicalSuffixAccents[0]) ||
1962         (strsrch->pattern.hasPrefixAccents &&
1963         strsrch->canonicalPrefixAccents[0])) {
1964         strsrch->search->matchedIndex  = getPreviousUStringSearchBaseOffset(
1965                                                     strsrch,
1966                                                     ucol_getOffset(coleiter));
1967         strsrch->search->matchedLength = *textoffset -
1968                                                 strsrch->search->matchedIndex;
1969         return TRUE;
1970     }
1971
1972     int32_t start = getColElemIterOffset(coleiter, FALSE);
1973     if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
1974                                             status) || U_FAILURE(*status)) {
1975         return FALSE;
1976     }
1977
1978     start = getPreviousUStringSearchBaseOffset(strsrch, start);
1979     // this totally matches, however we need to check if it is repeating
1980     if (checkRepeatedMatch(strsrch, start, *textoffset) ||
1981         !isBreakUnit(strsrch, start, *textoffset) ||
1982         !checkIdentical(strsrch, start, *textoffset)) {
1983         (*textoffset) ++;
1984         *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
1985                                         strsrch->search->textLength);
1986         return FALSE;
1987     }
1988
1989     strsrch->search->matchedIndex  = start;
1990     strsrch->search->matchedLength = *textoffset - start;
1991     return TRUE;
1992 }
1993
1994 /**
1995 * Shifting the collation element iterator position forward to prepare for
1996 * a preceding match. If the first character is a unsafe character, we'll only
1997 * shift by 1 to capture contractions, normalization etc.
1998 * Internal method, status assumed to be success, caller has to check status
1999 * before calling this method.
2000 * @param text strsrch string search data
2001 * @param textoffset start text position to do search
2002 * @param ce the text ce which failed the match.
2003 * @param patternceindex index of the ce within the pattern ce buffer which
2004 *        failed the match
2005 * @return final offset
2006 */
2007 static
2008 inline int32_t reverseShift(UStringSearch *strsrch,
2009                                 int32_t    textoffset,
2010                                 int32_t       ce,
2011                                 int32_t        patternceindex)
2012 {
2013     if (strsrch->search->isOverlap) {
2014         if (textoffset != strsrch->search->textLength) {
2015             textoffset --;
2016         }
2017         else {
2018             textoffset -= strsrch->pattern.defaultShiftSize;
2019         }
2020     }
2021     else {
2022         if (ce != UCOL_NULLORDER) {
2023             int32_t shift = strsrch->pattern.backShift[hash(ce)];
2024
2025             // this is to adjust for characters in the middle of the substring
2026             // for matching that failed.
2027             int32_t adjust = patternceindex;
2028             if (adjust > 1 && shift > adjust) {
2029                 shift -= adjust - 1;
2030             }
2031             textoffset -= shift;
2032         }
2033         else {
2034             textoffset -= strsrch->pattern.defaultShiftSize;
2035         }
2036     }
2037     textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
2038     return textoffset;
2039 }
2040
2041 /**
2042 * Checks match for contraction.
2043 * If the match starts with a partial contraction we fail.
2044 * Internal method, status assumed to be success, caller has to check status
2045 * before calling this method.
2046 * @param strsrch string search data
2047 * @param start offset of potential match, to be modified if necessary
2048 * @param end offset of potential match, to be modified if necessary
2049 * @param status output error status if any
2050 * @return TRUE if match passes the contraction test, FALSE otherwise
2051 */
2052 static
2053 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
2054                                      int32_t   *start,
2055                                      int32_t   *end, UErrorCode  *status)
2056 {
2057           UCollationElements *coleiter   = strsrch->textIter;
2058           int32_t             textlength = strsrch->search->textLength;
2059           int32_t             temp       = *end;
2060     const UCollator          *collator   = strsrch->collator;
2061     const UChar              *text       = strsrch->search->text;
2062     // This part checks if either if the start of the match contains potential
2063     // contraction. If so we'll have to iterate through them
2064     // Since we used ucol_next while previously looking for the potential
2065     // match, this guarantees that our end will not be a partial contraction,
2066     // or a partial supplementary character.
2067     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2068         int32_t expansion  = getExpansionSuffix(coleiter);
2069         UBool   expandflag = expansion > 0;
2070         setColEIterOffset(coleiter, *end);
2071         while (U_SUCCESS(*status) && expansion > 0) {
2072             // getting rid of the redundant ce
2073             // since forward contraction/expansion may have extra ces
2074             // if we are in the normalization buffer, hasAccentsBeforeMatch
2075             // would have taken care of it.
2076             // E.g. the character \u01FA will have an expansion of 3, but if
2077             // we are only looking for A ring A\u030A, we'll have to skip the
2078             // last ce in the expansion buffer
2079             ucol_previous(coleiter, status);
2080             if (U_FAILURE(*status)) {
2081                 return FALSE;
2082             }
2083             if (ucol_getOffset(coleiter) != temp) {
2084                 *end = temp;
2085                 temp  = ucol_getOffset(coleiter);
2086             }
2087             expansion --;
2088         }
2089
2090         int32_t  *patternce       = strsrch->pattern.CE;
2091         int32_t   patterncelength = strsrch->pattern.CELength;
2092         int32_t   count           = patterncelength;
2093         while (count > 0) {
2094             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2095             // status checked below, note that if status is a failure
2096             // ucol_previous returns UCOL_NULLORDER
2097             if (ce == UCOL_IGNORABLE) {
2098                 continue;
2099             }
2100             if (expandflag && count == 0 &&
2101                 getColElemIterOffset(coleiter, FALSE) != temp) {
2102                 *end = temp;
2103                 temp  = ucol_getOffset(coleiter);
2104             }
2105             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2106                 (*start) --;
2107                 *start = getPreviousBaseOffset(text, *start);
2108                 return FALSE;
2109             }
2110             count --;
2111         }
2112     }
2113     return TRUE;
2114 }
2115
2116 /**
2117 * Checks and sets the match information if found.
2118 * Checks
2119 * <ul>
2120 * <li> the current match does not repeat the last match
2121 * <li> boundaries are correct
2122 * <li> exact matches has no extra accents
2123 * <li> identical matches
2124 * <\ul>
2125 * Otherwise the offset will be shifted to the preceding character.
2126 * Internal method, status assumed to be success, caller has to check status
2127 * before calling this method.
2128 * @param strsrch string search data
2129 * @param collator
2130 * @param coleiter collation element iterator
2131 * @param text string
2132 * @param textoffset offset in the collation element text. the returned value
2133 *        will be the truncated start offset of the match or the new start
2134 *        search offset.
2135 * @param status output error status if any
2136 * @return TRUE if the match is valid, FALSE otherwise
2137 */
2138 static
2139 inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
2140                                      int32_t   *textoffset,
2141                                      UErrorCode    *status)
2142 {
2143     // to ensure that the start and ends are not composite characters
2144     int32_t end = ucol_getOffset(strsrch->textIter);
2145     if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
2146         || U_FAILURE(*status)) {
2147             return FALSE;
2148     }
2149
2150     // this totally matches, however we need to check if it is repeating
2151     // the old match
2152     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2153         !isBreakUnit(strsrch, *textoffset, end) ||
2154         hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
2155         !checkIdentical(strsrch, *textoffset, end) ||
2156         hasAccentsAfterMatch(strsrch, *textoffset, end)) {
2157         (*textoffset) --;
2158         *textoffset = getPreviousBaseOffset(strsrch->search->text,
2159                                             *textoffset);
2160         return FALSE;
2161     }
2162
2163     //Add breakiterator boundary check for primary strength search.
2164     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
2165         checkBreakBoundary(strsrch, textoffset, &end);
2166     }
2167
2168     strsrch->search->matchedIndex = *textoffset;
2169     strsrch->search->matchedLength = end - *textoffset;
2170     return TRUE;
2171 }
2172
2173 /**
2174 * Rearranges the end accents to try matching.
2175 * Suffix accents in the text will be grouped according to their combining
2176 * class and the groups will be mixed and matched to try to find the perfect
2177 * match with the pattern.
2178 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2179 * step 1: split "\u030A\u0301\u0325" into 6 other potential accent substrings
2180 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2181 *         "\u0301\u0325".
2182 * step 2: check if any of the generated substrings matches the pattern.
2183 * Internal method, status assumed to be success, caller checks it beforehand.
2184 * A heap-allocated candidate string is released after every attempt.
2185 * @param strsrch string search match
2186 * @param start offset of the first base character
2187 * @param end start of the last accent set
2188 * @param status only error status if any
2189 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2190 *         offset of the match. Note this offset includes all following accents.
2191 */
2192 static
2193 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
2194                                            int32_t    start,
2195                                            int32_t    end,
2196                                            UErrorCode    *status)
2197 {
2198     const UChar       *text       = strsrch->search->text;
2199           int32_t  tempend    = end;
2200
2201     UTF_BACK_1(text, 0, tempend);
2202     if (!(getFCD(text, &tempend, strsrch->search->textLength) &
2203                                                            LAST_BYTE_MASK_)) {
2204         // die... failed at a base character
2205         return USEARCH_DONE;
2206     }
2207     end = getNextBaseOffset(text, end, strsrch->search->textLength);
2208
2209     if (U_SUCCESS(*status)) {
2210         UChar       accents[INITIAL_ARRAY_SIZE_];
2211         int32_t offset = getPreviousBaseOffset(text, end);
2212         // normalizing the offensive string
2213         unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
2214                         INITIAL_ARRAY_SIZE_, status);
2215         // a normalization failure is caught by the loop condition below
2216         int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
2217         int32_t         accentsize = getUnblockedAccentIndex(accents,
2218                                                          accentsindex);
2219         int32_t         count      = (2 << (accentsize - 1)) - 1; // group bitmask, all groups first
2220         UChar               buffer[INITIAL_ARRAY_SIZE_];
2221         UCollationElements *coleiter = strsrch->utilIter;
2222         while (U_SUCCESS(*status) && count > 0) {
2223             UChar *rearrange = strsrch->canonicalSuffixAccents;
2224             // copy the base characters
2225             for (int k = 0; k < accentsindex[0]; k ++) {
2226                 *rearrange ++ = accents[k];
2227             }
2228             // forming all possible canonical rearrangement by dropping
2229             // sets of accents
2230             for (int i = 0; i <= accentsize - 1; i ++) {
2231                 int32_t mask = 1 << (accentsize - i - 1);
2232                 if (count & mask) {
2233                     for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2234                         *rearrange ++ = accents[j];
2235                     }
2236                 }
2237             }
2238             *rearrange = 0;
2239             int32_t  matchsize = INITIAL_ARRAY_SIZE_;
2240             UChar   *match     = addToUCharArray(buffer, &matchsize,
2241                                            strsrch->canonicalPrefixAccents,
2242                                            strsrch->search->text + start,
2243                                            offset - start,
2244                                            strsrch->canonicalSuffixAccents,
2245                                            status);
2246
2247             // run the collator iterator through this match
2248             // if status is a failure ucol_setText does nothing
2249             ucol_setText(coleiter, match, matchsize, status);
2250             UBool matched = U_SUCCESS(*status) &&
2251                             checkCollationMatch(strsrch, coleiter);
2252             if (match != buffer) { // heap copy: release it on every attempt
2253                 uprv_free(match);
2254             }
2255             if (matched) {
2256                 return end;
2257             }
2258             count --;
2259         }
2260     }
2261     return USEARCH_DONE;
2262 }
2263
2264 /**
2265 * Takes the rearranged start accents and tries matching. If the match fails
2266 * at a separate following set of accents (separated from the rearranged one
2267 * by at least a base character) then we rearrange the preceding accents and
2268 * try matching again.
2269 * We allow skipping of the ends of the accent set if the ces do not match.
2270 * However if the failure is found before the accent set, it fails.
2271 * Internal method, status assumed to be success, caller has to check status
2272 * before calling this method.
2273 * @param strsrch string search data
2274 * @param textoffset of the ends of the rearranged accent
2275 * @param status output error status if any
2276 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2277 *         offset of the match. Note this offset includes all following accents.
2278 */
2279 static
2280 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
2281                                            int32_t    textoffset,
2282                                            UErrorCode    *status)
2283 {
2284     const UChar       *text       = strsrch->search->text;
2285     const UCollator   *collator   = strsrch->collator;
2286           int32_t      safelength = 0;
2287           UChar       *safetext;
2288           int32_t      safetextlength;
2289           UChar        safebuffer[INITIAL_ARRAY_SIZE_];
2290           int32_t  safeoffset = textoffset;
2291     // if the last rearranged accent is unsafe, extend to the next safe offset
2292     if (textoffset &&
2293         ucol_unsafeCP(strsrch->canonicalPrefixAccents[
2294                                  u_strlen(strsrch->canonicalPrefixAccents) - 1
2295                                          ], collator)) {
2296         safeoffset     = getNextSafeOffset(collator, text, textoffset,
2297                                            strsrch->search->textLength);
2298         safelength     = safeoffset - textoffset;
2299         safetextlength = INITIAL_ARRAY_SIZE_;
2300         safetext       = addToUCharArray(safebuffer, &safetextlength,
2301                                          strsrch->canonicalPrefixAccents,
2302                                          text + textoffset, safelength,
2303                                          NULL, status);
2304     }
2305     else {
2306         safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
2307         safetext       = strsrch->canonicalPrefixAccents; // NOTE(review): confirm cleanUpSafeText never frees this member
2308     }
2309
2310     UCollationElements *coleiter = strsrch->utilIter;
2311      // if status is a failure, ucol_setText does nothing
2312     ucol_setText(coleiter, safetext, safetextlength, status);
2313     // status checked in loop below
2314
2315     int32_t  *ce           = strsrch->pattern.CE;
2316     int32_t   celength     = strsrch->pattern.CELength;
2317     int       ceindex      = 0;
2318     UBool     isSafe       = TRUE; // safe zone indication flag for position
2319     int32_t   prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
2320     // walk forward until every pattern ce has been matched
2321     while (ceindex < celength) {
2322         int32_t textce = ucol_next(coleiter, status);
2323         if (U_FAILURE(*status)) {
2324             if (isSafe) {
2325                 cleanUpSafeText(strsrch, safetext, safebuffer);
2326             }
2327             return USEARCH_DONE;
2328         }
2329         if (textce == UCOL_NULLORDER) {
2330             // check if we have passed the safe buffer
2331             if (coleiter == strsrch->textIter) {
2332                 cleanUpSafeText(strsrch, safetext, safebuffer);
2333                 return USEARCH_DONE;
2334             }
2335             cleanUpSafeText(strsrch, safetext, safebuffer);
2336             safetext = safebuffer; // makes the later cleanUpSafeText calls see the stack buffer
2337             coleiter = strsrch->textIter; // continue in the real text, past the safe text
2338             setColEIterOffset(coleiter, safeoffset);
2339             // status checked at the start of the loop
2340             isSafe = FALSE;
2341             continue;
2342         }
2343         textce = getCE(strsrch, textce);
2344         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
2345             // do the beginning stuff
2346             int32_t failedoffset = ucol_getOffset(coleiter);
2347             if (isSafe && failedoffset <= prefixlength) {
2348                 // alas... no hope. failed at rearranged accent set
2349                 cleanUpSafeText(strsrch, safetext, safebuffer);
2350                 return USEARCH_DONE;
2351             }
2352             else {
2353                 if (isSafe) {
2354                     failedoffset = safeoffset - failedoffset;
2355                     cleanUpSafeText(strsrch, safetext, safebuffer);
2356                 }
2357
2358                 // try rearranging the end accents
2359                 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
2360                                         textoffset, failedoffset, status);
2361                 if (result != USEARCH_DONE) {
2362                     // if status is a failure, ucol_setOffset does nothing
2363                     setColEIterOffset(strsrch->textIter, result);
2364                 }
2365                 if (U_FAILURE(*status)) {
2366                     return USEARCH_DONE;
2367                 }
2368                 return result;
2369             }
2370         }
2371         if (textce == ce[ceindex]) { // ignorable ces fall through without advancing
2372             ceindex ++;
2373         }
2374     }
2375     // set offset here
2376     if (isSafe) {
2377         int32_t result      = ucol_getOffset(coleiter);
2378         // sets the text iterator here with the correct expansion and offset
2379         int32_t     leftoverces = getExpansionSuffix(coleiter);
2380         cleanUpSafeText(strsrch, safetext, safebuffer);
2381         if (result <= prefixlength) {
2382             result = textoffset;
2383         }
2384         else {
2385             result = textoffset + (safeoffset - result);
2386         }
2387         setColEIterOffset(strsrch->textIter, result);
2388         setExpansionSuffix(strsrch->textIter, leftoverces);
2389         return result;
2390     }
2391     // not in the safe text any more: coleiter is strsrch->textIter here
2392     return ucol_getOffset(coleiter);
2393 }
2394
2395 /**
2396 * Tries out the substring and sees if it can be a canonical match.
2397 * This will try normalizing the starting accents and arranging them into
2398 * canonical equivalents and check their corresponding ces with the pattern ce.
2399 * Prefix accents in the text will be grouped according to their combining
2400 * class and the groups will be mixed and matched to try to find the perfect
2401 * match with the pattern.
2402 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2403 * step 1: split "\u030A\u0301\u0325" into 6 other potential accent substrings
2404 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2405 *         "\u0301\u0325".
2406 * step 2: check if any of the generated substrings matches the pattern.
2407 * Internal method, status assumed to be success, caller has to check status
2408 * before calling this method.
2409 * @param strsrch string search data
2410 * @param textoffset start offset in the collation element text that starts
2411 *                   with the accents to be rearranged
2412 * @param status output error status if any
2413 * @return TRUE if the match is valid, FALSE otherwise
2414 */
2415 static
2416 UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
2417                                int32_t    textoffset,
2418                                UErrorCode    *status)
2419 {
2420     const UChar       *text       = strsrch->search->text;
2421           int32_t  temp       = textoffset;
2422           int32_t      textlength = strsrch->search->textLength;
2423     if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) { // upper FCD byte is zero
2424         UCollationElements *coleiter = strsrch->textIter;
2425         int32_t         offset   = ucol_getOffset(coleiter);
2426         if (strsrch->pattern.hasSuffixAccents) {
2427             offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
2428                                                     offset, status);
2429             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
2430                 setColEIterOffset(coleiter, offset);
2431                 return TRUE;
2432             }
2433         }
2434         return FALSE;
2435     }
2436
2437     if (!strsrch->pattern.hasPrefixAccents) {
2438         return FALSE;
2439     }
2440
2441     UChar       accents[INITIAL_ARRAY_SIZE_];
2442     // offset to the last base character in substring to search
2443     int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
2444     // normalizing the offensive string
2445     unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
2446                                0, accents, INITIAL_ARRAY_SIZE_, status);
2447     // status checked in loop
2448
2449     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2450     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
2451
2452     // 2^size - 1 non-empty subsets of the accent groups, the full set first
2453     int32_t  count = (2 << (size - 1)) - 1; // NOTE(review): shift count is -1 if size can be 0 - confirm
2454     while (U_SUCCESS(*status) && count > 0) {
2455         UChar *rearrange = strsrch->canonicalPrefixAccents;
2456         // copy the base characters
2457         for (int k = 0; k < accentsindex[0]; k ++) {
2458             *rearrange ++ = accents[k];
2459         }
2460         // forming all possible canonical rearrangement by dropping
2461         // sets of accents
2462         for (int i = 0; i <= size - 1; i ++) {
2463             int32_t mask = 1 << (size - i - 1);
2464             if (count & mask) {
2465                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2466                     *rearrange ++ = accents[j];
2467                 }
2468             }
2469         }
2470         *rearrange = 0;
2471         int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
2472                                                           baseoffset, status);
2473         if (offset != USEARCH_DONE) {
2474             return TRUE; // match found
2475         }
2476         count --;
2477     }
2478     return FALSE;
2479 }
2480
2481 /**
2482 * Checks match for contraction.
2483 * If the match starts with a partial contraction we fail.
2484 * Internal method, status assumed to be success, caller has to check status
2485 * before calling this method.
2486 * @param strsrch string search data
2487 * @param start offset of potential match, to be modified if necessary
2488 * @param end offset of potential match, to be modified if necessary
2489 * @param status only error status if any
2490 * @return TRUE if match passes the contraction test, FALSE otherwise
2491 */
2492 static
2493 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
2494                                      int32_t   *start,
2495                                      int32_t   *end, UErrorCode  *status)
2496 {
2497           UCollationElements *coleiter   = strsrch->textIter;
2498           int32_t             textlength = strsrch->search->textLength;
2499           int32_t         temp       = *end;
2500     const UCollator          *collator   = strsrch->collator;
2501     const UChar              *text       = strsrch->search->text;
2502     // This part checks whether the start of the match contains a potential
2503     // contraction. If so we'll have to iterate through them.
2504     // Since we used ucol_next while previously looking for the potential
2505     // match, this guarantees that our end will not be a partial contraction,
2506     // or a partial supplementary character.
2507     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2508         int32_t expansion  = getExpansionSuffix(coleiter);
2509         UBool   expandflag = expansion > 0;
2510         setColEIterOffset(coleiter, *end);
2511         while (expansion > 0) {
2512             // getting rid of the redundant ce
2513             // since forward contraction/expansion may have extra ces
2514             // if we are in the normalization buffer, hasAccentsBeforeMatch
2515             // would have taken care of it.
2516             // E.g. the character \u01FA will have an expansion of 3, but if
2517             // we are only looking for A ring A\u030A, we'll have to skip the
2518             // last ce in the expansion buffer
2519             ucol_previous(coleiter, status);
2520             if (U_FAILURE(*status)) {
2521                 return FALSE;
2522             }
2523             if (ucol_getOffset(coleiter) != temp) {
2524                 *end = temp;
2525                 temp  = ucol_getOffset(coleiter);
2526             }
2527             expansion --;
2528         }
2529         // compare the pattern ces from last to first against the text
2530         int32_t  *patternce       = strsrch->pattern.CE;
2531         int32_t   patterncelength = strsrch->pattern.CELength;
2532         int32_t   count           = patterncelength;
2533         while (count > 0) {
2534             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2535             // status checked below, note that if status is a failure
2536             // ucol_previous returns UCOL_NULLORDER
2537             if (ce == UCOL_IGNORABLE) {
2538                 continue;
2539             }
2540             if (expandflag && count == 0 && // NOTE(review): count > 0 inside this loop, so this branch never runs
2541                 getColElemIterOffset(coleiter, FALSE) != temp) {
2542                 *end = temp;
2543                 temp  = ucol_getOffset(coleiter);
2544             }
2545             if (count == patterncelength &&
2546                 ce != patternce[patterncelength - 1]) {
2547                 // accents may have extra starting ces, this occurs when a
2548                 // pure accent pattern is matched without rearrangement
2549                 int32_t    expected = patternce[patterncelength - 1];
2550                 UTF_BACK_1(text, 0, *end);
2551                 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
2552                     ce = getCE(strsrch, ucol_previous(coleiter, status));
2553                     while (U_SUCCESS(*status) && ce != expected &&
2554                            ce != UCOL_NULLORDER &&
2555                            ucol_getOffset(coleiter) <= *start) {
2556                         ce = getCE(strsrch, ucol_previous(coleiter, status));
2557                     }
2558                 }
2559             }
2560             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2561                 (*start) --; // mismatch: move the start back to the previous base offset
2562                 *start = getPreviousBaseOffset(text, *start);
2563                 return FALSE;
2564             }
2565             count --;
2566         }
2567     }
2568     return TRUE;
2569 }
2570
2571 /**
2572 * Checks and sets the match information if found.
2573 * Checks
2574 * <ul>
2575 * <li> the potential match does not repeat the previous match
2576 * <li> boundaries are correct
2577 * <li> potential match does not end in the middle of a contraction
2578 * <li> identical matches
2579 * </ul>
2580 * Otherwise the offset will be shifted to the previous base character.
2581 * Internal method, status assumed to be success, caller has to check status
2582 * before calling this method.
2583 * @param strsrch string search data
2584 * @param textoffset offset in the collation element text. the returned value
2585 *        will be the truncated start offset of the match or the new start
2586 *        search offset.
2587 * @param status only error status if any
2588 * @return TRUE if the match is valid, FALSE otherwise
2589 */
2590 static
2591 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
2592                                          int32_t   *textoffset,
2593                                          UErrorCode    *status)
2594 {
2595     // to ensure that the start and ends are not composite characters
2596     UCollationElements *coleiter = strsrch->textIter;
2597     // if we have a canonical accent match
2598     if ((strsrch->pattern.hasSuffixAccents &&
2599         strsrch->canonicalSuffixAccents[0]) ||
2600         (strsrch->pattern.hasPrefixAccents &&
2601         strsrch->canonicalPrefixAccents[0])) {
2602         strsrch->search->matchedIndex  = *textoffset; // accepted without the checks further down
2603         strsrch->search->matchedLength =
2604             getNextUStringSearchBaseOffset(strsrch,
2605                                       getColElemIterOffset(coleiter, FALSE))
2606             - *textoffset;
2607         return TRUE;
2608     }
2609
2610     int32_t end = ucol_getOffset(coleiter);
2611     if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
2612                                                 status) ||
2613          U_FAILURE(*status)) {
2614         return FALSE;
2615     }
2616
2617     end = getNextUStringSearchBaseOffset(strsrch, end);
2618     // this totally matches, however we need to check if it is repeating
2619     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2620         !isBreakUnit(strsrch, *textoffset, end) ||
2621         !checkIdentical(strsrch, *textoffset, end)) {
2622         (*textoffset) --; // rejected: resume from the previous base offset
2623         *textoffset = getPreviousBaseOffset(strsrch->search->text,
2624                                             *textoffset);
2625         return FALSE;
2626     }
2627
2628     strsrch->search->matchedIndex  = *textoffset;
2629     strsrch->search->matchedLength = end - *textoffset;
2630     return TRUE;
2631 }
2632 #endif // #if BOYER_MOORE
2633
2634 // constructors and destructor -------------------------------------------
2635
2636 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
2637                                           int32_t         patternlength,
2638                                     const UChar          *text,
2639                                           int32_t         textlength,
2640                                     const char           *locale,
2641                                           UBreakIterator *breakiter,
2642                                           UErrorCode     *status)
2643 {
2644     if (U_FAILURE(*status)) {
2645         return NULL;
2646     }
2647 #if UCONFIG_NO_BREAK_ITERATION
2648     if (breakiter != NULL) {
2649         *status = U_UNSUPPORTED_ERROR;
2650         return NULL;
2651     }
2652 #endif
2653     // a locale is mandatory: it selects the collator used for matching
2654     if (locale == NULL) {
2655         *status = U_ILLEGAL_ARGUMENT_ERROR;
2656         return NULL;
2657     }
2658     // ucol_open checks status itself; pattern and text are validated by
2659     // usearch_openFromCollator
2660     UCollator     *ownedCollator = ucol_open(locale, status);
2661     UStringSearch *search        = usearch_openFromCollator(pattern,
2662                                        patternlength, text, textlength,
2663                                        ownedCollator, breakiter, status);
2664     if (search != NULL && !U_FAILURE(*status)) {
2665         // mark the collator as belonging to the new search object
2666         search->ownCollator = TRUE;
2667         return search;
2668     }
2669     // opening failed: the collator, if one was created, is released here
2670     if (ownedCollator != NULL) {
2671         ucol_close(ownedCollator);
2672     }
2673     return NULL;
2674 }
2675
2676 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
2677                                   const UChar          *pattern,
2678                                         int32_t         patternlength,
2679                                   const UChar          *text,
2680                                         int32_t         textlength,
2681                                   const UCollator      *collator,
2682                                         UBreakIterator *breakiter,
2683                                         UErrorCode     *status)
2684 {   // builds a search object on the given collator; NULL on every failure path
2685     if (U_FAILURE(*status)) {
2686         return NULL;
2687     }
2688 #if UCONFIG_NO_BREAK_ITERATION
2689     if (breakiter != NULL) {
2690         *status = U_UNSUPPORTED_ERROR;
2691         return NULL;
2692     }
2693 #endif
2694     if (pattern == NULL || text == NULL || collator == NULL) {
2695         *status = U_ILLEGAL_ARGUMENT_ERROR;
2696         return NULL;
2697     }
2698
2699     // string search does not really work when numeric collation is turned on
2700     if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
2701         *status = U_UNSUPPORTED_ERROR;
2702         return NULL;
2703     }
2704
2705     if (U_SUCCESS(*status)) {
2706         initializeFCD(status);
2707         if (U_FAILURE(*status)) {
2708             return NULL;
2709         }
2710
2711         UStringSearch *result;
2712         if (textlength == -1) { // -1 means the length is taken from u_strlen
2713             textlength = u_strlen(text);
2714         }
2715         if (patternlength == -1) {
2716             patternlength = u_strlen(pattern);
2717         }
2718         if (textlength <= 0 || patternlength <= 0) { // empty text or pattern is rejected
2719             *status = U_ILLEGAL_ARGUMENT_ERROR;
2720             return NULL;
2721         }
2722
2723         result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
2724         if (result == NULL) {
2725             *status = U_MEMORY_ALLOCATION_ERROR;
2726             return NULL;
2727         }
2728
2729         result->collator    = collator;
2730         result->strength    = ucol_getStrength(collator);
2731         result->ceMask      = getMask(result->strength);
2732         result->toShift     =
2733              ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2734                                                             UCOL_SHIFTED;
2735         result->variableTop = ucol_getVariableTop(collator, status);
2736
2737         if (U_FAILURE(*status)) {
2738             uprv_free(result); // nothing else is attached to result yet
2739             return NULL;
2740         }
2741
2742         result->search             = (USearch *)uprv_malloc(sizeof(USearch));
2743         if (result->search == NULL) {
2744             *status = U_MEMORY_ALLOCATION_ERROR;
2745             uprv_free(result);
2746             return NULL;
2747         }
2748
2749         result->search->text       = text; // pointers are stored, the strings are not copied here
2750         result->search->textLength = textlength;
2751
2752         result->pattern.text       = pattern;
2753         result->pattern.textLength = patternlength;
2754         result->pattern.CE         = NULL;
2755         result->pattern.PCE        = NULL;
2756
2757         result->search->breakIter  = breakiter;
2758 #if !UCONFIG_NO_BREAK_ITERATION
2759         result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocale(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
2760         if (breakiter) {
2761                 ubrk_setText(breakiter, text, textlength, status);
2762         }
2763 #endif
2764         // fields read by usearch_close are all set before its first use below
2765         result->ownCollator           = FALSE;
2766         result->search->matchedLength = 0;
2767         result->search->matchedIndex  = USEARCH_DONE;
2768         result->utilIter              = NULL;
2769         result->textIter              = ucol_openElements(collator, text,
2770                                                           textlength, status);
2771         if (U_FAILURE(*status)) { // also covers errors from the break iterator calls above
2772             usearch_close(result);
2773             return NULL;
2774         }
2775
2776         result->search->isOverlap          = FALSE;
2777         result->search->isCanonicalMatch   = FALSE;
2778         result->search->isForwardSearching = TRUE;
2779         result->search->reset              = TRUE;
2780
2781         initialize(result, status);
2782
2783         if (U_FAILURE(*status)) {
2784             usearch_close(result);
2785             return NULL;
2786         }
2787
2788         return result;
2789     }
2790     return NULL; // reached when the numeric-collation query itself failed
2791 }
2792
2793 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
2794 {   // releases a search object and what it owns; a NULL argument is a no-op
2795     if (strsrch) {
2796         if (strsrch->pattern.CE != strsrch->pattern.CEBuffer &&
2797             strsrch->pattern.CE) {
2798             uprv_free(strsrch->pattern.CE); // freed only when it is not the embedded CEBuffer
2799         }
2800
2801         if (strsrch->pattern.PCE != NULL &&
2802             strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) {
2803             uprv_free(strsrch->pattern.PCE); // same rule for the PCE table
2804         }
2805
2806         ucol_closeElements(strsrch->textIter);
2807         ucol_closeElements(strsrch->utilIter);
2808         // the collator is closed only when this object was marked as its owner
2809         if (strsrch->ownCollator && strsrch->collator) {
2810             ucol_close((UCollator *)strsrch->collator);
2811         }
2812         // only internalBreakIter is closed; search->breakIter is left untouched
2813 #if !UCONFIG_NO_BREAK_ITERATION
2814         if (strsrch->search->internalBreakIter) {
2815                 ubrk_close(strsrch->search->internalBreakIter);
2816         }
2817 #endif
2818
2819         uprv_free(strsrch->search);
2820         uprv_free(strsrch);
2821     }
2822 }
2823
2824 // set and get methods --------------------------------------------------
2825
2826 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
2827                                         int32_t position, UErrorCode *status)
2828 {
2829     if (!U_SUCCESS(*status) || strsrch == NULL) {
2830         return;
2831     }
2832     if (!isOutOfBounds(strsrch->search->textLength, position)) {
2833         setColEIterOffset(strsrch->textIter, position);
2834     }
2835     else {
2836         *status = U_INDEX_OUTOFBOUNDS_ERROR;
2837     }
2838     strsrch->search->matchedIndex  = USEARCH_DONE; // the previous match is
2839     strsrch->search->matchedLength = 0;            // dropped even when the
2840     strsrch->search->reset         = FALSE;        // position was rejected
2841 }
2842
2843 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
2844 {
2845     // no search object: there is no iterator position to report
2846     if (strsrch == NULL) {
2847         return USEARCH_DONE;
2848     }
2849     // report the text iterator position only while it is not out of bounds
2850     int32_t position = ucol_getOffset(strsrch->textIter);
2851     return isOutOfBounds(strsrch->search->textLength, position)
2852                ? USEARCH_DONE : position;
2853 }
2854
2855 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
2856                                  USearchAttribute attribute,
2857                                  USearchAttributeValue value,
2858                                  UErrorCode *status)
2859 {
2860     // TRUE only for USEARCH_ON; every other value switches the option off
2861     const UBool enable = (value == USEARCH_ON) ? TRUE : FALSE;
2862     if (U_SUCCESS(*status) && strsrch) {
2863         if (attribute == USEARCH_OVERLAP) {
2864             strsrch->search->isOverlap = enable;
2865         }
2866         else if (attribute == USEARCH_CANONICAL_MATCH) {
2867             strsrch->search->isCanonicalMatch = enable;
2868         }
2869         else {
2870             // USEARCH_ATTRIBUTE_COUNT and unknown attributes are rejected
2871             *status = U_ILLEGAL_ARGUMENT_ERROR;
2872         }
2873     }
2874     // the value sentinel is rejected independently of the checks above
2875     if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
2876         *status = U_ILLEGAL_ARGUMENT_ERROR;
2877     }
2878 }
2879
2880 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
2881                                                 const UStringSearch *strsrch,
2882                                                 USearchAttribute attribute)
2883 {
2884     // without a search object every attribute reads as USEARCH_DEFAULT
2885     if (strsrch == NULL) {
2886         return USEARCH_DEFAULT;
2887     }
2888     if (attribute == USEARCH_OVERLAP) {
2889         return strsrch->search->isOverlap == TRUE ? USEARCH_ON : USEARCH_OFF;
2890     }
2891     if (attribute == USEARCH_CANONICAL_MATCH) {
2892         return strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON
2893                                                          : USEARCH_OFF;
2894     }
2895     // USEARCH_ATTRIBUTE_COUNT and any other value have no stored setting
2896     return USEARCH_DEFAULT;
2897 }
2898
2899 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
2900                                                 const UStringSearch *strsrch)
2901 {   // stored match index, or USEARCH_DONE when there is no search object
2902     if (strsrch != NULL) {
2903         return strsrch->search->matchedIndex;
2904     }
2905     return USEARCH_DONE;
2906 }
2907
2908
2909 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
2910                                             UChar         *result,
2911                                             int32_t        resultCapacity,
2912                                             UErrorCode    *status)
2913 {
2914     if (U_FAILURE(*status)) {
2915         return USEARCH_DONE;
2916     }
2917     // a search object is required, and a buffer whenever capacity is positive
2918     if (strsrch == NULL || resultCapacity < 0 ||
2919         (result == NULL && resultCapacity > 0)) {
2920         *status = U_ILLEGAL_ARGUMENT_ERROR;
2921         return USEARCH_DONE;
2922     }
2923
2924     const int32_t matchStart  = strsrch->search->matchedIndex;
2925     const int32_t matchLength = strsrch->search->matchedLength;
2926     if (matchStart == USEARCH_DONE) {
2927         // no current match: terminate with length 0 and report USEARCH_DONE
2928         u_terminateUChars(result, resultCapacity, 0, status);
2929         return USEARCH_DONE;
2930     }
2931
2932     // copy only what fits; the untruncated length is passed on for reporting
2933     int32_t units = resultCapacity < matchLength ? resultCapacity : matchLength;
2934     if (units > 0) {
2935         uprv_memcpy(result, strsrch->search->text + matchStart,
2936                     units * sizeof(UChar));
2937     }
2938     return u_terminateUChars(result, resultCapacity, matchLength, status);
2939 }
2940
2941 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
2942                                               const UStringSearch *strsrch)
2943 {
2944     if (strsrch) {
2945         return strsrch->search->matchedLength;
2946     }
2947     return USEARCH_DONE;
2948 }
2949
2950 #if !UCONFIG_NO_BREAK_ITERATION
2951
2952 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch  *strsrch,
2953                                                UBreakIterator *breakiter,
2954                                                UErrorCode     *status)
2955 {
2956     if (U_SUCCESS(*status) && strsrch) {
2957         strsrch->search->breakIter = breakiter;
2958         if (breakiter) {
2959             ubrk_setText(breakiter, strsrch->search->text,
2960                          strsrch->search->textLength, status);
2961         }
2962     }
2963 }
2964
2965 U_CAPI const UBreakIterator* U_EXPORT2
2966 usearch_getBreakIterator(const UStringSearch *strsrch)
2967 {
2968     if (strsrch) {
2969         return strsrch->search->breakIter;
2970     }
2971     return NULL;
2972 }
2973
2974 #endif
2975
2976 U_CAPI void U_EXPORT2 usearch_setText(      UStringSearch *strsrch,
2977                                       const UChar         *text,
2978                                             int32_t        textlength,
2979                                             UErrorCode    *status)
2980 {
2981     if (U_SUCCESS(*status)) {
2982         if (strsrch == NULL || text == NULL || textlength < -1 ||
2983             textlength == 0) {
2984             *status = U_ILLEGAL_ARGUMENT_ERROR;
2985         }
2986         else {
2987             if (textlength == -1) {
2988                 textlength = u_strlen(text);
2989             }
2990             strsrch->search->text       = text;
2991             strsrch->search->textLength = textlength;
2992             ucol_setText(strsrch->textIter, text, textlength, status);
2993             strsrch->search->matchedIndex  = USEARCH_DONE;
2994             strsrch->search->matchedLength = 0;
2995             strsrch->search->reset         = TRUE;
2996 #if !UCONFIG_NO_BREAK_ITERATION
2997             if (strsrch->search->breakIter != NULL) {
2998                 ubrk_setText(strsrch->search->breakIter, text,
2999                              textlength, status);
3000             }
3001             ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
3002 #endif
3003         }
3004     }
3005 }
3006
3007 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
3008                                                      int32_t       *length)
3009 {
3010     if (strsrch) {
3011         *length = strsrch->search->textLength;
3012         return strsrch->search->text;
3013     }
3014     return NULL;
3015 }
3016
3017 U_CAPI void U_EXPORT2 usearch_setCollator(      UStringSearch *strsrch,
3018                                           const UCollator     *collator,
3019                                                 UErrorCode    *status)
3020 {
3021     if (U_SUCCESS(*status)) {
3022         if (collator == NULL) {
3023             *status = U_ILLEGAL_ARGUMENT_ERROR;
3024             return;
3025         }
3026
3027         if (strsrch) {
3028             if (strsrch->ownCollator && (strsrch->collator != collator)) {
3029                 ucol_close((UCollator *)strsrch->collator);
3030                 strsrch->ownCollator = FALSE;
3031             }
3032             strsrch->collator    = collator;
3033             strsrch->strength    = ucol_getStrength(collator);
3034             strsrch->ceMask      = getMask(strsrch->strength);
3035 #if !UCONFIG_NO_BREAK_ITERATION
3036                 ubrk_close(strsrch->search->internalBreakIter);
3037                 strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocale(collator, ULOC_VALID_LOCALE, status),
3038                                                                                                  strsrch->search->text, strsrch->search->textLength, status);
3039 #endif
3040             // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3041             strsrch->toShift     =
3042                ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
3043                                                                 UCOL_SHIFTED;
3044             // if status is a failure, ucol_getVariableTop returns 0
3045             strsrch->variableTop = ucol_getVariableTop(collator, status);
3046             if (U_SUCCESS(*status)) {
3047                 initialize(strsrch, status);
3048                 if (U_SUCCESS(*status)) {
3049                     /* free offset buffer to avoid memory leak before initializing. */
3050                     freeOffsetBuffer(&(strsrch->textIter->iteratordata_));
3051                     uprv_init_collIterate(collator, strsrch->search->text,
3052                                           strsrch->search->textLength,
3053                                           &(strsrch->textIter->iteratordata_));
3054                     strsrch->utilIter->iteratordata_.coll = collator;
3055                 }
3056             }
3057         }
3058
3059         // **** are these calls needed?
3060         // **** we call uprv_init_pce in initializePatternPCETable
3061         // **** and the CEBuffer constructor...
3062 #if 0
3063         uprv_init_pce(strsrch->textIter);
3064         uprv_init_pce(strsrch->utilIter);
3065 #endif
3066     }
3067 }
3068
3069 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
3070 {
3071     if (strsrch) {
3072         return (UCollator *)strsrch->collator;
3073     }
3074     return NULL;
3075 }
3076
3077 U_CAPI void U_EXPORT2 usearch_setPattern(      UStringSearch *strsrch,
3078                                          const UChar         *pattern,
3079                                                int32_t        patternlength,
3080                                                UErrorCode    *status)
3081 {
3082     if (U_SUCCESS(*status)) {
3083         if (strsrch == NULL || pattern == NULL) {
3084             *status = U_ILLEGAL_ARGUMENT_ERROR;
3085         }
3086         else {
3087             if (patternlength == -1) {
3088                 patternlength = u_strlen(pattern);
3089             }
3090             if (patternlength == 0) {
3091                 *status = U_ILLEGAL_ARGUMENT_ERROR;
3092                 return;
3093             }
3094             strsrch->pattern.text       = pattern;
3095             strsrch->pattern.textLength = patternlength;
3096             initialize(strsrch, status);
3097         }
3098     }
3099 }
3100
3101 U_CAPI const UChar* U_EXPORT2
3102 usearch_getPattern(const UStringSearch *strsrch,
3103                    int32_t       *length)
3104 {
3105     if (strsrch) {
3106         *length = strsrch->pattern.textLength;
3107         return strsrch->pattern.text;
3108     }
3109     return NULL;
3110 }
3111
3112 // miscellaneous methods -------------------------------------------------
3113
3114 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
3115                                            UErrorCode    *status)
3116 {
3117     if (strsrch && U_SUCCESS(*status)) {
3118         strsrch->search->isForwardSearching = TRUE;
3119         usearch_setOffset(strsrch, 0, status);
3120         if (U_SUCCESS(*status)) {
3121             return usearch_next(strsrch, status);
3122         }
3123     }
3124     return USEARCH_DONE;
3125 }
3126
3127 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
3128                                                int32_t    position,
3129                                                UErrorCode    *status)
3130 {
3131     if (strsrch && U_SUCCESS(*status)) {
3132         strsrch->search->isForwardSearching = TRUE;
3133         // position checked in usearch_setOffset
3134         usearch_setOffset(strsrch, position, status);
3135         if (U_SUCCESS(*status)) {
3136             return usearch_next(strsrch, status);
3137         }
3138     }
3139     return USEARCH_DONE;
3140 }
3141
3142 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
3143                                           UErrorCode    *status)
3144 {
3145     if (strsrch && U_SUCCESS(*status)) {
3146         strsrch->search->isForwardSearching = FALSE;
3147         usearch_setOffset(strsrch, strsrch->search->textLength, status);
3148         if (U_SUCCESS(*status)) {
3149             return usearch_previous(strsrch, status);
3150         }
3151     }
3152     return USEARCH_DONE;
3153 }
3154
3155 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
3156                                                int32_t    position,
3157                                                UErrorCode    *status)
3158 {
3159     if (strsrch && U_SUCCESS(*status)) {
3160         strsrch->search->isForwardSearching = FALSE;
3161         // position checked in usearch_setOffset
3162         usearch_setOffset(strsrch, position, status);
3163         if (U_SUCCESS(*status)) {
3164             return usearch_previous(strsrch, status);
3165         }
3166     }
3167     return USEARCH_DONE;
3168 }
3169
3170 /**
3171 * If a direction switch is required, we'll count the number of ces till the
3172 * beginning of the collation element iterator and iterate forwards that
3173 * number of times. This is so that we get to the correct point within the
3174 * string to continue the search in. Imagine being in the middle of the
3175 * normalization buffer when the change in direction is requested.
3176 * After searching the offset within the collation element iterator will be
3177 * shifted to the start of the match. If a match is not found, the offset would
3178 * have been set to the end of the text string in the collation element
3179 * iterator.
3180 * Okay, here's my take on normalization buffer. The only time when there can
3181 * be 2 matches within the same normalization is when the pattern consists
3182 * of all accents. But since the offset returned is from the text string, we
3183 * should not confuse the caller by returning the second match within the
3184 * same normalization buffer. If we do, the 2 results will have the same match
3185 * offsets, and that'll be confusing. I'll return the next match that doesn't
3186 * fall within the same normalization buffer. Note this does not affect the
3187 * results of matches spanning the text and the normalization buffer.
3188 * The position to start searching is taken from the collation element
3189 * iterator. Callers of this API would have to set the offset in the collation
3190 * element iterator before using this method.
     *
     * @param strsrch string search object; may be NULL
     * @param status  error code; nothing is done if it signals a failure
     * @return the index of the match (search->matchedIndex), or USEARCH_DONE
     *         when there is no further match, strsrch is NULL, or status
     *         signals a failure
3191 */
3192 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
3193                                           UErrorCode    *status)
3194 {
3195     if (U_SUCCESS(*status) && strsrch) {
3196         // note offset is either equivalent to the start of the previous match
3197         // or is set by the user
3198         int32_t      offset       = usearch_getOffset(strsrch);
3199         USearch     *search       = strsrch->search;
3200         search->reset             = FALSE;
3201         int32_t      textlength   = search->textLength;
3202         if (search->isForwardSearching) {
                 // Already searching forwards: bail out early when the text is
                 // exhausted. BOYER_MOORE is defined as 0 at the top of this
                 // file, so the #else branch is the one that is compiled.
3203 #if BOYER_MOORE
3204             if (offset == textlength
3205                 || (!search->isOverlap &&
3206                     (offset + strsrch->pattern.defaultShiftSize > textlength ||
3207                     (search->matchedIndex != USEARCH_DONE &&
3208                      offset + search->matchedLength >= textlength)))) {
3209                 // not enough characters to match
3210                 setMatchNotFound(strsrch);
3211                 return USEARCH_DONE;
3212             }
3213 #else
                 // At the end of the text, or (non-overlapping mode) the
                 // previous match already extends past the end of the text.
3214             if (offset == textlength ||
3215                 (! search->isOverlap &&
3216                 (search->matchedIndex != USEARCH_DONE &&
3217                 offset + search->matchedLength > textlength))) {
3218                     // not enough characters to match
3219                     setMatchNotFound(strsrch);
3220                     return USEARCH_DONE;
3221             }
3222 #endif
3223         }
3224         else {
3225             // switching direction.
3226             // if matchedIndex == USEARCH_DONE, it means that either a
3227             // setOffset has been called or that previous ran off the text
3228             // string. the iterator would have been set to offset 0 if a
3229             // match is not found.
3230             search->isForwardSearching = TRUE;
3231             if (search->matchedIndex != USEARCH_DONE) {
3232                 // there's no need to set the collation element iterator
3233                 // the next call to next will set the offset.
3234                 return search->matchedIndex;
3235             }
3236         }
3237
3238         if (U_SUCCESS(*status)) {
3239             if (strsrch->pattern.CELength == 0) {
                     // The pattern has no collation elements: every code
                     // point position counts as a zero-length match.
3240                 if (search->matchedIndex == USEARCH_DONE) {
3241                     search->matchedIndex = offset;
3242                 }
3243                 else { // moves by codepoints
3244                     UTF_FWD_1(search->text, search->matchedIndex, textlength);
3245                 }
3246
3247                 search->matchedLength = 0;
3248                 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3249                 // status checked below
3250                 if (search->matchedIndex == textlength) {
3251                     search->matchedIndex = USEARCH_DONE;
3252                 }
3253             }
3254             else {
3255                 if (search->matchedLength > 0) {
3256                     // if matchlength is 0 we are at the start of the iteration
                         // Resume one unit past the previous match start when
                         // overlapping matches are allowed, otherwise just
                         // past the end of the previous match.
3257                     if (search->isOverlap) {
3258                         ucol_setOffset(strsrch->textIter, offset + 1, status);
3259                     }
3260                     else {
3261                         ucol_setOffset(strsrch->textIter,
3262                                        offset + search->matchedLength, status);
3263                     }
3264                 }
3265                 else {
3266                     // for boundary check purposes. this will ensure that the
3267                     // next match will not precede the current offset
3268                     // note search->matchedIndex will always be set to something
3269                     // in the code
3270                     search->matchedIndex = offset - 1;
3271                 }
3272
3273                 if (search->isCanonicalMatch) {
3274                     // can't use exact here since extra accents are allowed.
3275                     usearch_handleNextCanonical(strsrch, status);
3276                 }
3277                 else {
3278                     usearch_handleNextExact(strsrch, status);
3279                 }
3280             }
3281
3282             if (U_FAILURE(*status)) {
3283                 return USEARCH_DONE;
3284             }
3285
                 // Leave the text iterator at the match start, or at the end
                 // of the text when nothing was found.
3286 #if !BOYER_MOORE
3287             if (search->matchedIndex == USEARCH_DONE) {
3288                 ucol_setOffset(strsrch->textIter, search->textLength, status);
3289             } else {
3290                 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
3291             }
3292 #endif
3293
3294             return search->matchedIndex;
3295         }
3296     }
3297     return USEARCH_DONE;
3298 }
3299
/**
 * Searches backwards for the previous match.
 *
 * After a reset the search starts from the end of the text; otherwise it
 * continues from the offset reported by usearch_getOffset. When the last
 * search ran forwards, the direction is flipped and an existing match is
 * returned again without searching.
 *
 * @param strsrch string search object; may be NULL
 * @param status  error code; nothing is done if it signals a failure
 * @return the index of the match (search->matchedIndex), or USEARCH_DONE when
 *         there is no further match, strsrch is NULL, or status signals a
 *         failure
 */
3300 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
3301                                               UErrorCode *status)
3302 {
3303     if (U_SUCCESS(*status) && strsrch) {
3304         int32_t offset;
3305         USearch *search = strsrch->search;
3306         if (search->reset) {
                 // First call after a reset: start from the end of the text.
3307             offset                     = search->textLength;
3308             search->isForwardSearching = FALSE;
3309             search->reset              = FALSE;
3310             setColEIterOffset(strsrch->textIter, offset);
3311         }
3312         else {
3313             offset = usearch_getOffset(strsrch);
3314         }
3315
3316         int32_t matchedindex = search->matchedIndex;
3317         if (search->isForwardSearching == TRUE) {
3318             // switching direction.
3319             // if matchedIndex == USEARCH_DONE, it means that either a
3320             // setOffset has been called or that next ran off the text
3321             // string. the iterator would have been set to offset textLength if
3322             // a match is not found.
3323             search->isForwardSearching = FALSE;
3324             if (matchedindex != USEARCH_DONE) {
3325                 return matchedindex;
3326             }
3327         }
3328         else {
                 // BOYER_MOORE is defined as 0 at the top of this file, so the
                 // #else branch is the one that is compiled.
3329 #if BOYER_MOORE
3330             if (offset == 0 || matchedindex == 0 ||
3331                 (!search->isOverlap &&
3332                     (offset < strsrch->pattern.defaultShiftSize ||
3333                     (matchedindex != USEARCH_DONE &&
3334                     matchedindex < strsrch->pattern.defaultShiftSize)))) {
3335                 // not enough characters to match
3336                 setMatchNotFound(strsrch);
3337                 return USEARCH_DONE;
3338             }
3339 #else
3340             // Could check pattern length, but the
3341             // linear search will do the right thing
3342             if (offset == 0 || matchedindex == 0) {
3343                 setMatchNotFound(strsrch);
3344                 return USEARCH_DONE;
3345             }
3346 #endif
3347         }
3348
3349         if (U_SUCCESS(*status)) {
3350             if (strsrch->pattern.CELength == 0) {
                     // The pattern has no collation elements: step back one
                     // code point at a time, reporting zero-length matches.
3351                 search->matchedIndex =
3352                       (matchedindex == USEARCH_DONE ? offset : matchedindex);
3353                 if (search->matchedIndex == 0) {
3354                     setMatchNotFound(strsrch);
3355                     // status checked below
3356                 }
3357                 else { // move by codepoints
3358                     UTF_BACK_1(search->text, 0, search->matchedIndex);
3359                     setColEIterOffset(strsrch->textIter, search->matchedIndex);
3360                     // status checked below
3361                     search->matchedLength = 0;
3362                 }
3363             }
3364             else {
3365 #if !BOYER_MOORE
3366                 if (search->matchedIndex != USEARCH_DONE) {
3367                     if (search->isOverlap) {
                             // NOTE(review): the "- 2" is not explained here;
                             // presumably it places the iterator so that a
                             // match overlapping the previous one can still be
                             // found - confirm against the backward handlers.
3368                         ucol_setOffset(strsrch->textIter, search->matchedIndex + search->matchedLength - 2, status);
3369                     }
3370                 }
3371 #endif
3372
3373                 if (strsrch->search->isCanonicalMatch) {
3374                     // can't use exact here since extra accents are allowed.
3375                     usearch_handlePreviousCanonical(strsrch, status);
3376                     // status checked below
3377                 }
3378                 else {
3379                     usearch_handlePreviousExact(strsrch, status);
3380                     // status checked below
3381                 }
3382             }
3383
3384             if (U_FAILURE(*status)) {
3385                 return USEARCH_DONE;
3386             }
3387
3388             return search->matchedIndex;
3389         }
3390     }
3391     return USEARCH_DONE;
3392 }
3393
3394
3395
3396 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
3397 {
3398     /*
3399     reset is setting the attributes that are already in
3400     string search, hence all attributes in the collator should
3401     be retrieved without any problems
3402     */
3403     if (strsrch) {
3404         UErrorCode status            = U_ZERO_ERROR;
3405         UBool      sameCollAttribute = TRUE;
3406         uint32_t   ceMask;
3407         UBool      shift;
3408         uint32_t   varTop;
3409
3410         strsrch->strength    = ucol_getStrength(strsrch->collator);
3411         ceMask = getMask(strsrch->strength);
3412         if (strsrch->ceMask != ceMask) {
3413             strsrch->ceMask = ceMask;
3414             sameCollAttribute = FALSE;
3415         }
3416         // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3417         shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
3418                                   &status) == UCOL_SHIFTED;
3419         if (strsrch->toShift != shift) {
3420             strsrch->toShift  = shift;
3421             sameCollAttribute = FALSE;
3422         }
3423
3424         // if status is a failure, ucol_getVariableTop returns 0
3425         varTop = ucol_getVariableTop(strsrch->collator, &status);
3426         if (strsrch->variableTop != varTop) {
3427             strsrch->variableTop = varTop;
3428             sameCollAttribute    = FALSE;
3429         }
3430         if (!sameCollAttribute) {
3431             initialize(strsrch, &status);
3432         }
3433         /* free offset buffer to avoid memory leak before initializing. */
3434         freeOffsetBuffer(&(strsrch->textIter->iteratordata_));
3435         uprv_init_collIterate(strsrch->collator, strsrch->search->text,
3436                               strsrch->search->textLength,
3437                               &(strsrch->textIter->iteratordata_));
3438         strsrch->search->matchedLength      = 0;
3439         strsrch->search->matchedIndex       = USEARCH_DONE;
3440         strsrch->search->isOverlap          = FALSE;
3441         strsrch->search->isCanonicalMatch   = FALSE;
3442         strsrch->search->isForwardSearching = TRUE;
3443         strsrch->search->reset              = TRUE;
3444     }
3445 }
3446
//
//  CEI  One collation element together with the source text index pair
//       reported for it by the collation element iterator.
//       These structs are the entries of the circular CEBuffer.
//
struct CEI {
    int64_t ce;         // the (processed) collation element
    int32_t lowIndex;   // low source text index reported for this CE
    int32_t highIndex;  // high source text index reported for this CE
};
3456
3457 U_NAMESPACE_BEGIN
3458
3459
3460 //
3461 //  CEBuffer   A circular buffer of CEs from the text being searched.
3462 //
3463 #define   DEFAULT_CEBUFFER_SIZE 50
3464 struct CEBuffer {
3465     CEI                  defBuf[DEFAULT_CEBUFFER_SIZE];
3466     CEI                 *buf;
3467     int32_t              bufSize;
3468     int32_t              firstIx;
3469     int32_t              limitIx;
3470     UCollationElements  *ceIter;
3471     UStringSearch       *strSearch;
3472
3473
3474
3475                CEBuffer(UStringSearch *ss, UErrorCode *status);
3476                ~CEBuffer();
3477    const CEI   *get(int32_t index);
3478    const CEI   *getPrevious(int32_t index);
3479 };
3480
3481
3482 CEBuffer::CEBuffer(UStringSearch *ss, UErrorCode *status) {
3483     buf = defBuf;
3484     strSearch = ss;
3485     bufSize = ss->pattern.CELength+10;
3486     ceIter    = ss->textIter;
3487     firstIx = 0;
3488     limitIx = 0;
3489
3490     uprv_init_pce(ceIter);
3491
3492     if (bufSize>DEFAULT_CEBUFFER_SIZE) {
3493         buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
3494         if (buf == NULL) {
3495             *status = U_MEMORY_ALLOCATION_ERROR;
3496         }
3497     }
3498 }
3499
3500 // TODO: add a reset or init function so that allocated
3501 //       buffers can be retained & reused.
3502
3503 CEBuffer::~CEBuffer() {
3504     if (buf != defBuf) {
3505         uprv_free(buf);
3506     }
3507 }
3508
3509
3510 // Get the CE with the specified index.
3511 //   Index must be in the range
3512 //          n-history_size < index < n+1
3513 //   where n is the largest index to have been fetched by some previous call to this function.
3514 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3515 //
3516 const CEI *CEBuffer::get(int32_t index) {
3517     int i = index % bufSize;
3518
3519     if (index>=firstIx && index<limitIx) {
3520         // The request was for an entry already in our buffer.
3521         //  Just return it.
3522         return &buf[i];
3523     }
3524
3525     // Caller is requesting a new, never accessed before, CE.
3526     //   Verify that it is the next one in sequence, which is all
3527     //   that is allowed.
3528     if (index != limitIx) {
3529         U_ASSERT(FALSE);
3530
3531         return NULL;
3532     }
3533
3534     // Manage the circular CE buffer indexing
3535     limitIx++;
3536
3537     if (limitIx - firstIx >= bufSize) {
3538         // The buffer is full, knock out the lowest-indexed entry.
3539         firstIx++;
3540     }
3541
3542     UErrorCode status = U_ZERO_ERROR;
3543
3544     buf[i].ce = ucol_nextProcessed(ceIter, &buf[i].lowIndex, &buf[i].highIndex, &status);
3545
3546     return &buf[i];
3547 }
3548
3549 // Get the CE with the specified index.
3550 //   Index must be in the range
3551 //          n-history_size < index < n+1
3552 //   where n is the largest index to have been fetched by some previous call to this function.
3553 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3554 //
3555 const CEI *CEBuffer::getPrevious(int32_t index) {
3556     int i = index % bufSize;
3557
3558     if (index>=firstIx && index<limitIx) {
3559         // The request was for an entry already in our buffer.
3560         //  Just return it.
3561         return &buf[i];
3562     }
3563
3564     // Caller is requesting a new, never accessed before, CE.
3565     //   Verify that it is the next one in sequence, which is all
3566     //   that is allowed.
3567     if (index != limitIx) {
3568         U_ASSERT(FALSE);
3569
3570         return NULL;
3571     }
3572
3573     // Manage the circular CE buffer indexing
3574     limitIx++;
3575
3576     if (limitIx - firstIx >= bufSize) {
3577         // The buffer is full, knock out the lowest-indexed entry.
3578         firstIx++;
3579     }
3580
3581     UErrorCode status = U_ZERO_ERROR;
3582
3583     buf[i].ce = ucol_previousProcessed(ceIter, &buf[i].lowIndex, &buf[i].highIndex, &status);
3584
3585     return &buf[i];
3586 }
3587
3588 U_NAMESPACE_END
3589
3590
3591 // #define USEARCH_DEBUG
3592
3593 #ifdef USEARCH_DEBUG
3594 #include <stdio.h>
3595 #include <stdlib.h>
3596 #endif
3597
3598 /*
3599  * Find the next break boundary after startIndex. If the UStringSearch object
3600  * has an external break iterator, use that. Otherwise use the internal character
3601  * break iterator.
      *
      * Only one of the three preprocessor branches below is compiled:
      *   - "#if 0":  disabled hand-written scan over grapheme cluster break
      *               properties, kept for reference;
      *   - "#elif":  the break iterator based version;
      *   - "#else":  no break iteration available, startIndex is returned
      *               unchanged.
      * When neither break iterator is set, startIndex is returned unchanged
      * as well.
3602  */
3603 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
3604 #if 0
3605     const UChar *text = strsrch->search->text;
3606     int32_t textLen   = strsrch->search->textLength;
3607
3608     U_ASSERT(startIndex>=0);
3609     U_ASSERT(startIndex<=textLen);
3610
3611     if (startIndex >= textLen) {
3612         return startIndex;
3613     }
3614
3615     UChar32  c;
3616     int32_t  i = startIndex;
3617     U16_NEXT(text, i, textLen, c);
3618
3619     // If we are on a control character, stop without looking for combining marks.
3620     //    Control characters do not combine.
3621     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3622     if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
3623         return i;
3624     }
3625
3626     // The initial character was not a control, and can thus accept trailing
3627     //   combining characters.  Advance over however many of them there are.
3628     int32_t  indexOfLastCharChecked;
3629     for (;;) {
3630         indexOfLastCharChecked = i;
3631         if (i>=textLen) {
3632             break;
3633         }
3634         U16_NEXT(text, i, textLen, c);
3635         gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3636         if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3637             break;
3638         }
3639     }
3640     return indexOfLastCharChecked;
3641 #elif !UCONFIG_NO_BREAK_ITERATION
         // Prefer the caller supplied iterator, fall back to the internal one.
3642     UBreakIterator *breakiterator = strsrch->search->breakIter;
3643
3644     if (breakiterator == NULL) {
3645         breakiterator = strsrch->search->internalBreakIter;
3646     }
3647
3648     if (breakiterator != NULL) {
3649         return ubrk_following(breakiterator, startIndex);
3650     }
3651
         // No iterator at all: leave the index where it is.
3652     return startIndex;
3653 #else
3654     // **** or should we use the original code? ****
3655     return startIndex;
3656 #endif
3657
3658 }
3659
3660 /*
3661  * NOTE: despite its name, this helper returns TRUE when index is NOT on a
3662  * break boundary. The compiled ("#elif") branch returns TRUE only if a
3663  * break iterator is available (the external one, else the internal
      * character break iterator) and ubrk_isBoundary() reports that index is
      * not a boundary; with no iterator at all it returns FALSE.
      *
      * The disabled "#if 0" branch computed the same kind of answer by hand
      * (TRUE when index lies inside a combining sequence), and the "#else"
      * branch used without break iteration always returns FALSE.
      *
      * NOTE(review): callers are outside this part of the file; confirm they
      * treat a TRUE result as "reject this position".
3664  */
3665 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
3666 #if 0
3667     const UChar *text = strsrch->search->text;
3668     int32_t textLen   = strsrch->search->textLength;
3669
3670     U_ASSERT(index>=0);
3671     U_ASSERT(index<=textLen);
3672
3673     if (index>=textLen || index<=0) {
3674         return FALSE;
3675     }
3676
3677     // If the character at the current index is not a GRAPHEME_EXTEND
3678     //    then we can not be within a combining sequence.
3679     UChar32  c;
3680     U16_GET(text, 0, index, textLen, c);
3681     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3682     if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3683         return FALSE;
3684     }
3685
3686     // We are at a combining mark.  If the preceding character is anything
3687     //   except a CONTROL, CR or LF, we are in a combining sequence.
3688     U16_PREV(text, 0, index, c);
3689     gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3690     UBool combining =  !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
3691     return combining;
3692 #elif !UCONFIG_NO_BREAK_ITERATION
         // Prefer the caller supplied iterator, fall back to the internal one.
3693     UBreakIterator *breakiterator = strsrch->search->breakIter;
3694
3695     if (breakiterator == NULL) {
3696         breakiterator = strsrch->search->internalBreakIter;
3697     }
3698
         // Note the negation: TRUE means "index is not a boundary".
3699     return (breakiterator != NULL && ! ubrk_isBoundary(breakiterator, index));
3700 #else
3701     // **** or use the original code? ****
3702     return FALSE;
3703 #endif
3704 }
3705
3706 #if 0
3707 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
3708 {
3709 #if !UCONFIG_NO_BREAK_ITERATION
3710     UBreakIterator *breakiterator = strsrch->search->breakIter;
3711
3712     if (breakiterator != NULL) {
3713         int32_t startindex = ubrk_first(breakiterator);
3714         int32_t endindex   = ubrk_last(breakiterator);
3715
3716         // out-of-range indexes are never boundary positions
3717         if (start < startindex || start > endindex ||
3718             end < startindex || end > endindex) {
3719             return FALSE;
3720         }
3721
3722         return ubrk_isBoundary(breakiterator, start) &&
3723                ubrk_isBoundary(breakiterator, end);
3724     }
3725 #endif
3726
3727     return TRUE;
3728 }
3729 #endif
3730
3731
3732 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
3733                                        int32_t        startIdx,
3734                                        int32_t        *matchStart,
3735                                        int32_t        *matchLimit,
3736                                        UErrorCode     *status)
3737 {
3738     if (U_FAILURE(*status)) {
3739         return FALSE;
3740     }
3741
3742     // TODO:  reject search patterns beginning with a combining char.
3743
3744 #ifdef USEARCH_DEBUG
3745     if (getenv("USEARCH_DEBUG") != NULL) {
3746         printf("Pattern CEs\n");
3747         for (int ii=0; ii<strsrch->pattern.CELength; ii++) {
3748             printf(" %8x", strsrch->pattern.CE[ii]);
3749         }
3750         printf("\n");
3751     }
3752
3753 #endif
3754     // Input parameter sanity check.
3755     //  TODO:  should input indicies clip to the text length
3756     //         in the same way that UText does.
3757     if(strsrch->pattern.CELength == 0         ||
3758        startIdx < 0                           ||
3759        startIdx > strsrch->search->textLength ||
3760        strsrch->pattern.CE == NULL) {
3761            *status = U_ILLEGAL_ARGUMENT_ERROR;
3762            return FALSE;
3763     }
3764
3765     if (strsrch->pattern.PCE == NULL) {
3766         initializePatternPCETable(strsrch, status);
3767     }
3768
3769     ucol_setOffset(strsrch->textIter, startIdx, status);
3770     CEBuffer ceb(strsrch, status);
3771
3772
3773     int32_t    targetIx = 0;
3774     const CEI *targetCEI;
3775     int32_t    patIx;
3776     UBool      found;
3777
3778     int32_t  mStart = -1;
3779     int32_t  mLimit = -1;
3780     int32_t  minLimit;
3781     int32_t  maxLimit;
3782
3783
3784
3785     // Outer loop moves over match starting positions in the
3786     //      target CE space.
3787     for(targetIx=0; ; targetIx++)
3788     {
3789         found = TRUE;
3790         //  Inner loop checks for a match beginning at each
3791         //  position from the outer loop.
3792         for (patIx=0; patIx<strsrch->pattern.CELength; patIx++) {
3793             int64_t patCE = strsrch->pattern.PCE[patIx];
3794             targetCEI = ceb.get(targetIx+patIx);
3795             //  Compare CE from target string with CE from the pattern.
3796             //    Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
3797             //    which will fail the compare, below.
3798             if (targetCEI->ce != patCE) {
3799                 found = FALSE;
3800                 break;
3801             }
3802         }
3803
3804         if (!found && targetCEI->ce != UCOL_PROCESSED_NULLORDER) {
3805             // No match at this targetIx.  Try again at the next.
3806             continue;
3807         }
3808
3809         if (!found) {
3810             // No match at all, we have run off the end of the target text.
3811             break;
3812         }
3813
3814
3815         // We have found a match in CE space.
3816         // Now determine the bounds in string index space.
3817         //  There still is a chance of match failure if the CE range not correspond to
3818         //     an acceptable character range.
3819         //
3820         const CEI *firstCEI = ceb.get(targetIx);
3821         const CEI *lastCEI  = ceb.get(targetIx + strsrch->pattern.CELength - 1);
3822         const CEI *nextCEI  = ceb.get(targetIx + strsrch->pattern.CELength);
3823
3824      // targetCEI = ceb.get(targetIx+strsrch->pattern.CELength);
3825      // maxLimit = targetCEI->lowIndex;
3826         mStart   = firstCEI->lowIndex;
3827         minLimit = lastCEI->lowIndex;
3828         maxLimit = nextCEI->lowIndex;
3829
3830         // Look at the CE following the match.  If it is UCOL_NULLORDER the match
3831         //   extended to the end of input, and the match is good.
3832
3833         // Look at the high and low indices of the CE following the match. If
3834         // they are the same it means one of two things:
3835         //    1. The match extended to the last CE from the target text, which is OK, or
3836         //    2. The last CE that was part of the match is in an expansion that extends
3837         //       to the first CE after the match. In this case, we reject the match.
3838         if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
3839             found = FALSE;
3840         }
3841
3842
3843         // Check for the start of the match being within a combining sequence.
3844         //   This can happen if the pattern itself begins with a combining char, and
3845         //   the match found combining marks in the target text that were attached
3846         //    to something else.
3847         //   This type of match should be rejected for not completely consuming a
3848         //   combining sequence.
3849         if (isBreakBoundary(strsrch, mStart)) {
3850             found = FALSE;
3851         }
3852
3853         // Check for the start of the match being within an Collation Element Expansion,
3854         //   meaning that the first char of the match is only partially matched.
3855         //   With exapnsions, the first CE will report the index of the source
3856         //   character, and all subsequent (expansions) CEs will report the source index of the
3857         //    _following_ character.
3858         int32_t secondIx = firstCEI->highIndex;
3859         if (mStart == secondIx) {
3860             found = FALSE;
3861         }
3862
3863         //  Advance the match end position to the first acceptable match boundary.
3864         //    This advances the index over any combining charcters.
3865         mLimit = maxLimit;
3866         if (minLimit < maxLimit) {
3867             int32_t nba = nextBoundaryAfter(strsrch, minLimit);
3868
3869             if (nba >= lastCEI->highIndex) {
3870                 mLimit = nba;
3871             }
3872         }
3873
3874     #ifdef USEARCH_DEBUG
3875         if (getenv("USEARCH_DEBUG") != NULL) {
3876             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
3877         }
3878     #endif
3879
3880         // If advancing to the end of a combining sequence in character indexing space
3881         //   advanced us beyond the end of the match in CE space, reject this match.
3882         if (mLimit > maxLimit) {
3883             found = FALSE;
3884         }
3885
3886         if (isBreakBoundary(strsrch, mLimit)) {
3887             found = FALSE;
3888         }
3889
3890         if (found) {
3891             break;
3892         }
3893     }
3894
3895     #ifdef USEARCH_DEBUG
3896     if (getenv("USEARCH_DEBUG") != NULL) {
3897         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
3898         int32_t  lastToPrint = ceb.limitIx+2;
3899         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
3900             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
3901         }
3902         printf("\n%s\n", found? "match found" : "no match");
3903     }
3904     #endif
3905
3906     // All Done.  Store back the match bounds to the caller.
3907     //
3908     if (found==FALSE) {
3909         mLimit = -1;
3910         mStart = -1;
3911     }
3912
3913     if (matchStart != NULL) {
3914         *matchStart= mStart;
3915     }
3916
3917     if (matchLimit != NULL) {
3918         *matchLimit = mLimit;
3919     }
3920
3921     return found;
3922 }
3923
3924
3925 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch  *strsrch,
3926                                                 int32_t        startIdx,
3927                                                 int32_t        *matchStart,
3928                                                 int32_t        *matchLimit,
3929                                                 UErrorCode     *status)
3930 {
3931     if (U_FAILURE(*status)) {
3932         return FALSE;
3933     }
3934
3935     // TODO:  reject search patterns beginning with a combining char.
3936
3937 #ifdef USEARCH_DEBUG
3938     if (getenv("USEARCH_DEBUG") != NULL) {
3939         printf("Pattern CEs\n");
3940         for (int ii=0; ii<strsrch->pattern.CELength; ii++) {
3941             printf(" %8x", strsrch->pattern.CE[ii]);
3942         }
3943         printf("\n");
3944     }
3945
3946 #endif
3947     // Input parameter sanity check.
3948     //  TODO:  should input indicies clip to the text length
3949     //         in the same way that UText does.
3950     if(strsrch->pattern.CELength == 0         ||
3951        startIdx < 0                           ||
3952        startIdx > strsrch->search->textLength ||
3953        strsrch->pattern.CE == NULL) {
3954            *status = U_ILLEGAL_ARGUMENT_ERROR;
3955            return FALSE;
3956     }
3957
3958     if (strsrch->pattern.PCE == NULL) {
3959         initializePatternPCETable(strsrch, status);
3960     }
3961
3962     CEBuffer ceb(strsrch, status);
3963     int32_t    targetIx = 0;
3964
3965     /*
3966      * Pre-load the buffer with the CE's for the grapheme
3967      * after our starting position so that we're sure that
3968      * we can look at the CE following the match when we
3969      * check the match boundaries.
3970      *
3971      * This will also pre-fetch the first CE that we'll
3972      * consider for the match.
3973      */
3974     if (startIdx < strsrch->search->textLength) {
3975         UBreakIterator *bi = strsrch->search->internalBreakIter;
3976         int32_t next = ubrk_following(bi, startIdx);
3977
3978         ucol_setOffset(strsrch->textIter, next, status);
3979
3980         for (targetIx = 0; ; targetIx += 1) {
3981             if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
3982                 break;
3983             }
3984         }
3985     } else {
3986         ucol_setOffset(strsrch->textIter, startIdx, status);
3987     }
3988
3989
3990    const CEI  *targetCEI;
3991     int32_t    patIx;
3992     UBool      found;
3993
3994     int32_t  limitIx = targetIx;
3995     int32_t  mStart = -1;
3996     int32_t  mLimit = -1;
3997     int32_t  minLimit;
3998     int32_t  maxLimit;
3999
4000
4001
4002     // Outer loop moves over match starting positions in the
4003     //      target CE space.
4004     for(targetIx = limitIx; ; targetIx += 1)
4005     {
4006         found = TRUE;
4007         //  Inner loop checks for a match beginning at each
4008         //  position from the outer loop.
4009         for (patIx = strsrch->pattern.CELength - 1; patIx >= 0; patIx -= 1) {
4010             int64_t patCE = strsrch->pattern.PCE[patIx];
4011
4012             targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.CELength - 1 - patIx);
4013             //  Compare CE from target string with CE from the pattern.
4014             //    Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
4015             //    which will fail the compare, below.
4016             if (targetCEI->ce != patCE) {
4017                 found = FALSE;
4018                 break;
4019             }
4020         }
4021
4022         if (!found && targetCEI->ce != UCOL_PROCESSED_NULLORDER) {
4023             // No match at this targetIx.  Try again at the next.
4024             continue;
4025         }
4026
4027         if (!found) {
4028             // No match at all, we have run off the end of the target text.
4029             break;
4030         }
4031
4032
4033         // We have found a match in CE space.
4034         // Now determine the bounds in string index space.
4035         //  There still is a chance of match failure if the CE range not correspond to
4036         //     an acceptable character range.
4037         //
4038         const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.CELength - 1);
4039         const CEI *lastCEI  = ceb.getPrevious(targetIx);
4040         const CEI *nextCEI  = targetIx > 0? ceb.getPrevious(targetIx - 1) : NULL;
4041
4042         mStart   = firstCEI->lowIndex;
4043         minLimit = lastCEI->lowIndex;
4044         maxLimit = targetIx > 0? nextCEI->lowIndex : lastCEI->highIndex;
4045
4046         // Look at the CE following the match.  If it is UCOL_NULLORDER the match
4047         //   extended to the end of input, and the match is good.
4048
4049         // Look at the high and low indices of the CE following the match. If
4050         // they are the same it means one of two things:
4051         //    1. The match extended to the last CE from the target text, which is OK, or
4052         //    2. The last CE that was part of the match is in an expansion that extends
4053         //       to the first CE after the match. In this case, we reject the match.
4054         if (targetIx >= 1) {
4055             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
4056                 found = FALSE;
4057             }
4058         }
4059
4060
4061         // Check for the start of the match being within a combining sequence.
4062         //   This can happen if the pattern itself begins with a combining char, and
4063         //   the match found combining marks in the target text that were attached
4064         //    to something else.
4065         //   This type of match should be rejected for not completely consuming a
4066         //   combining sequence.
4067         if (isBreakBoundary(strsrch, mStart)) {
4068             found = FALSE;
4069         }
4070
4071         // Look at the high index of the first CE in the match. If it's the same as the
4072         // low index, the first CE in the match is in the middle of an expansion.
4073         if (mStart == firstCEI->highIndex) {
4074             found = FALSE;
4075         }
4076
4077         //  Advance the match end position to the first acceptable match boundary.
4078         //    This advances the index over any combining charcters.
4079         mLimit = maxLimit;
4080         if (/*targetIx > 0 &&*/ minLimit < maxLimit) {
4081             int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4082
4083             if (nba >= lastCEI->highIndex) {
4084                 mLimit = nba;
4085             }
4086         }
4087
4088     #ifdef USEARCH_DEBUG
4089         if (getenv("USEARCH_DEBUG") != NULL) {
4090             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4091         }
4092     #endif
4093
4094         // If advancing to the end of a combining sequence in character indexing space
4095         //   advanced us beyond the end of the match in CE space, reject this match.
4096         if (mLimit > maxLimit) {
4097             found = FALSE;
4098         }
4099
4100         // Make sure the end of the match is on a break boundary
4101         if (isBreakBoundary(strsrch, mLimit)) {
4102             found = FALSE;
4103         }
4104
4105         if (found) {
4106             break;
4107         }
4108     }
4109
4110     #ifdef USEARCH_DEBUG
4111     if (getenv("USEARCH_DEBUG") != NULL) {
4112         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4113         int32_t  lastToPrint = ceb.limitIx+2;
4114         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4115             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4116         }
4117         printf("\n%s\n", found? "match found" : "no match");
4118     }
4119     #endif
4120
4121     // All Done.  Store back the match bounds to the caller.
4122     //
4123     if (found==FALSE) {
4124         mLimit = -1;
4125         mStart = -1;
4126     }
4127
4128     if (matchStart != NULL) {
4129         *matchStart= mStart;
4130     }
4131
4132     if (matchLimit != NULL) {
4133         *matchLimit = mLimit;
4134     }
4135
4136     return found;
4137 }
4138
4139
4140
4141
4142 // internal use methods declared in usrchimp.h -----------------------------
4143
4144 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
4145 {
4146     if (U_FAILURE(*status)) {
4147         setMatchNotFound(strsrch);
4148         return FALSE;
4149     }
4150
4151 #if BOYER_MOORE
4152     UCollationElements *coleiter        = strsrch->textIter;
4153     int32_t             textlength      = strsrch->search->textLength;
4154     int32_t            *patternce       = strsrch->pattern.CE;
4155     int32_t             patterncelength = strsrch->pattern.CELength;
4156     int32_t             textoffset      = ucol_getOffset(coleiter);
4157
4158     // status used in setting coleiter offset, since offset is checked in
4159     // shiftForward before setting the coleiter offset, status never
4160     // a failure
4161     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4162                               patterncelength);
4163     while (textoffset <= textlength)
4164     {
4165         uint32_t    patternceindex = patterncelength - 1;
4166         int32_t     targetce;
4167         UBool       found          = FALSE;
4168         int32_t    lastce          = UCOL_NULLORDER;
4169
4170         setColEIterOffset(coleiter, textoffset);
4171
4172         for (;;) {
4173             // finding the last pattern ce match, imagine composite characters
4174             // for example: search for pattern A in text \u00C0
4175             // we'll have to skip \u0300 the grave first before we get to A
4176             targetce = ucol_previous(coleiter, status);
4177             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4178                 found = FALSE;
4179                 break;
4180             }
4181             targetce = getCE(strsrch, targetce);
4182             if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
4183                 // this is for the text \u0315\u0300 that requires
4184                 // normalization and pattern \u0300, where \u0315 is ignorable
4185                 continue;
4186             }
4187             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4188                 lastce = targetce;
4189             }
4190             if (targetce == patternce[patternceindex]) {
4191                 // the first ce can be a contraction
4192                 found = TRUE;
4193                 break;
4194             }
4195             if (!hasExpansion(coleiter)) {
4196                 found = FALSE;
4197                 break;
4198             }
4199         }
4200
4201         //targetce = lastce;
4202
4203         while (found && patternceindex > 0) {
4204                 lastce = targetce;
4205             targetce    = ucol_previous(coleiter, status);
4206             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4207                 found = FALSE;
4208                 break;
4209             }
4210             targetce    = getCE(strsrch, targetce);
4211             if (targetce == UCOL_IGNORABLE) {
4212                 continue;
4213             }
4214
4215             patternceindex --;
4216             found = found && targetce == patternce[patternceindex];
4217         }
4218
4219         targetce = lastce;
4220
4221         if (!found) {
4222             if (U_FAILURE(*status)) {
4223                 break;
4224             }
4225             textoffset = shiftForward(strsrch, textoffset, lastce,
4226                                       patternceindex);
4227             // status checked at loop.
4228             patternceindex = patterncelength;
4229             continue;
4230         }
4231
4232         if (checkNextExactMatch(strsrch, &textoffset, status)) {
4233             // status checked in ucol_setOffset
4234             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4235             return TRUE;
4236         }
4237     }
4238     setMatchNotFound(strsrch);
4239     return FALSE;
4240 #else
4241     int32_t textOffset = ucol_getOffset(strsrch->textIter);
4242     int32_t start = -1;
4243     int32_t end = -1;
4244
4245     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4246         strsrch->search->matchedIndex  = start;
4247         strsrch->search->matchedLength = end - start;
4248         return TRUE;
4249     } else {
4250         setMatchNotFound(strsrch);
4251         return FALSE;
4252     }
4253 #endif
4254 }
4255
4256 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
4257 {
4258     if (U_FAILURE(*status)) {
4259         setMatchNotFound(strsrch);
4260         return FALSE;
4261     }
4262
4263 #if BOYER_MOORE
4264     UCollationElements *coleiter        = strsrch->textIter;
4265     int32_t             textlength      = strsrch->search->textLength;
4266     int32_t            *patternce       = strsrch->pattern.CE;
4267     int32_t             patterncelength = strsrch->pattern.CELength;
4268     int32_t             textoffset      = ucol_getOffset(coleiter);
4269     UBool               hasPatternAccents =
4270        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4271
4272     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4273                               patterncelength);
4274     strsrch->canonicalPrefixAccents[0] = 0;
4275     strsrch->canonicalSuffixAccents[0] = 0;
4276
4277     while (textoffset <= textlength)
4278     {
4279         int32_t     patternceindex = patterncelength - 1;
4280         int32_t     targetce;
4281         UBool       found          = FALSE;
4282         int32_t     lastce         = UCOL_NULLORDER;
4283
4284         setColEIterOffset(coleiter, textoffset);
4285
4286         for (;;) {
4287             // finding the last pattern ce match, imagine composite characters
4288             // for example: search for pattern A in text \u00C0
4289             // we'll have to skip \u0300 the grave first before we get to A
4290             targetce = ucol_previous(coleiter, status);
4291             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4292                 found = FALSE;
4293                 break;
4294             }
4295             targetce = getCE(strsrch, targetce);
4296             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4297                 lastce = targetce;
4298             }
4299             if (targetce == patternce[patternceindex]) {
4300                 // the first ce can be a contraction
4301                 found = TRUE;
4302                 break;
4303             }
4304             if (!hasExpansion(coleiter)) {
4305                 found = FALSE;
4306                 break;
4307             }
4308         }
4309
4310         while (found && patternceindex > 0) {
4311             targetce    = ucol_previous(coleiter, status);
4312             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4313                 found = FALSE;
4314                 break;
4315             }
4316             targetce    = getCE(strsrch, targetce);
4317             if (targetce == UCOL_IGNORABLE) {
4318                 continue;
4319             }
4320
4321             patternceindex --;
4322             found = found && targetce == patternce[patternceindex];
4323         }
4324
4325         // initializing the rearranged accent array
4326         if (hasPatternAccents && !found) {
4327             strsrch->canonicalPrefixAccents[0] = 0;
4328             strsrch->canonicalSuffixAccents[0] = 0;
4329             if (U_FAILURE(*status)) {
4330                 break;
4331             }
4332             found = doNextCanonicalMatch(strsrch, textoffset, status);
4333         }
4334
4335         if (!found) {
4336             if (U_FAILURE(*status)) {
4337                 break;
4338             }
4339             textoffset = shiftForward(strsrch, textoffset, lastce,
4340                                       patternceindex);
4341             // status checked at loop
4342             patternceindex = patterncelength;
4343             continue;
4344         }
4345
4346         if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
4347             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4348             return TRUE;
4349         }
4350     }
4351     setMatchNotFound(strsrch);
4352     return FALSE;
4353 #else
4354     int32_t textOffset = ucol_getOffset(strsrch->textIter);
4355     int32_t start = -1;
4356     int32_t end = -1;
4357
4358     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4359         strsrch->search->matchedIndex  = start;
4360         strsrch->search->matchedLength = end - start;
4361         return TRUE;
4362     } else {
4363         setMatchNotFound(strsrch);
4364         return FALSE;
4365     }
4366 #endif
4367 }
4368
4369 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
4370 {
4371     if (U_FAILURE(*status)) {
4372         setMatchNotFound(strsrch);
4373         return FALSE;
4374     }
4375
4376 #if BOYER_MOORE
4377     UCollationElements *coleiter        = strsrch->textIter;
4378     int32_t            *patternce       = strsrch->pattern.CE;
4379     int32_t             patterncelength = strsrch->pattern.CELength;
4380     int32_t             textoffset      = ucol_getOffset(coleiter);
4381
4382     // shifting it check for setting offset
4383     // if setOffset is called previously or there was no previous match, we
4384     // leave the offset as it is.
4385     if (strsrch->search->matchedIndex != USEARCH_DONE) {
4386         textoffset = strsrch->search->matchedIndex;
4387     }
4388
4389     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4390                               patterncelength);
4391
4392     while (textoffset >= 0)
4393     {
4394         int32_t     patternceindex = 1;
4395         int32_t     targetce;
4396         UBool       found          = FALSE;
4397         int32_t     firstce        = UCOL_NULLORDER;
4398
4399         // if status is a failure, ucol_setOffset does nothing
4400         setColEIterOffset(coleiter, textoffset);
4401
4402         for (;;) {
4403             // finding the first pattern ce match, imagine composite
4404             // characters. for example: search for pattern \u0300 in text
4405             // \u00C0, we'll have to skip A first before we get to
4406             // \u0300 the grave accent
4407             targetce = ucol_next(coleiter, status);
4408             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4409                 found = FALSE;
4410                 break;
4411             }
4412             targetce = getCE(strsrch, targetce);
4413             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4414                 firstce = targetce;
4415             }
4416             if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
4417                 continue;
4418             }
4419             if (targetce == patternce[0]) {
4420                 found = TRUE;
4421                 break;
4422             }
4423             if (!hasExpansion(coleiter)) {
4424                 // checking for accents in composite character
4425                 found = FALSE;
4426                 break;
4427             }
4428         }
4429
4430         //targetce = firstce;
4431
4432         while (found && (patternceindex < patterncelength)) {
4433                 firstce = targetce;
4434             targetce    = ucol_next(coleiter, status);
4435             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4436                 found = FALSE;
4437                 break;
4438             }
4439             targetce    = getCE(strsrch, targetce);
4440             if (targetce == UCOL_IGNORABLE) {
4441                 continue;
4442             }
4443
4444             found = found && targetce == patternce[patternceindex];
4445             patternceindex ++;
4446         }
4447
4448         targetce = firstce;
4449
4450         if (!found) {
4451             if (U_FAILURE(*status)) {
4452                 break;
4453             }
4454
4455             textoffset = reverseShift(strsrch, textoffset, targetce,
4456                                       patternceindex);
4457             patternceindex = 0;
4458             continue;
4459         }
4460
4461         if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
4462             setColEIterOffset(coleiter, textoffset);
4463             return TRUE;
4464         }
4465     }
4466     setMatchNotFound(strsrch);
4467     return FALSE;
4468 #else
4469     int32_t textOffset = ucol_getOffset(strsrch->textIter);
4470     int32_t start = -1;
4471     int32_t end = -1;
4472
4473     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4474         strsrch->search->matchedIndex = start;
4475         strsrch->search->matchedLength = end - start;
4476         return TRUE;
4477     } else {
4478         setMatchNotFound(strsrch);
4479         return FALSE;
4480     }
4481 #endif
4482 }
4483
4484 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
4485                                       UErrorCode    *status)
4486 {
4487     if (U_FAILURE(*status)) {
4488         setMatchNotFound(strsrch);
4489         return FALSE;
4490     }
4491
4492 #if BOYER_MOORE
4493     UCollationElements *coleiter        = strsrch->textIter;
4494     int32_t            *patternce       = strsrch->pattern.CE;
4495     int32_t             patterncelength = strsrch->pattern.CELength;
4496     int32_t             textoffset      = ucol_getOffset(coleiter);
4497     UBool               hasPatternAccents =
4498        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4499
4500     // shifting it check for setting offset
4501     // if setOffset is called previously or there was no previous match, we
4502     // leave the offset as it is.
4503     if (strsrch->search->matchedIndex != USEARCH_DONE) {
4504         textoffset = strsrch->search->matchedIndex;
4505     }
4506
4507     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4508                               patterncelength);
4509     strsrch->canonicalPrefixAccents[0] = 0;
4510     strsrch->canonicalSuffixAccents[0] = 0;
4511
4512     while (textoffset >= 0)
4513     {
4514         int32_t     patternceindex = 1;
4515         int32_t     targetce;
4516         UBool       found          = FALSE;
4517         int32_t     firstce        = UCOL_NULLORDER;
4518
4519         setColEIterOffset(coleiter, textoffset);
4520         for (;;) {
4521             // finding the first pattern ce match, imagine composite
4522             // characters. for example: search for pattern \u0300 in text
4523             // \u00C0, we'll have to skip A first before we get to
4524             // \u0300 the grave accent
4525             targetce = ucol_next(coleiter, status);
4526             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4527                 found = FALSE;
4528                 break;
4529             }
4530             targetce = getCE(strsrch, targetce);
4531             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4532                 firstce = targetce;
4533             }
4534
4535             if (targetce == patternce[0]) {
4536                 // the first ce can be a contraction
4537                 found = TRUE;
4538                 break;
4539             }
4540             if (!hasExpansion(coleiter)) {
4541                 // checking for accents in composite character
4542                 found = FALSE;
4543                 break;
4544             }
4545         }
4546
4547         targetce = firstce;
4548
4549         while (found && patternceindex < patterncelength) {
4550             targetce    = ucol_next(coleiter, status);
4551             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4552                 found = FALSE;
4553                 break;
4554             }
4555             targetce = getCE(strsrch, targetce);
4556             if (targetce == UCOL_IGNORABLE) {
4557                 continue;
4558             }
4559
4560             found = found && targetce == patternce[patternceindex];
4561             patternceindex ++;
4562         }
4563
4564         // initializing the rearranged accent array
4565         if (hasPatternAccents && !found) {
4566             strsrch->canonicalPrefixAccents[0] = 0;
4567             strsrch->canonicalSuffixAccents[0] = 0;
4568             if (U_FAILURE(*status)) {
4569                 break;
4570             }
4571             found = doPreviousCanonicalMatch(strsrch, textoffset, status);
4572         }
4573
4574         if (!found) {
4575             if (U_FAILURE(*status)) {
4576                 break;
4577             }
4578             textoffset = reverseShift(strsrch, textoffset, targetce,
4579                                       patternceindex);
4580             patternceindex = 0;
4581             continue;
4582         }
4583
4584         if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
4585             setColEIterOffset(coleiter, textoffset);
4586             return TRUE;
4587         }
4588     }
4589     setMatchNotFound(strsrch);
4590     return FALSE;
4591 #else
4592     int32_t textOffset = ucol_getOffset(strsrch->textIter);
4593     int32_t start = -1;
4594     int32_t end = -1;
4595
4596     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4597         strsrch->search->matchedIndex = start;
4598         strsrch->search->matchedLength = end - start;
4599         return TRUE;
4600     } else {
4601         setMatchNotFound(strsrch);
4602         return FALSE;
4603     }
4604 #endif
4605 }
4606
#endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */