icuSources/i18n/usearch.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2001-2015 IBM and others. All rights reserved.
   4 **********************************************************************
   5 *   Date        Name        Description
   6 *  07/02/2001   synwee      Creation.
   7 **********************************************************************
   8 */
   9
  10 #include "unicode/utypes.h"
  11
  12 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
  13
  14 #include "unicode/usearch.h"
  15 #include "unicode/ustring.h"
  16 #include "unicode/uchar.h"
  17 #include "unicode/utf16.h"
  18 #include "normalizer2impl.h"
  19 #include "usrchimp.h"
  20 #include "cmemory.h"
  21 #include "ucln_in.h"
  22 #include "uassert.h"
  23 #include "ustr_imp.h"
  24
  25 U_NAMESPACE_USE
  26
  27 // don't use Boyer-Moore
  28 // (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
  29 #define BOYER_MOORE 0
  30
  31 // internal definition ---------------------------------------------------
  32
  33 #define LAST_BYTE_MASK_          0xFF
  34 #define SECOND_LAST_BYTE_SHIFT_  8
  35 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
  36
  37 static const Normalizer2Impl *g_nfcImpl = NULL;
  38
  39 // internal methods -------------------------------------------------
  40
  41 /**
  42 * Fast collation element iterator setOffset.
  43 * This function does not check for bounds.
  44 * @param coleiter collation element iterator
  45 * @param offset to set
  46 */
  47 static
  48 inline void setColEIterOffset(UCollationElements *elems,
  49                       int32_t             offset)
  50 {
  51     // Note: Not "fast" any more after the 2013 collation rewrite.
  52     // We do not want to expose more internals than necessary.
  53     UErrorCode status = U_ZERO_ERROR;
  54     ucol_setOffset(elems, offset, &status);
  55 }
  56
  57 /**
  58 * Getting the mask for collation strength
  59 * @param strength collation strength
  60 * @return collation element mask
  61 */
  62 static
  63 inline uint32_t getMask(UCollationStrength strength)
  64 {
  65     switch (strength)
  66     {
  67     case UCOL_PRIMARY:
  68         return UCOL_PRIMARYORDERMASK;
  69     case UCOL_SECONDARY:
  70         return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
  71     default:
  72         return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
  73                UCOL_PRIMARYORDERMASK;
  74     }
  75 }
  76
  77 /**
  78 * This is to squeeze the 21bit ces into a 256 table
  79 * @param ce collation element
  80 * @return collapsed version of the collation element
  81 */
  82 static
  83 inline int hash(uint32_t ce)
  84 {
  85     // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
  86     // well with the new collation where most of the latin 1 characters
  87     // are of the value xx000xxx. their hashes will most of the time be 0
  88     // to be discussed on the hash algo.
  89     return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_;
  90 }
  91
  92 U_CDECL_BEGIN
  93 static UBool U_CALLCONV
  94 usearch_cleanup(void) {
  95     g_nfcImpl = NULL;
  96     return TRUE;
  97 }
  98 U_CDECL_END
  99
 100 /**
 101 * Initializing the fcd tables.
 102 * Internal method, status assumed to be a success.
 103 * @param status output error if any, caller to check status before calling
 104 *               method, status assumed to be success when passed in.
 105 */
 106 static
 107 inline void initializeFCD(UErrorCode *status)
 108 {
 109     if (g_nfcImpl == NULL) {
 110         g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
 111         ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
 112     }
 113 }
 114
 115 /**
 116 * Gets the fcd value for a character at the argument index.
 117 * This method takes into accounts of the supplementary characters.
 118 * @param str UTF16 string where character for fcd retrieval resides
 119 * @param offset position of the character whose fcd is to be retrieved, to be
 120 *               overwritten with the next character position, taking
 121 *               surrogate characters into consideration.
 122 * @param strlength length of the argument string
 123 * @return fcd value
 124 */
 125 static
 126 uint16_t getFCD(const UChar   *str, int32_t *offset,
 127                              int32_t  strlength)
 128 {
 129     const UChar *temp = str + *offset;
 130     uint16_t    result = g_nfcImpl->nextFCD16(temp, str + strlength);
 131     *offset = (int32_t)(temp - str);
 132     return result;
 133 }
 134
 135 /**
 136 * Getting the modified collation elements taking into account the collation
 137 * attributes
 138 * @param strsrch string search data
 139 * @param sourcece
 140 * @return the modified collation element
 141 */
 142 static
 143 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
 144 {
 145     // note for tertiary we can't use the collator->tertiaryMask, that
 146     // is a preprocessed mask that takes into account case options. since
 147     // we are only concerned with exact matches, we don't need that.
 148     sourcece &= strsrch->ceMask;
 149
 150     if (strsrch->toShift) {
 151         // alternate handling here, since only the 16 most significant digits
 152         // is only used, we can safely do a compare without masking
 153         // if the ce is a variable, we mask and get only the primary values
 154         // no shifting to quartenary is required since all primary values
 155         // less than variabletop will need to be masked off anyway.
 156         if (strsrch->variableTop > sourcece) {
 157             if (strsrch->strength >= UCOL_QUATERNARY) {
 158                 sourcece &= UCOL_PRIMARYORDERMASK;
 159             }
 160             else {
 161                 sourcece = UCOL_IGNORABLE;
 162             }
 163         }
 164     } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
 165         sourcece = 0xFFFF;
 166     }
 167
 168     return sourcece;
 169 }
 170
 171 /**
 172 * Allocate a memory and returns NULL if it failed.
 173 * Internal method, status assumed to be a success.
 174 * @param size to allocate
 175 * @param status output error if any, caller to check status before calling
 176 *               method, status assumed to be success when passed in.
 177 * @return newly allocated array, NULL otherwise
 178 */
 179 static
 180 inline void * allocateMemory(uint32_t size, UErrorCode *status)
 181 {
 182     uint32_t *result = (uint32_t *)uprv_malloc(size);
 183     if (result == NULL) {
 184         *status = U_MEMORY_ALLOCATION_ERROR;
 185     }
 186     return result;
 187 }
 188
 189 /**
 190 * Adds a uint32_t value to a destination array.
 191 * Creates a new array if we run out of space. The caller will have to
 192 * manually deallocate the newly allocated array.
 193 * Internal method, status assumed to be success, caller has to check status
 194 * before calling this method. destination not to be NULL and has at least
 195 * size destinationlength.
 196 * @param destination target array
 197 * @param offset destination offset to add value
 198 * @param destinationlength target array size, return value for the new size
 199 * @param value to be added
 200 * @param increments incremental size expected
 201 * @param status output error if any, caller to check status before calling
 202 *               method, status assumed to be success when passed in.
 203 * @return new destination array, destination if there was no new allocation
 204 */
 205 static
 206 inline int32_t * addTouint32_tArray(int32_t    *destination,
 207                                     uint32_t    offset,
 208                                     uint32_t   *destinationlength,
 209                                     uint32_t    value,
 210                                     uint32_t    increments,
 211                                     UErrorCode *status)
 212 {
 213     uint32_t newlength = *destinationlength;
 214     if (offset + 1 == newlength) {
 215         newlength += increments;
 216         int32_t *temp = (int32_t *)allocateMemory(
 217                                          sizeof(int32_t) * newlength, status);
 218         if (U_FAILURE(*status)) {
 219             return NULL;
 220         }
 221         uprv_memcpy(temp, destination, sizeof(int32_t) * offset);
 222         *destinationlength = newlength;
 223         destination        = temp;
 224     }
 225     destination[offset] = value;
 226     return destination;
 227 }
 228
 229 /**
 230 * Adds a uint64_t value to a destination array.
 231 * Creates a new array if we run out of space. The caller will have to
 232 * manually deallocate the newly allocated array.
 233 * Internal method, status assumed to be success, caller has to check status
 234 * before calling this method. destination not to be NULL and has at least
 235 * size destinationlength.
 236 * @param destination target array
 237 * @param offset destination offset to add value
 238 * @param destinationlength target array size, return value for the new size
 239 * @param value to be added
 240 * @param increments incremental size expected
 241 * @param status output error if any, caller to check status before calling
 242 *               method, status assumed to be success when passed in.
 243 * @return new destination array, destination if there was no new allocation
 244 */
 245 static
 246 inline int64_t * addTouint64_tArray(int64_t    *destination,
 247                                     uint32_t    offset,
 248                                     uint32_t   *destinationlength,
 249                                     uint64_t    value,
 250                                     uint32_t    increments,
 251                                     UErrorCode *status)
 252 {
 253     uint32_t newlength = *destinationlength;
 254     if (offset + 1 == newlength) {
 255         newlength += increments;
 256         int64_t *temp = (int64_t *)allocateMemory(
 257                                          sizeof(int64_t) * newlength, status);
 258
 259         if (U_FAILURE(*status)) {
 260             return NULL;
 261         }
 262
 263         uprv_memcpy(temp, destination, sizeof(int64_t) * offset);
 264         *destinationlength = newlength;
 265         destination        = temp;
 266     }
 267
 268     destination[offset] = value;
 269
 270     return destination;
 271 }
 272
 273 /**
 274 * Initializing the ce table for a pattern.
 275 * Stores non-ignorable collation keys.
 276 * Table size will be estimated by the size of the pattern text. Table
 277 * expansion will be perform as we go along. Adding 1 to ensure that the table
 278 * size definitely increases.
 279 * Internal method, status assumed to be a success.
 280 * @param strsrch string search data
 281 * @param status output error if any, caller to check status before calling
 282 *               method, status assumed to be success when passed in.
 283 * @return total number of expansions
 284 */
 285 static
 286 inline uint16_t initializePatternCETable(UStringSearch *strsrch,
 287                                          UErrorCode    *status)
 288 {
 289     UPattern *pattern            = &(strsrch->pattern);
 290     uint32_t  cetablesize        = INITIAL_ARRAY_SIZE_;
 291     int32_t  *cetable            = pattern->cesBuffer;
 292     uint32_t  patternlength      = pattern->textLength;
 293     UCollationElements *coleiter = strsrch->utilIter;
 294
 295     if (coleiter == NULL) {
 296         coleiter = ucol_openElements(strsrch->collator, pattern->text,
 297                                      patternlength, status);
 298         // status will be checked in ucol_next(..) later and if it is an
 299         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
 300         // returned.
 301         strsrch->utilIter = coleiter;
 302     }
 303     else {
 304         ucol_setText(coleiter, pattern->text, pattern->textLength, status);
 305     }
 306     if(U_FAILURE(*status)) {
 307         return 0;
 308     }
 309
 310     if (pattern->ces != cetable && pattern->ces) {
 311         uprv_free(pattern->ces);
 312     }
 313
 314     uint16_t  offset      = 0;
 315     uint16_t  result      = 0;
 316     int32_t   ce;
 317
 318     while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
 319            U_SUCCESS(*status)) {
 320         uint32_t newce = getCE(strsrch, ce);
 321         if (newce) {
 322             int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
 323                                   newce,
 324                                   patternlength - ucol_getOffset(coleiter) + 1,
 325                                   status);
 326             if (U_FAILURE(*status)) {
 327                 return 0;
 328             }
 329             offset ++;
 330             if (cetable != temp && cetable != pattern->cesBuffer) {
 331                 uprv_free(cetable);
 332             }
 333             cetable = temp;
 334         }
 335         result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
 336     }
 337
 338     cetable[offset]   = 0;
 339     pattern->ces       = cetable;
 340     pattern->cesLength = offset;
 341
 342     return result;
 343 }
 344
 345 /**
 346 * Initializing the pce table for a pattern.
 347 * Stores non-ignorable collation keys.
 348 * Table size will be estimated by the size of the pattern text. Table
 349 * expansion will be perform as we go along. Adding 1 to ensure that the table
 350 * size definitely increases.
 351 * Internal method, status assumed to be a success.
 352 * @param strsrch string search data
 353 * @param status output error if any, caller to check status before calling
 354 *               method, status assumed to be success when passed in.
 355 * @return total number of expansions
 356 */
 357 static
 358 inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
 359                                           UErrorCode    *status)
 360 {
 361     UPattern *pattern            = &(strsrch->pattern);
 362     uint32_t  pcetablesize       = INITIAL_ARRAY_SIZE_;
 363     int64_t  *pcetable           = pattern->pcesBuffer;
 364     uint32_t  patternlength      = pattern->textLength;
 365     UCollationElements *coleiter = strsrch->utilIter;
 366
 367     if (coleiter == NULL) {
 368         coleiter = ucol_openElements(strsrch->collator, pattern->text,
 369                                      patternlength, status);
 370         // status will be checked in ucol_next(..) later and if it is an
 371         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
 372         // returned.
 373         strsrch->utilIter = coleiter;
 374     } else {
 375         ucol_setText(coleiter, pattern->text, pattern->textLength, status);
 376     }
 377     if(U_FAILURE(*status)) {
 378         return 0;
 379     }
 380
 381     if (pattern->pces != pcetable && pattern->pces != NULL) {
 382         uprv_free(pattern->pces);
 383     }
 384
 385     uint16_t  offset = 0;
 386     uint16_t  result = 0;
 387     int64_t   pce;
 388
 389     icu::UCollationPCE iter(coleiter);
 390
 391     // ** Should processed CEs be signed or unsigned?
 392     // ** (the rest of the code in this file seems to play fast-and-loose with
 393     // **  whether a CE is signed or unsigned. For example, look at routine above this one.)
 394     while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
 395            U_SUCCESS(*status)) {
 396         int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
 397                               pce,
 398                               patternlength - ucol_getOffset(coleiter) + 1,
 399                               status);
 400
 401         if (U_FAILURE(*status)) {
 402             return 0;
 403         }
 404
 405         offset += 1;
 406
 407         if (pcetable != temp && pcetable != pattern->pcesBuffer) {
 408             uprv_free(pcetable);
 409         }
 410
 411         pcetable = temp;
 412         //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
 413     }
 414
 415     pcetable[offset]   = 0;
 416     pattern->pces       = pcetable;
 417     pattern->pcesLength = offset;
 418
 419     return result;
 420 }
 421
 422 /**
 423 * Initializes the pattern struct.
 424 * Internal method, status assumed to be success.
 425 * @param strsrch UStringSearch data storage
 426 * @param status output error if any, caller to check status before calling
 427 *               method, status assumed to be success when passed in.
 428 * @return expansionsize the total expansion size of the pattern
 429 */
 430 static
 431 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
 432 {
 433     if (U_FAILURE(*status)) { return 0; }
 434           UPattern   *pattern     = &(strsrch->pattern);
 435     const UChar      *patterntext = pattern->text;
 436           int32_t     length      = pattern->textLength;
 437           int32_t index       = 0;
 438
 439     // Since the strength is primary, accents are ignored in the pattern.
 440     if (strsrch->strength == UCOL_PRIMARY) {
 441         pattern->hasPrefixAccents = 0;
 442         pattern->hasSuffixAccents = 0;
 443     } else {
 444         pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
 445                                                          SECOND_LAST_BYTE_SHIFT_;
 446         index = length;
 447         U16_BACK_1(patterntext, 0, index);
 448         pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
 449                                                                  LAST_BYTE_MASK_;
 450     }
 451
 452     // ** HACK **
 453     if (strsrch->pattern.pces != NULL) {
 454         if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
 455             uprv_free(strsrch->pattern.pces);
 456         }
 457
 458         strsrch->pattern.pces = NULL;
 459     }
 460
 461     // since intializePattern is an internal method status is a success.
 462     return initializePatternCETable(strsrch, status);
 463 }
 464
 465 /**
 466 * Initializing shift tables, with the default values.
 467 * If a corresponding default value is 0, the shift table is not set.
 468 * @param shift table for forwards shift
 469 * @param backshift table for backwards shift
 470 * @param cetable table containing pattern ce
 471 * @param cesize size of the pattern ces
 472 * @param expansionsize total size of the expansions
 473 * @param defaultforward the default forward value
 474 * @param defaultbackward the default backward value
 475 */
 476 static
 477 inline void setShiftTable(int16_t   shift[], int16_t backshift[],
 478                           int32_t  *cetable, int32_t cesize,
 479                           int16_t   expansionsize,
 480                           int16_t   defaultforward,
 481                           int16_t   defaultbackward)
 482 {
 483     // estimate the value to shift. to do that we estimate the smallest
 484     // number of characters to give the relevant ces, ie approximately
 485     // the number of ces minus their expansion, since expansions can come
 486     // from a character.
 487     int32_t count;
 488     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
 489         shift[count] = defaultforward;
 490     }
 491     cesize --; // down to the last index
 492     for (count = 0; count < cesize; count ++) {
 493         // number of ces from right of array to the count
 494         int temp = defaultforward - count - 1;
 495         shift[hash(cetable[count])] = temp > 1 ? temp : 1;
 496     }
 497     shift[hash(cetable[cesize])] = 1;
 498     // for ignorables we just shift by one. see test examples.
 499     shift[hash(0)] = 1;
 500
 501     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
 502         backshift[count] = defaultbackward;
 503     }
 504     for (count = cesize; count > 0; count --) {
 505         // the original value count does not seem to work
 506         backshift[hash(cetable[count])] = count > expansionsize ?
 507                                           (int16_t)(count - expansionsize) : 1;
 508     }
 509     backshift[hash(cetable[0])] = 1;
 510     backshift[hash(0)] = 1;
 511 }
 512
 513 /**
 514 * Building of the pattern collation element list and the boyer moore strsrch
 515 * table.
 516 * The canonical match will only be performed after the default match fails.
 517 * For both cases we need to remember the size of the composed and decomposed
 518 * versions of the string. Since the Boyer-Moore shift calculations shifts by
 519 * a number of characters in the text and tries to match the pattern from that
 520 * offset, the shift value can not be too large in case we miss some
 521 * characters. To choose a right shift size, we estimate the NFC form of the
 522 * and use its size as a shift guide. The NFC form should be the small
 523 * possible representation of the pattern. Anyways, we'll err on the smaller
 524 * shift size. Hence the calculation for minlength.
 525 * Canonical match will be performed slightly differently. We'll split the
 526 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
 527 * the first and last base character (MS), the ending accents (EA). Matches
 528 * will be done on MS first, and only when we match MS then some processing
 529 * will be required for the prefix and end accents in order to determine if
 530 * they match PA and EA. Hence the default shift values
 531 * for the canonical match will take the size of either end's accent into
 532 * consideration. Forwards search will take the end accents into consideration
 533 * for the default shift values and the backwards search will take the prefix
 534 * accents into consideration.
 535 * If pattern has no non-ignorable ce, we return a illegal argument error.
 536 * Internal method, status assumed to be success.
 537 * @param strsrch UStringSearch data storage
 538 * @param status  for output errors if it occurs, status is assumed to be a
 539 *                success when it is passed in.
 540 */
 541 static
 542 inline void initialize(UStringSearch *strsrch, UErrorCode *status)
 543 {
 544     int16_t expandlength  = initializePattern(strsrch, status);
 545     if (U_SUCCESS(*status) && strsrch->pattern.cesLength > 0) {
 546         UPattern *pattern = &strsrch->pattern;
 547         int32_t   cesize  = pattern->cesLength;
 548
 549         int16_t minlength = cesize > expandlength
 550                             ? (int16_t)cesize - expandlength : 1;
 551         pattern->defaultShiftSize    = minlength;
 552         setShiftTable(pattern->shift, pattern->backShift, pattern->ces,
 553                       cesize, expandlength, minlength, minlength);
 554         return;
 555     }
 556     strsrch->pattern.defaultShiftSize = 0;
 557 }
 558
 559 #if BOYER_MOORE
 560 /**
 561 * Check to make sure that the match length is at the end of the character by
 562 * using the breakiterator.
 563 * @param strsrch string search data
 564 * @param start target text start offset
 565 * @param end target text end offset
 566 */
 567 static
 568 void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
 569                                int32_t *end)
 570 {
 571 #if !UCONFIG_NO_BREAK_ITERATION
 572     UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
 573     if (breakiterator) {
 574         int32_t matchend = *end;
 575         //int32_t matchstart = *start;
 576
 577         if (!ubrk_isBoundary(breakiterator, matchend)) {
 578             *end = ubrk_following(breakiterator, matchend);
 579         }
 580
 581         /* Check the start of the matched text to make sure it doesn't have any accents
 582          * before it.  This code may not be necessary and so it is commented out */
 583         /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
 584             *start = ubrk_preceding(breakiterator, matchstart);
 585         }*/
 586     }
 587 #endif
 588 }
 589
 590 /**
 591 * Determine whether the target text in UStringSearch bounded by the offset
 592 * start and end is one or more whole units of text as
 593 * determined by the breakiterator in UStringSearch.
 594 * @param strsrch string search data
 595 * @param start target text start offset
 596 * @param end target text end offset
 597 */
 598 static
 599 UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
 600                                int32_t    end)
 601 {
 602 #if !UCONFIG_NO_BREAK_ITERATION
 603     UBreakIterator *breakiterator = strsrch->search->breakIter;
 604     //TODO: Add here.
 605     if (breakiterator) {
 606         int32_t startindex = ubrk_first(breakiterator);
 607         int32_t endindex   = ubrk_last(breakiterator);
 608
 609         // out-of-range indexes are never boundary positions
 610         if (start < startindex || start > endindex ||
 611             end < startindex || end > endindex) {
 612             return FALSE;
 613         }
 614         // otherwise, we can use following() on the position before the
 615         // specified one and return true of the position we get back is the
 616         // one the user specified
 617         UBool result = (start == startindex ||
 618                 ubrk_following(breakiterator, start - 1) == start) &&
 619                (end == endindex ||
 620                 ubrk_following(breakiterator, end - 1) == end);
 621         if (result) {
 622             // iterates the individual ces
 623                   UCollationElements *coleiter  = strsrch->utilIter;
 624             const UChar              *text      = strsrch->search->text +
 625                                                                       start;
 626                   UErrorCode          status    = U_ZERO_ERROR;
 627             ucol_setText(coleiter, text, end - start, &status);
 628             for (int32_t count = 0; count < strsrch->pattern.cesLength;
 629                  count ++) {
 630                 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
 631                 if (ce == UCOL_IGNORABLE) {
 632                     count --;
 633                     continue;
 634                 }
 635                 if (U_FAILURE(status) || ce != strsrch->pattern.ces[count]) {
 636                     return FALSE;
 637                 }
 638             }
 639             int32_t nextce = ucol_next(coleiter, &status);
 640             while (ucol_getOffset(coleiter) == (end - start)
 641                    && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
 642                 nextce = ucol_next(coleiter, &status);
 643             }
 644             if (ucol_getOffset(coleiter) == (end - start)
 645                 && nextce != UCOL_NULLORDER) {
 646                 // extra collation elements at the end of the match
 647                 return FALSE;
 648             }
 649         }
 650         return result;
 651     }
 652 #endif
 653     return TRUE;
 654 }
 655
 656 /**
 657 * Getting the next base character offset if current offset is an accent,
 658 * or the current offset if the current character contains a base character.
 659 * accents the following base character will be returned
 660 * @param text string
 661 * @param textoffset current offset
 662 * @param textlength length of text string
 663 * @return the next base character or the current offset
 664 *         if the current character is contains a base character.
 665 */
 666 static
 667 inline int32_t getNextBaseOffset(const UChar       *text,
 668                                            int32_t  textoffset,
 669                                            int32_t      textlength)
 670 {
 671     if (textoffset < textlength) {
 672         int32_t temp = textoffset;
 673         if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
 674             while (temp < textlength) {
 675                 int32_t result = temp;
 676                 if ((getFCD(text, &temp, textlength) >>
 677                      SECOND_LAST_BYTE_SHIFT_) == 0) {
 678                     return result;
 679                 }
 680             }
 681             return textlength;
 682         }
 683     }
 684     return textoffset;
 685 }
 686
 687 /**
 688 * Gets the next base character offset depending on the string search pattern
 689 * data
 690 * @param strsrch string search data
 691 * @param textoffset current offset, one offset away from the last character
 692 *                   to search for.
 693 * @return start index of the next base character or the current offset
 694 *         if the current character is contains a base character.
 695 */
 696 static
 697 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
 698                                                   int32_t    textoffset)
 699 {
 700     int32_t textlength = strsrch->search->textLength;
 701     if (strsrch->pattern.hasSuffixAccents &&
 702         textoffset < textlength) {
 703               int32_t  temp       = textoffset;
 704         const UChar       *text       = strsrch->search->text;
 705         U16_BACK_1(text, 0, temp);
 706         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
 707             return getNextBaseOffset(text, textoffset, textlength);
 708         }
 709     }
 710     return textoffset;
 711 }
 712
 713 /**
 714 * Shifting the collation element iterator position forward to prepare for
 715 * a following match. If the last character is a unsafe character, we'll only
 716 * shift by 1 to capture contractions, normalization etc.
 717 * Internal method, status assumed to be success.
 718 * @param text strsrch string search data
 719 * @param textoffset start text position to do search
 720 * @param ce the text ce which failed the match.
 721 * @param patternceindex index of the ce within the pattern ce buffer which
 722 *        failed the match
 723 * @return final offset
 724 */
 725 static
 726 inline int32_t shiftForward(UStringSearch *strsrch,
 727                                 int32_t    textoffset,
 728                                 int32_t       ce,
 729                                 int32_t        patternceindex)
 730 {
 731     UPattern *pattern = &(strsrch->pattern);
 732     if (ce != UCOL_NULLORDER) {
 733         int32_t shift = pattern->shift[hash(ce)];
 734         // this is to adjust for characters in the middle of the
 735         // substring for matching that failed.
 736         int32_t adjust = pattern->cesLength - patternceindex;
 737         if (adjust > 1 && shift >= adjust) {
 738             shift -= adjust - 1;
 739         }
 740         textoffset += shift;
 741     }
 742     else {
 743         textoffset += pattern->defaultShiftSize;
 744     }
 745
 746     textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
 747     // check for unsafe characters
 748     // * if it is the start or middle of a contraction: to be done after
 749     //   a initial match is found
 750     // * thai or lao base consonant character: similar to contraction
 751     // * high surrogate character: similar to contraction
 752     // * next character is a accent: shift to the next base character
 753     return textoffset;
 754 }
 755 #endif // #if BOYER_MOORE
 756
 757 /**
 758 * sets match not found
 759 * @param strsrch string search data
 760 */
 761 static
 762 inline void setMatchNotFound(UStringSearch *strsrch)
 763 {
 764     // this method resets the match result regardless of the error status.
 765     strsrch->search->matchedIndex = USEARCH_DONE;
 766     strsrch->search->matchedLength = 0;
 767     if (strsrch->search->isForwardSearching) {
 768         setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
 769     }
 770     else {
 771         setColEIterOffset(strsrch->textIter, 0);
 772     }
 773 }
 774
 775 #if BOYER_MOORE
 776 /**
 777 * Gets the offset to the next safe point in text.
 778 * ie. not the middle of a contraction, swappable characters or supplementary
 779 * characters.
 780 * @param collator collation sata
 781 * @param text string to work with
 782 * @param textoffset offset in string
 783 * @param textlength length of text string
 784 * @return offset to the next safe character
 785 */
 786 static
 787 inline int32_t getNextSafeOffset(const UCollator   *collator,
 788                                      const UChar       *text,
 789                                            int32_t  textoffset,
 790                                            int32_t      textlength)
 791 {
 792     int32_t result = textoffset; // first contraction character
 793     while (result != textlength && ucol_unsafeCP(text[result], collator)) {
 794         result ++;
 795     }
 796     return result;
 797 }
 798
 799 /**
 800 * This checks for accents in the potential match started with a .
 801 * composite character.
 802 * This is really painful... we have to check that composite character do not
 803 * have any extra accents. We have to normalize the potential match and find
 804 * the immediate decomposed character before the match.
 805 * The first composite character would have been taken care of by the fcd
 806 * checks in checkForwardExactMatch.
 807 * This is the slow path after the fcd of the first character and
 808 * the last character has been checked by checkForwardExactMatch and we
 809 * determine that the potential match has extra non-ignorable preceding
 810 * ces.
 811 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
 812 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
 813 * Note here that accents checking are slow and cautioned in the API docs.
 814 * Internal method, status assumed to be a success, caller should check status
 815 * before calling this method
 816 * @param strsrch string search data
 817 * @param start index of the potential unfriendly composite character
 818 * @param end index of the potential unfriendly composite character
 819 * @param status output error status if any.
 820 * @return TRUE if there is non-ignorable accents before at the beginning
 821 *              of the match, FALSE otherwise.
 822 */
 823
 824 static
 825 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
 826                                    int32_t    end,
 827                                    UErrorCode    *status)
 828 {
 829     UBool result = FALSE;
 830     if (strsrch->pattern.hasPrefixAccents) {
 831               int32_t  length = end - start;
 832               int32_t  offset = 0;
 833         const UChar       *text   = strsrch->search->text + start;
 834
 835         U16_FWD_1(text, offset, length);
 836         // we are only concerned with the first composite character
 837         if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
 838             int32_t safeoffset = getNextSafeOffset(strsrch->collator,
 839                                                        text, 0, length);
 840             if (safeoffset != length) {
 841                 safeoffset ++;
 842             }
 843             UChar   *norm = NULL;
 844             UChar    buffer[INITIAL_ARRAY_SIZE_];
 845             int32_t  size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
 846                                             buffer, INITIAL_ARRAY_SIZE_,
 847                                             status);
 848             if (U_FAILURE(*status)) {
 849                 return FALSE;
 850             }
 851             if (size >= INITIAL_ARRAY_SIZE_) {
 852                 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
 853                                                status);
 854                 // if allocation failed, status will be set to
 855                 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
 856                 // checks for it.
 857                 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
 858                                        size, status);
 859                 if (U_FAILURE(*status) && norm != NULL) {
 860                     uprv_free(norm);
 861                     return FALSE;
 862                 }
 863             }
 864             else {
 865                 norm = buffer;
 866             }
 867
 868             UCollationElements *coleiter  = strsrch->utilIter;
 869             ucol_setText(coleiter, norm, size, status);
 870             uint32_t            firstce   = strsrch->pattern.ces[0];
 871             UBool               ignorable = TRUE;
 872             uint32_t            ce        = UCOL_IGNORABLE;
 873             while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
 874                 offset = ucol_getOffset(coleiter);
 875                 if (ce != firstce && ce != UCOL_IGNORABLE) {
 876                     ignorable = FALSE;
 877                 }
 878                 ce = ucol_next(coleiter, status);
 879             }
 880             UChar32 codepoint;
 881             U16_PREV(norm, 0, offset, codepoint);
 882             result = !ignorable && (u_getCombiningClass(codepoint) != 0);
 883
 884             if (norm != buffer) {
 885                 uprv_free(norm);
 886             }
 887         }
 888     }
 889
 890     return result;
 891 }
 892
 893 /**
 894 * Used by exact matches, checks if there are accents before the match.
 895 * This is really painful... we have to check that composite characters at
 896 * the start of the matches have to not have any extra accents.
 897 * We check the FCD of the character first, if it starts with an accent and
 898 * the first pattern ce does not match the first ce of the character, we bail.
 899 * Otherwise we try normalizing the first composite
 900 * character and find the immediate decomposed character before the match to
 901 * see if it is an non-ignorable accent.
 902 * Now normalizing the first composite character is enough because we ensure
 903 * that when the match is passed in here with extra beginning ces, the
 904 * first or last ce that match has to occur within the first character.
 905 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
 906 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
 907 * Note here that accents checking are slow and cautioned in the API docs.
 908 * @param strsrch string search data
 909 * @param start offset
 910 * @param end offset
 911 * @return TRUE if there are accents on either side of the match,
 912 *         FALSE otherwise
 913 */
 914 static
 915 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
 916                                   int32_t    end)
 917 {
 918     if (strsrch->pattern.hasPrefixAccents) {
 919         UCollationElements *coleiter  = strsrch->textIter;
 920         UErrorCode          status    = U_ZERO_ERROR;
 921         // we have been iterating forwards previously
 922         uint32_t            ignorable = TRUE;
 923         int32_t             firstce   = strsrch->pattern.ces[0];
 924
 925         setColEIterOffset(coleiter, start);
 926         int32_t ce  = getCE(strsrch, ucol_next(coleiter, &status));
 927         if (U_FAILURE(status)) {
 928             return TRUE;
 929         }
 930         while (ce != firstce) {
 931             if (ce != UCOL_IGNORABLE) {
 932                 ignorable = FALSE;
 933             }
 934             ce = getCE(strsrch, ucol_next(coleiter, &status));
 935             if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
 936                 return TRUE;
 937             }
 938         }
 939         if (!ignorable && inNormBuf(coleiter)) {
 940             // within normalization buffer, discontiguous handled here
 941             return TRUE;
 942         }
 943
 944         // within text
 945         int32_t temp = start;
 946         // original code
 947         // accent = (getFCD(strsrch->search->text, &temp,
 948         //                  strsrch->search->textLength)
 949         //            >> SECOND_LAST_BYTE_SHIFT_);
 950         // however this code does not work well with VC7 .net in release mode.
 951         // maybe the inlines for getFCD combined with shifting has bugs in
 952         // VC7. anyways this is a work around.
 953         UBool accent = getFCD(strsrch->search->text, &temp,
 954                               strsrch->search->textLength) > 0xFF;
 955         if (!accent) {
 956             return checkExtraMatchAccents(strsrch, start, end, &status);
 957         }
 958         if (!ignorable) {
 959             return TRUE;
 960         }
 961         if (start > 0) {
 962             temp = start;
 963             U16_BACK_1(strsrch->search->text, 0, temp);
 964             if (getFCD(strsrch->search->text, &temp,
 965                        strsrch->search->textLength) & LAST_BYTE_MASK_) {
 966                 setColEIterOffset(coleiter, start);
 967                 ce = ucol_previous(coleiter, &status);
 968                 if (U_FAILURE(status) ||
 969                     (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
 970                     return TRUE;
 971                 }
 972             }
 973         }
 974     }
 975
 976     return FALSE;
 977 }
 978
 979 /**
 980 * Used by exact matches, checks if there are accents bounding the match.
 981 * Note this is the initial boundary check. If the potential match
 982 * starts or ends with composite characters, the accents in those
 983 * characters will be determined later.
 984 * Not doing backwards iteration here, since discontiguos contraction for
 985 * backwards collation element iterator, use up too many characters.
 986 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
 987 * should fail since there is a acute at the end of \u01FA
 988 * Note here that accents checking are slow and cautioned in the API docs.
 989 * @param strsrch string search data
 990 * @param start offset of match
 991 * @param end end offset of the match
 992 * @return TRUE if there are accents on either side of the match,
 993 *         FALSE otherwise
 994 */
 995 static
 996 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
 997                                  int32_t    end)
 998 {
 999     if (strsrch->pattern.hasSuffixAccents) {
1000         const UChar       *text       = strsrch->search->text;
1001               int32_t  temp       = end;
1002               int32_t      textlength = strsrch->search->textLength;
1003         U16_BACK_1(text, 0, temp);
1004         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
1005             int32_t             firstce  = strsrch->pattern.ces[0];
1006             UCollationElements *coleiter = strsrch->textIter;
1007             UErrorCode          status   = U_ZERO_ERROR;
1008             int32_t ce;
1009             setColEIterOffset(coleiter, start);
1010             while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
1011                 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
1012                     return TRUE;
1013                 }
1014             }
1015             int32_t count = 1;
1016             while (count < strsrch->pattern.cesLength) {
1017                 if (getCE(strsrch, ucol_next(coleiter, &status))
1018                     == UCOL_IGNORABLE) {
1019                     // Thai can give an ignorable here.
1020                     count --;
1021                 }
1022                 if (U_FAILURE(status)) {
1023                     return TRUE;
1024                 }
1025                 count ++;
1026             }
1027
1028             ce = ucol_next(coleiter, &status);
1029             if (U_FAILURE(status)) {
1030                 return TRUE;
1031             }
1032             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1033                 ce = getCE(strsrch, ce);
1034             }
1035             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1036                 if (ucol_getOffset(coleiter) <= end) {
1037                     return TRUE;
1038                 }
1039                 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
1040                     return TRUE;
1041                 }
1042             }
1043         }
1044     }
1045     return FALSE;
1046 }
1047 #endif // #if BOYER_MOORE
1048
1049 /**
1050 * Checks if the offset runs out of the text string
1051 * @param offset
1052 * @param textlength of the text string
1053 * @return TRUE if offset is out of bounds, FALSE otherwise
1054 */
1055 static
1056 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
1057 {
1058     return offset < 0 || offset > textlength;
1059 }
1060
1061 /**
1062 * Checks for identical match
1063 * @param strsrch string search data
1064 * @param start offset of possible match
1065 * @param end offset of possible match
1066 * @return TRUE if identical match is found
1067 */
1068 static
1069 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
1070                                   int32_t    end)
1071 {
1072     if (strsrch->strength != UCOL_IDENTICAL) {
1073         return TRUE;
1074     }
1075
1076     // Note: We could use Normalizer::compare() or similar, but for short strings
1077     // which may not be in FCD it might be faster to just NFD them.
1078     UErrorCode status = U_ZERO_ERROR;
1079     UnicodeString t2, p2;
1080     strsrch->nfd->normalize(
1081         UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
1082     strsrch->nfd->normalize(
1083         UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
1084     // return FALSE if NFD failed
1085     return U_SUCCESS(status) && t2 == p2;
1086 }
1087
1088 #if BOYER_MOORE
1089 /**
1090 * Checks to see if the match is repeated
1091 * @param strsrch string search data
1092 * @param start new match start index
1093 * @param end new match end index
1094 * @return TRUE if the the match is repeated, FALSE otherwise
1095 */
1096 static
1097 inline UBool checkRepeatedMatch(UStringSearch *strsrch,
1098                                 int32_t    start,
1099                                 int32_t    end)
1100 {
1101     int32_t lastmatchindex = strsrch->search->matchedIndex;
1102     UBool       result;
1103     if (lastmatchindex == USEARCH_DONE) {
1104         return FALSE;
1105     }
1106     if (strsrch->search->isForwardSearching) {
1107         result = start <= lastmatchindex;
1108     }
1109     else {
1110         result = start >= lastmatchindex;
1111     }
1112     if (!result && !strsrch->search->isOverlap) {
1113         if (strsrch->search->isForwardSearching) {
1114             result = start < lastmatchindex + strsrch->search->matchedLength;
1115         }
1116         else {
1117             result = end > lastmatchindex;
1118         }
1119     }
1120     return result;
1121 }
1122
1123 /**
1124 * Gets the collation element iterator's current offset.
1125 * @param coleiter collation element iterator
1126 * @param forwards flag TRUE if we are moving in th forwards direction
1127 * @return current offset
1128 */
1129 static
1130 inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
1131                                               UBool               forwards)
1132 {
1133     int32_t result = ucol_getOffset(coleiter);
1134     // intricacies of the the backwards collation element iterator
1135     if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
1136         result ++;
1137     }
1138     return result;
1139 }
1140
1141 /**
1142 * Checks match for contraction.
1143 * If the match ends with a partial contraction we fail.
1144 * If the match starts too far off (because of backwards iteration) we try to
1145 * chip off the extra characters depending on whether a breakiterator has
1146 * been used.
1147 * Internal method, error assumed to be success, caller has to check status
1148 * before calling this method.
1149 * @param strsrch string search data
1150 * @param start offset of potential match, to be modified if necessary
1151 * @param end offset of potential match, to be modified if necessary
1152 * @param status output error status if any
1153 * @return TRUE if match passes the contraction test, FALSE otherwise
1154 */
1155
1156 static
1157 UBool checkNextExactContractionMatch(UStringSearch *strsrch,
1158                                      int32_t   *start,
1159                                      int32_t   *end, UErrorCode  *status)
1160 {
1161           UCollationElements *coleiter   = strsrch->textIter;
1162           int32_t             textlength = strsrch->search->textLength;
1163           int32_t             temp       = *start;
1164     const UCollator          *collator   = strsrch->collator;
1165     const UChar              *text       = strsrch->search->text;
1166     // This part checks if either ends of the match contains potential
1167     // contraction. If so we'll have to iterate through them
1168     // The start contraction needs to be checked since ucol_previous dumps
1169     // all characters till the first safe character into the buffer.
1170     // *start + 1 is used to test for the unsafe characters instead of *start
1171     // because ucol_prev takes all unsafe characters till the first safe
1172     // character ie *start. so by testing *start + 1, we can estimate if
1173     // excess prefix characters has been included in the potential search
1174     // results.
1175     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1176         (*start + 1 < textlength
1177          && ucol_unsafeCP(text[*start + 1], collator))) {
1178         int32_t expansion  = getExpansionPrefix(coleiter);
1179         UBool   expandflag = expansion > 0;
1180         setColEIterOffset(coleiter, *start);
1181         while (expansion > 0) {
1182             // getting rid of the redundant ce, caused by setOffset.
1183             // since backward contraction/expansion may have extra ces if we
1184             // are in the normalization buffer, hasAccentsBeforeMatch would
1185             // have taken care of it.
1186             // E.g. the character \u01FA will have an expansion of 3, but if
1187             // we are only looking for acute and ring \u030A and \u0301, we'll
1188             // have to skip the first ce in the expansion buffer.
1189             ucol_next(coleiter, status);
1190             if (U_FAILURE(*status)) {
1191                 return FALSE;
1192             }
1193             if (ucol_getOffset(coleiter) != temp) {
1194                 *start = temp;
1195                 temp  = ucol_getOffset(coleiter);
1196             }
1197             expansion --;
1198         }
1199
1200         int32_t  *patternce       = strsrch->pattern.ces;
1201         int32_t   patterncelength = strsrch->pattern.cesLength;
1202         int32_t   count           = 0;
1203         while (count < patterncelength) {
1204             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1205             if (ce == UCOL_IGNORABLE) {
1206                 continue;
1207             }
1208             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1209                 *start = temp;
1210                 temp   = ucol_getOffset(coleiter);
1211             }
1212             if (U_FAILURE(*status) || ce != patternce[count]) {
1213                 (*end) ++;
1214                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1215                 return FALSE;
1216             }
1217             count ++;
1218         }
1219     }
1220     return TRUE;
1221 }
1222
1223 /**
1224 * Checks and sets the match information if found.
1225 * Checks
1226 * <ul>
1227 * <li> the potential match does not repeat the previous match
1228 * <li> boundaries are correct
1229 * <li> exact matches has no extra accents
1230 * <li> identical matchesb
1231 * <li> potential match does not end in the middle of a contraction
1232 * <\ul>
1233 * Otherwise the offset will be shifted to the next character.
1234 * Internal method, status assumed to be success, caller has to check status
1235 * before calling this method.
1236 * @param strsrch string search data
1237 * @param textoffset offset in the collation element text. the returned value
1238 *        will be the truncated end offset of the match or the new start
1239 *        search offset.
1240 * @param status output error status if any
1241 * @return TRUE if the match is valid, FALSE otherwise
1242 */
1243 static
1244 inline UBool checkNextExactMatch(UStringSearch *strsrch,
1245                                  int32_t   *textoffset, UErrorCode *status)
1246 {
1247     UCollationElements *coleiter = strsrch->textIter;
1248     int32_t         start    = getColElemIterOffset(coleiter, FALSE);
1249
1250     if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
1251         return FALSE;
1252     }
1253
1254     // this totally matches, however we need to check if it is repeating
1255     if (!isBreakUnit(strsrch, start, *textoffset) ||
1256         checkRepeatedMatch(strsrch, start, *textoffset) ||
1257         hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
1258         !checkIdentical(strsrch, start, *textoffset) ||
1259         hasAccentsAfterMatch(strsrch, start, *textoffset)) {
1260
1261         (*textoffset) ++;
1262         *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
1263         return FALSE;
1264     }
1265
1266     //Add breakiterator boundary check for primary strength search.
1267     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
1268         checkBreakBoundary(strsrch, &start, textoffset);
1269     }
1270
1271     // totally match, we will get rid of the ending ignorables.
1272     strsrch->search->matchedIndex  = start;
1273     strsrch->search->matchedLength = *textoffset - start;
1274     return TRUE;
1275 }
1276
1277 /**
1278 * Getting the previous base character offset, or the current offset if the
1279 * current character is a base character
1280 * @param text string
1281 * @param textoffset one offset after the current character
1282 * @return the offset of the next character after the base character or the first
1283 *         composed character with accents
1284 */
1285 static
1286 inline int32_t getPreviousBaseOffset(const UChar       *text,
1287                                                int32_t  textoffset)
1288 {
1289     if (textoffset > 0) {
1290         for (;;) {
1291             int32_t result = textoffset;
1292             U16_BACK_1(text, 0, textoffset);
1293             int32_t temp = textoffset;
1294             uint16_t fcd = getFCD(text, &temp, result);
1295             if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1296                 if (fcd & LAST_BYTE_MASK_) {
1297                     return textoffset;
1298                 }
1299                 return result;
1300             }
1301             if (textoffset == 0) {
1302                 return 0;
1303             }
1304         }
1305     }
1306     return textoffset;
1307 }
1308
1309 /**
1310 * Getting the indexes of the accents that are not blocked in the argument
1311 * accent array
1312 * @param accents array of accents in nfd terminated by a 0.
1313 * @param accentsindex array of indexes of the accents that are not blocked
1314 */
1315 static
1316 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
1317 {
1318     int32_t index     = 0;
1319     int32_t     length    = u_strlen(accents);
1320     UChar32     codepoint = 0;
1321     int         cclass    = 0;
1322     int         result    = 0;
1323     int32_t temp;
1324     while (index < length) {
1325         temp = index;
1326         U16_NEXT(accents, index, length, codepoint);
1327         if (u_getCombiningClass(codepoint) != cclass) {
1328             cclass        = u_getCombiningClass(codepoint);
1329             accentsindex[result] = temp;
1330             result ++;
1331         }
1332     }
1333     accentsindex[result] = length;
1334     return result;
1335 }
1336
1337 /**
1338 * Appends 3 UChar arrays to a destination array.
1339 * Creates a new array if we run out of space. The caller will have to
1340 * manually deallocate the newly allocated array.
1341 * Internal method, status assumed to be success, caller has to check status
1342 * before calling this method. destination not to be NULL and has at least
1343 * size destinationlength.
1344 * @param destination target array
1345 * @param destinationlength target array size, returning the appended length
1346 * @param source1 null-terminated first array
1347 * @param source2 second array
1348 * @param source2length length of seond array
1349 * @param source3 null-terminated third array
1350 * @param status error status if any
1351 * @return new destination array, destination if there was no new allocation
1352 */
1353 static
1354 inline UChar * addToUCharArray(      UChar      *destination,
1355                                      int32_t    *destinationlength,
1356                                const UChar      *source1,
1357                                const UChar      *source2,
1358                                      int32_t     source2length,
1359                                const UChar      *source3,
1360                                      UErrorCode *status)
1361 {
1362     int32_t source1length = source1 ? u_strlen(source1) : 0;
1363     int32_t source3length = source3 ? u_strlen(source3) : 0;
1364     if (*destinationlength < source1length + source2length + source3length +
1365                                                                            1)
1366     {
1367         destination = (UChar *)allocateMemory(
1368           (source1length + source2length + source3length + 1) * sizeof(UChar),
1369           status);
1370         // if error allocating memory, status will be
1371         // U_MEMORY_ALLOCATION_ERROR
1372         if (U_FAILURE(*status)) {
1373             *destinationlength = 0;
1374             return NULL;
1375         }
1376     }
1377     if (source1length != 0) {
1378         uprv_memcpy(destination, source1, sizeof(UChar) * source1length);
1379     }
1380     if (source2length != 0) {
1381         uprv_memcpy(destination + source1length, source2,
1382                     sizeof(UChar) * source2length);
1383     }
1384     if (source3length != 0) {
1385         uprv_memcpy(destination + source1length + source2length, source3,
1386                     sizeof(UChar) * source3length);
1387     }
1388     *destinationlength = source1length + source2length + source3length;
1389     return destination;
1390 }
1391
1392 /**
1393 * Running through a collation element iterator to see if the contents matches
1394 * pattern in string search data
1395 * @param strsrch string search data
1396 * @param coleiter collation element iterator
1397 * @return TRUE if a match if found, FALSE otherwise
1398 */
1399 static
1400 inline UBool checkCollationMatch(const UStringSearch      *strsrch,
1401                                        UCollationElements *coleiter)
1402 {
1403     int         patternceindex = strsrch->pattern.cesLength;
1404     int32_t    *patternce      = strsrch->pattern.ces;
1405     UErrorCode  status = U_ZERO_ERROR;
1406     while (patternceindex > 0) {
1407         int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
1408         if (ce == UCOL_IGNORABLE) {
1409             continue;
1410         }
1411         if (U_FAILURE(status) || ce != *patternce) {
1412             return FALSE;
1413         }
1414         patternce ++;
1415         patternceindex --;
1416     }
1417     return TRUE;
1418 }
1419
1420 /**
1421 * Rearranges the front accents to try matching.
1422 * Prefix accents in the text will be grouped according to their combining
1423 * class and the groups will be mixed and matched to try find the perfect
1424 * match with the pattern.
1425 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1426 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1427 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1428 *         "\u0301\u0325".
1429 * step 2: check if any of the generated substrings matches the pattern.
1430 * Internal method, status is assumed to be success, caller has to check status
1431 * before calling this method.
1432 * @param strsrch string search match
1433 * @param start first offset of the accents to start searching
1434 * @param end start of the last accent set
1435 * @param status output error status if any
1436 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1437 *         offset of the match. Note this start includes all preceding accents.
1438 */
1439 static
1440 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
1441                                        int32_t    start,
1442                                        int32_t    end,
1443                                        UErrorCode    *status)
1444 {
1445     const UChar       *text       = strsrch->search->text;
1446           int32_t      textlength = strsrch->search->textLength;
1447           int32_t  tempstart  = start;
1448
1449     if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
1450         // die... failed at a base character
1451         return USEARCH_DONE;
1452     }
1453
1454     int32_t offset = getNextBaseOffset(text, tempstart, textlength);
1455     start = getPreviousBaseOffset(text, tempstart);
1456
1457     UChar       accents[INITIAL_ARRAY_SIZE_];
1458     // normalizing the offensive string
1459     unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
1460                     INITIAL_ARRAY_SIZE_, status);
1461     if (U_FAILURE(*status)) {
1462         return USEARCH_DONE;
1463     }
1464
1465     int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
1466     int32_t         accentsize = getUnblockedAccentIndex(accents,
1467                                                                  accentsindex);
1468     int32_t         count      = (2 << (accentsize - 1)) - 1;
1469     UChar               buffer[INITIAL_ARRAY_SIZE_];
1470     UCollationElements *coleiter   = strsrch->utilIter;
1471     while (U_SUCCESS(*status) && count > 0) {
1472         UChar *rearrange = strsrch->canonicalPrefixAccents;
1473         // copy the base characters
1474         for (int k = 0; k < accentsindex[0]; k ++) {
1475             *rearrange ++ = accents[k];
1476         }
1477         // forming all possible canonical rearrangement by dropping
1478         // sets of accents
1479         for (int i = 0; i <= accentsize - 1; i ++) {
1480             int32_t mask = 1 << (accentsize - i - 1);
1481             if (count & mask) {
1482                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1483                     *rearrange ++ = accents[j];
1484                 }
1485             }
1486         }
1487         *rearrange = 0;
1488         int32_t  matchsize = INITIAL_ARRAY_SIZE_;
1489         UChar   *match     = addToUCharArray(buffer, &matchsize,
1490                                            strsrch->canonicalPrefixAccents,
1491                                            strsrch->search->text + offset,
1492                                            end - offset,
1493                                            strsrch->canonicalSuffixAccents,
1494                                            status);
1495
1496         // if status is a failure, ucol_setText does nothing.
1497         // run the collator iterator through this match
1498         ucol_setText(coleiter, match, matchsize, status);
1499         if (U_SUCCESS(*status)) {
1500             if (checkCollationMatch(strsrch, coleiter)) {
1501                 if (match != buffer) {
1502                     uprv_free(match);
1503                 }
1504                 return start;
1505             }
1506         }
1507         count --;
1508     }
1509     return USEARCH_DONE;
1510 }
1511
1512 /**
1513 * Gets the offset to the safe point in text before textoffset.
1514 * ie. not the middle of a contraction, swappable characters or supplementary
1515 * characters.
1516 * @param collator collation sata
1517 * @param text string to work with
1518 * @param textoffset offset in string
1519 * @param textlength length of text string
1520 * @return offset to the previous safe character
1521 */
1522 static
1523 inline uint32_t getPreviousSafeOffset(const UCollator   *collator,
1524                                       const UChar       *text,
1525                                             int32_t  textoffset)
1526 {
1527     int32_t result = textoffset; // first contraction character
1528     while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
1529         result --;
1530     }
1531     if (result != 0) {
1532         // the first contraction character is consider unsafe here
1533         result --;
1534     }
1535     return result;
1536 }
1537
1538 /**
1539 * Cleaning up after we passed the safe zone
1540 * @param strsrch string search data
1541 * @param safetext safe text array
1542 * @param safebuffer safe text buffer
1543 * @param coleiter collation element iterator for safe text
1544 */
1545 static
1546 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
1547                                   UChar         *safebuffer)
1548 {
1549     if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
1550     {
1551        uprv_free(safetext);
1552     }
1553 }
1554
1555 /**
1556 * Take the rearranged end accents and tries matching. If match failed at
1557 * a seperate preceding set of accents (seperated from the rearranged on by
1558 * at least a base character) then we rearrange the preceding accents and
1559 * tries matching again.
1560 * We allow skipping of the ends of the accent set if the ces do not match.
1561 * However if the failure is found before the accent set, it fails.
1562 * Internal method, status assumed to be success, caller has to check status
1563 * before calling this method.
1564 * @param strsrch string search data
1565 * @param textoffset of the start of the rearranged accent
1566 * @param status output error status if any
1567 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1568 *         offset of the match. Note this start includes all preceding accents.
1569 */
1570 static
1571 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
1572                                        int32_t    textoffset,
1573                                        UErrorCode    *status)
1574 {
1575     const UChar              *text           = strsrch->search->text;
1576     const UCollator          *collator       = strsrch->collator;
1577           int32_t             safelength     = 0;
1578           UChar              *safetext;
1579           int32_t             safetextlength;
1580           UChar               safebuffer[INITIAL_ARRAY_SIZE_];
1581           UCollationElements *coleiter       = strsrch->utilIter;
1582           int32_t         safeoffset     = textoffset;
1583
1584     if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
1585                                          collator)) {
1586         safeoffset     = getPreviousSafeOffset(collator, text, textoffset);
1587         safelength     = textoffset - safeoffset;
1588         safetextlength = INITIAL_ARRAY_SIZE_;
1589         safetext       = addToUCharArray(safebuffer, &safetextlength, NULL,
1590                                          text + safeoffset, safelength,
1591                                          strsrch->canonicalSuffixAccents,
1592                                          status);
1593     }
1594     else {
1595         safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
1596         safetext       = strsrch->canonicalSuffixAccents;
1597     }
1598
1599     // if status is a failure, ucol_setText does nothing
1600     ucol_setText(coleiter, safetext, safetextlength, status);
1601     // status checked in loop below
1602
1603     int32_t  *ce        = strsrch->pattern.ces;
1604     int32_t   celength  = strsrch->pattern.cesLength;
1605     int       ceindex   = celength - 1;
1606     UBool     isSafe    = TRUE; // indication flag for position in safe zone
1607
1608     while (ceindex >= 0) {
1609         int32_t textce = ucol_previous(coleiter, status);
1610         if (U_FAILURE(*status)) {
1611             if (isSafe) {
1612                 cleanUpSafeText(strsrch, safetext, safebuffer);
1613             }
1614             return USEARCH_DONE;
1615         }
1616         if (textce == UCOL_NULLORDER) {
1617             // check if we have passed the safe buffer
1618             if (coleiter == strsrch->textIter) {
1619                 cleanUpSafeText(strsrch, safetext, safebuffer);
1620                 return USEARCH_DONE;
1621             }
1622             cleanUpSafeText(strsrch, safetext, safebuffer);
1623             safetext = safebuffer;
1624             coleiter = strsrch->textIter;
1625             setColEIterOffset(coleiter, safeoffset);
1626             // status checked at the start of the loop
1627             isSafe = FALSE;
1628             continue;
1629         }
1630         textce = getCE(strsrch, textce);
1631         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
1632             // do the beginning stuff
1633             int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
1634             if (isSafe && failedoffset >= safelength) {
1635                 // alas... no hope. failed at rearranged accent set
1636                 cleanUpSafeText(strsrch, safetext, safebuffer);
1637                 return USEARCH_DONE;
1638             }
1639             else {
1640                 if (isSafe) {
1641                     failedoffset += safeoffset;
1642                     cleanUpSafeText(strsrch, safetext, safebuffer);
1643                 }
1644
1645                 // try rearranging the front accents
1646                 int32_t result = doNextCanonicalPrefixMatch(strsrch,
1647                                         failedoffset, textoffset, status);
1648                 if (result != USEARCH_DONE) {
1649                     // if status is a failure, ucol_setOffset does nothing
1650                     setColEIterOffset(strsrch->textIter, result);
1651                 }
1652                 if (U_FAILURE(*status)) {
1653                     return USEARCH_DONE;
1654                 }
1655                 return result;
1656             }
1657         }
1658         if (textce == ce[ceindex]) {
1659             ceindex --;
1660         }
1661     }
1662     // set offset here
1663     if (isSafe) {
1664         int32_t result     = getColElemIterOffset(coleiter, FALSE);
1665         // sets the text iterator here with the correct expansion and offset
1666         int32_t    leftoverces = getExpansionPrefix(coleiter);
1667         cleanUpSafeText(strsrch, safetext, safebuffer);
1668         if (result >= safelength) {
1669             result = textoffset;
1670         }
1671         else {
1672             result += safeoffset;
1673         }
1674         setColEIterOffset(strsrch->textIter, result);
1675         strsrch->textIter->iteratordata_.toReturn =
1676                        setExpansionPrefix(strsrch->textIter, leftoverces);
1677         return result;
1678     }
1679
1680     return ucol_getOffset(coleiter);
1681 }
1682
1683 /**
1684 * Trying out the substring and sees if it can be a canonical match.
1685 * This will try normalizing the end accents and arranging them into canonical
1686 * equivalents and check their corresponding ces with the pattern ce.
1687 * Suffix accents in the text will be grouped according to their combining
1688 * class and the groups will be mixed and matched to try find the perfect
1689 * match with the pattern.
1690 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1691 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1692 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1693 *         "\u0301\u0325".
1694 * step 2: check if any of the generated substrings matches the pattern.
1695 * Internal method, status assumed to be success, caller has to check status
1696 * before calling this method.
1697 * @param strsrch string search data
1698 * @param textoffset end offset in the collation element text that ends with
1699 *                   the accents to be rearranged
1700 * @param status error status if any
1701 * @return TRUE if the match is valid, FALSE otherwise
1702 */
1703 static
1704 UBool doNextCanonicalMatch(UStringSearch *strsrch,
1705                            int32_t    textoffset,
1706                            UErrorCode    *status)
1707 {
1708     const UChar       *text = strsrch->search->text;
1709           int32_t  temp = textoffset;
1710     U16_BACK_1(text, 0, temp);
1711     if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
1712         UCollationElements *coleiter = strsrch->textIter;
1713         int32_t         offset   = getColElemIterOffset(coleiter, FALSE);
1714         if (strsrch->pattern.hasPrefixAccents) {
1715             offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
1716                                                 status);
1717             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
1718                 setColEIterOffset(coleiter, offset);
1719                 return TRUE;
1720             }
1721         }
1722         return FALSE;
1723     }
1724
1725     if (!strsrch->pattern.hasSuffixAccents) {
1726         return FALSE;
1727     }
1728
1729     UChar       accents[INITIAL_ARRAY_SIZE_];
1730     // offset to the last base character in substring to search
1731     int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
1732     // normalizing the offensive string
1733     unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
1734                                0, accents, INITIAL_ARRAY_SIZE_, status);
1735     // status checked in loop below
1736
1737     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1738     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
1739
1740     // 2 power n - 1 plus the full set of accents
1741     int32_t  count = (2 << (size - 1)) - 1;
1742     while (U_SUCCESS(*status) && count > 0) {
1743         UChar *rearrange = strsrch->canonicalSuffixAccents;
1744         // copy the base characters
1745         for (int k = 0; k < accentsindex[0]; k ++) {
1746             *rearrange ++ = accents[k];
1747         }
1748         // forming all possible canonical rearrangement by dropping
1749         // sets of accents
1750         for (int i = 0; i <= size - 1; i ++) {
1751             int32_t mask = 1 << (size - i - 1);
1752             if (count & mask) {
1753                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1754                     *rearrange ++ = accents[j];
1755                 }
1756             }
1757         }
1758         *rearrange = 0;
1759         int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
1760                                                         status);
1761         if (offset != USEARCH_DONE) {
1762             return TRUE; // match found
1763         }
1764         count --;
1765     }
1766     return FALSE;
1767 }
1768
1769 /**
1770 * Gets the previous base character offset depending on the string search
1771 * pattern data
1772 * @param strsrch string search data
1773 * @param textoffset current offset, current character
1774 * @return the offset of the next character after this base character or itself
1775 *         if it is a composed character with accents
1776 */
1777 static
1778 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
1779                                                       int32_t textoffset)
1780 {
1781     if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
1782         const UChar       *text = strsrch->search->text;
1783               int32_t  offset = textoffset;
1784         if (getFCD(text, &offset, strsrch->search->textLength) >>
1785                                                    SECOND_LAST_BYTE_SHIFT_) {
1786             return getPreviousBaseOffset(text, textoffset);
1787         }
1788     }
1789     return textoffset;
1790 }
1791
1792 /**
1793 * Checks match for contraction.
1794 * If the match ends with a partial contraction we fail.
1795 * If the match starts too far off (because of backwards iteration) we try to
1796 * chip off the extra characters
1797 * Internal method, status assumed to be success, caller has to check status
1798 * before calling this method.
1799 * @param strsrch string search data
1800 * @param start offset of potential match, to be modified if necessary
1801 * @param end offset of potential match, to be modified if necessary
1802 * @param status output error status if any
1803 * @return TRUE if match passes the contraction test, FALSE otherwise
1804 */
1805 static
1806 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
1807                                          int32_t   *start,
1808                                          int32_t   *end,
1809                                          UErrorCode    *status)
1810 {
1811           UCollationElements *coleiter   = strsrch->textIter;
1812           int32_t             textlength = strsrch->search->textLength;
1813           int32_t         temp       = *start;
1814     const UCollator          *collator   = strsrch->collator;
1815     const UChar              *text       = strsrch->search->text;
1816     // This part checks if either ends of the match contains potential
1817     // contraction. If so we'll have to iterate through them
1818     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1819         (*start + 1 < textlength
1820          && ucol_unsafeCP(text[*start + 1], collator))) {
1821         int32_t expansion  = getExpansionPrefix(coleiter);
1822         UBool   expandflag = expansion > 0;
1823         setColEIterOffset(coleiter, *start);
1824         while (expansion > 0) {
1825             // getting rid of the redundant ce, caused by setOffset.
1826             // since backward contraction/expansion may have extra ces if we
1827             // are in the normalization buffer, hasAccentsBeforeMatch would
1828             // have taken care of it.
1829             // E.g. the character \u01FA will have an expansion of 3, but if
1830             // we are only looking for acute and ring \u030A and \u0301, we'll
1831             // have to skip the first ce in the expansion buffer.
1832             ucol_next(coleiter, status);
1833             if (U_FAILURE(*status)) {
1834                 return FALSE;
1835             }
1836             if (ucol_getOffset(coleiter) != temp) {
1837                 *start = temp;
1838                 temp  = ucol_getOffset(coleiter);
1839             }
1840             expansion --;
1841         }
1842
1843         int32_t  *patternce       = strsrch->pattern.ces;
1844         int32_t   patterncelength = strsrch->pattern.cesLength;
1845         int32_t   count           = 0;
1846         int32_t   textlength      = strsrch->search->textLength;
1847         while (count < patterncelength) {
1848             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1849             // status checked below, note that if status is a failure
1850             // ucol_next returns UCOL_NULLORDER
1851             if (ce == UCOL_IGNORABLE) {
1852                 continue;
1853             }
1854             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1855                 *start = temp;
1856                 temp   = ucol_getOffset(coleiter);
1857             }
1858
1859             if (count == 0 && ce != patternce[0]) {
1860                 // accents may have extra starting ces, this occurs when a
1861                 // pure accent pattern is matched without rearrangement
1862                 // text \u0325\u0300 and looking for \u0300
1863                 int32_t expected = patternce[0];
1864                 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
1865                     ce = getCE(strsrch, ucol_next(coleiter, status));
1866                     while (U_SUCCESS(*status) && ce != expected &&
1867                            ce != UCOL_NULLORDER &&
1868                            ucol_getOffset(coleiter) <= *end) {
1869                         ce = getCE(strsrch, ucol_next(coleiter, status));
1870                     }
1871                 }
1872             }
1873             if (U_FAILURE(*status) || ce != patternce[count]) {
1874                 (*end) ++;
1875                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1876                 return FALSE;
1877             }
1878             count ++;
1879         }
1880     }
1881     return TRUE;
1882 }
1883
1884 /**
1885 * Checks and sets the match information if found.
1886 * Checks
1887 * <ul>
1888 * <li> the potential match does not repeat the previous match
1889 * <li> boundaries are correct
1890 * <li> potential match does not end in the middle of a contraction
1891 * <li> identical matches
1892 * <\ul>
1893 * Otherwise the offset will be shifted to the next character.
1894 * Internal method, status assumed to be success, caller has to check the
1895 * status before calling this method.
1896 * @param strsrch string search data
1897 * @param textoffset offset in the collation element text. the returned value
1898 *        will be the truncated end offset of the match or the new start
1899 *        search offset.
1900 * @param status output error status if any
1901 * @return TRUE if the match is valid, FALSE otherwise
1902 */
1903 static
1904 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
1905                                      int32_t   *textoffset,
1906                                      UErrorCode    *status)
1907 {
1908     // to ensure that the start and ends are not composite characters
1909     UCollationElements *coleiter = strsrch->textIter;
1910     // if we have a canonical accent match
1911     if ((strsrch->pattern.hasSuffixAccents &&
1912         strsrch->canonicalSuffixAccents[0]) ||
1913         (strsrch->pattern.hasPrefixAccents &&
1914         strsrch->canonicalPrefixAccents[0])) {
1915         strsrch->search->matchedIndex  = getPreviousUStringSearchBaseOffset(
1916                                                     strsrch,
1917                                                     ucol_getOffset(coleiter));
1918         strsrch->search->matchedLength = *textoffset -
1919                                                 strsrch->search->matchedIndex;
1920         return TRUE;
1921     }
1922
1923     int32_t start = getColElemIterOffset(coleiter, FALSE);
1924     if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
1925                                             status) || U_FAILURE(*status)) {
1926         return FALSE;
1927     }
1928
1929     start = getPreviousUStringSearchBaseOffset(strsrch, start);
1930     // this totally matches, however we need to check if it is repeating
1931     if (checkRepeatedMatch(strsrch, start, *textoffset) ||
1932         !isBreakUnit(strsrch, start, *textoffset) ||
1933         !checkIdentical(strsrch, start, *textoffset)) {
1934         (*textoffset) ++;
1935         *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
1936                                         strsrch->search->textLength);
1937         return FALSE;
1938     }
1939
1940     strsrch->search->matchedIndex  = start;
1941     strsrch->search->matchedLength = *textoffset - start;
1942     return TRUE;
1943 }
1944
1945 /**
1946 * Shifting the collation element iterator position forward to prepare for
1947 * a preceding match. If the first character is a unsafe character, we'll only
1948 * shift by 1 to capture contractions, normalization etc.
1949 * Internal method, status assumed to be success, caller has to check status
1950 * before calling this method.
1951 * @param text strsrch string search data
1952 * @param textoffset start text position to do search
1953 * @param ce the text ce which failed the match.
1954 * @param patternceindex index of the ce within the pattern ce buffer which
1955 *        failed the match
1956 * @return final offset
1957 */
1958 static
1959 inline int32_t reverseShift(UStringSearch *strsrch,
1960                                 int32_t    textoffset,
1961                                 int32_t       ce,
1962                                 int32_t        patternceindex)
1963 {
1964     if (strsrch->search->isOverlap) {
1965         if (textoffset != strsrch->search->textLength) {
1966             textoffset --;
1967         }
1968         else {
1969             textoffset -= strsrch->pattern.defaultShiftSize;
1970         }
1971     }
1972     else {
1973         if (ce != UCOL_NULLORDER) {
1974             int32_t shift = strsrch->pattern.backShift[hash(ce)];
1975
1976             // this is to adjust for characters in the middle of the substring
1977             // for matching that failed.
1978             int32_t adjust = patternceindex;
1979             if (adjust > 1 && shift > adjust) {
1980                 shift -= adjust - 1;
1981             }
1982             textoffset -= shift;
1983         }
1984         else {
1985             textoffset -= strsrch->pattern.defaultShiftSize;
1986         }
1987     }
1988     textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
1989     return textoffset;
1990 }
1991
1992 /**
1993 * Checks match for contraction.
1994 * If the match starts with a partial contraction we fail.
1995 * Internal method, status assumed to be success, caller has to check status
1996 * before calling this method.
1997 * @param strsrch string search data
1998 * @param start offset of potential match, to be modified if necessary
1999 * @param end offset of potential match, to be modified if necessary
2000 * @param status output error status if any
2001 * @return TRUE if match passes the contraction test, FALSE otherwise
2002 */
2003 static
2004 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
2005                                      int32_t   *start,
2006                                      int32_t   *end, UErrorCode  *status)
2007 {
2008           UCollationElements *coleiter   = strsrch->textIter;
2009           int32_t             textlength = strsrch->search->textLength;
2010           int32_t             temp       = *end;
2011     const UCollator          *collator   = strsrch->collator;
2012     const UChar              *text       = strsrch->search->text;
2013     // This part checks if either if the start of the match contains potential
2014     // contraction. If so we'll have to iterate through them
2015     // Since we used ucol_next while previously looking for the potential
2016     // match, this guarantees that our end will not be a partial contraction,
2017     // or a partial supplementary character.
2018     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2019         int32_t expansion  = getExpansionSuffix(coleiter);
2020         UBool   expandflag = expansion > 0;
2021         setColEIterOffset(coleiter, *end);
2022         while (U_SUCCESS(*status) && expansion > 0) {
2023             // getting rid of the redundant ce
2024             // since forward contraction/expansion may have extra ces
2025             // if we are in the normalization buffer, hasAccentsBeforeMatch
2026             // would have taken care of it.
2027             // E.g. the character \u01FA will have an expansion of 3, but if
2028             // we are only looking for A ring A\u030A, we'll have to skip the
2029             // last ce in the expansion buffer
2030             ucol_previous(coleiter, status);
2031             if (U_FAILURE(*status)) {
2032                 return FALSE;
2033             }
2034             if (ucol_getOffset(coleiter) != temp) {
2035                 *end = temp;
2036                 temp  = ucol_getOffset(coleiter);
2037             }
2038             expansion --;
2039         }
2040
2041         int32_t  *patternce       = strsrch->pattern.ces;
2042         int32_t   patterncelength = strsrch->pattern.cesLength;
2043         int32_t   count           = patterncelength;
2044         while (count > 0) {
2045             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2046             // status checked below, note that if status is a failure
2047             // ucol_previous returns UCOL_NULLORDER
2048             if (ce == UCOL_IGNORABLE) {
2049                 continue;
2050             }
2051             if (expandflag && count == 0 &&
2052                 getColElemIterOffset(coleiter, FALSE) != temp) {
2053                 *end = temp;
2054                 temp  = ucol_getOffset(coleiter);
2055             }
2056             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2057                 (*start) --;
2058                 *start = getPreviousBaseOffset(text, *start);
2059                 return FALSE;
2060             }
2061             count --;
2062         }
2063     }
2064     return TRUE;
2065 }
2066
2067 /**
2068 * Checks and sets the match information if found.
2069 * Checks
2070 * <ul>
2071 * <li> the current match does not repeat the last match
2072 * <li> boundaries are correct
2073 * <li> exact matches has no extra accents
2074 * <li> identical matches
2075 * <\ul>
2076 * Otherwise the offset will be shifted to the preceding character.
2077 * Internal method, status assumed to be success, caller has to check status
2078 * before calling this method.
2079 * @param strsrch string search data
2080 * @param collator
2081 * @param coleiter collation element iterator
2082 * @param text string
2083 * @param textoffset offset in the collation element text. the returned value
2084 *        will be the truncated start offset of the match or the new start
2085 *        search offset.
2086 * @param status output error status if any
2087 * @return TRUE if the match is valid, FALSE otherwise
2088 */
2089 static
2090 inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
2091                                      int32_t   *textoffset,
2092                                      UErrorCode    *status)
2093 {
2094     // to ensure that the start and ends are not composite characters
2095     int32_t end = ucol_getOffset(strsrch->textIter);
2096     if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
2097         || U_FAILURE(*status)) {
2098             return FALSE;
2099     }
2100
2101     // this totally matches, however we need to check if it is repeating
2102     // the old match
2103     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2104         !isBreakUnit(strsrch, *textoffset, end) ||
2105         hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
2106         !checkIdentical(strsrch, *textoffset, end) ||
2107         hasAccentsAfterMatch(strsrch, *textoffset, end)) {
2108         (*textoffset) --;
2109         *textoffset = getPreviousBaseOffset(strsrch->search->text,
2110                                             *textoffset);
2111         return FALSE;
2112     }
2113
2114     //Add breakiterator boundary check for primary strength search.
2115     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
2116         checkBreakBoundary(strsrch, textoffset, &end);
2117     }
2118
2119     strsrch->search->matchedIndex = *textoffset;
2120     strsrch->search->matchedLength = end - *textoffset;
2121     return TRUE;
2122 }
2123
2124 /**
2125 * Rearranges the end accents to try matching.
2126 * Suffix accents in the text will be grouped according to their combining
2127 * class and the groups will be mixed and matched to try find the perfect
2128 * match with the pattern.
2129 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2130 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2131 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2132 *         "\u0301\u0325".
2133 * step 2: check if any of the generated substrings matches the pattern.
2134 * Internal method, status assumed to be success, user has to check status
2135 * before calling this method.
2136 * @param strsrch string search match
2137 * @param start offset of the first base character
2138 * @param end start of the last accent set
2139 * @param status only error status if any
2140 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2141 *         offset of the match. Note this start includes all following accents.
2142 */
2143 static
2144 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
2145                                            int32_t    start,
2146                                            int32_t    end,
2147                                            UErrorCode    *status)
2148 {
2149     const UChar       *text       = strsrch->search->text;
2150           int32_t  tempend    = end;
2151
2152     U16_BACK_1(text, 0, tempend);
2153     if (!(getFCD(text, &tempend, strsrch->search->textLength) &
2154                                                            LAST_BYTE_MASK_)) {
2155         // die... failed at a base character
2156         return USEARCH_DONE;
2157     }
2158     end = getNextBaseOffset(text, end, strsrch->search->textLength);
2159
2160     if (U_SUCCESS(*status)) {
2161         UChar       accents[INITIAL_ARRAY_SIZE_];
2162         int32_t offset = getPreviousBaseOffset(text, end);
2163         // normalizing the offensive string
2164         unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
2165                         INITIAL_ARRAY_SIZE_, status);
2166
2167         int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
2168         int32_t         accentsize = getUnblockedAccentIndex(accents,
2169                                                          accentsindex);
2170         int32_t         count      = (2 << (accentsize - 1)) - 1;
2171         UChar               buffer[INITIAL_ARRAY_SIZE_];
2172         UCollationElements *coleiter = strsrch->utilIter;
2173         while (U_SUCCESS(*status) && count > 0) {
2174             UChar *rearrange = strsrch->canonicalSuffixAccents;
2175             // copy the base characters
2176             for (int k = 0; k < accentsindex[0]; k ++) {
2177                 *rearrange ++ = accents[k];
2178             }
2179             // forming all possible canonical rearrangement by dropping
2180             // sets of accents
2181             for (int i = 0; i <= accentsize - 1; i ++) {
2182                 int32_t mask = 1 << (accentsize - i - 1);
2183                 if (count & mask) {
2184                     for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2185                         *rearrange ++ = accents[j];
2186                     }
2187                 }
2188             }
2189             *rearrange = 0;
2190             int32_t  matchsize = INITIAL_ARRAY_SIZE_;
2191             UChar   *match     = addToUCharArray(buffer, &matchsize,
2192                                            strsrch->canonicalPrefixAccents,
2193                                            strsrch->search->text + start,
2194                                            offset - start,
2195                                            strsrch->canonicalSuffixAccents,
2196                                            status);
2197
2198             // run the collator iterator through this match
2199             // if status is a failure ucol_setText does nothing
2200             ucol_setText(coleiter, match, matchsize, status);
2201             if (U_SUCCESS(*status)) {
2202                 if (checkCollationMatch(strsrch, coleiter)) {
2203                     if (match != buffer) {
2204                         uprv_free(match);
2205                     }
2206                     return end;
2207                 }
2208             }
2209             count --;
2210         }
2211     }
2212     return USEARCH_DONE;
2213 }
2214
2215 /**
2216 * Take the rearranged start accents and tries matching. If match failed at
2217 * a seperate following set of accents (seperated from the rearranged on by
2218 * at least a base character) then we rearrange the preceding accents and
2219 * tries matching again.
2220 * We allow skipping of the ends of the accent set if the ces do not match.
2221 * However if the failure is found before the accent set, it fails.
2222 * Internal method, status assumed to be success, caller has to check status
2223 * before calling this method.
2224 * @param strsrch string search data
2225 * @param textoffset of the ends of the rearranged accent
2226 * @param status output error status if any
2227 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2228 *         offset of the match. Note this start includes all following accents.
2229 */
2230 static
2231 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
2232                                            int32_t    textoffset,
2233                                            UErrorCode    *status)
2234 {
2235     const UChar       *text       = strsrch->search->text;
2236     const UCollator   *collator   = strsrch->collator;
2237           int32_t      safelength = 0;
2238           UChar       *safetext;
2239           int32_t      safetextlength;
2240           UChar        safebuffer[INITIAL_ARRAY_SIZE_];
2241           int32_t  safeoffset = textoffset;
2242
2243     if (textoffset &&
2244         ucol_unsafeCP(strsrch->canonicalPrefixAccents[
2245                                  u_strlen(strsrch->canonicalPrefixAccents) - 1
2246                                          ], collator)) {
2247         safeoffset     = getNextSafeOffset(collator, text, textoffset,
2248                                            strsrch->search->textLength);
2249         safelength     = safeoffset - textoffset;
2250         safetextlength = INITIAL_ARRAY_SIZE_;
2251         safetext       = addToUCharArray(safebuffer, &safetextlength,
2252                                          strsrch->canonicalPrefixAccents,
2253                                          text + textoffset, safelength,
2254                                          NULL, status);
2255     }
2256     else {
2257         safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
2258         safetext       = strsrch->canonicalPrefixAccents;
2259     }
2260
2261     UCollationElements *coleiter = strsrch->utilIter;
2262      // if status is a failure, ucol_setText does nothing
2263     ucol_setText(coleiter, safetext, safetextlength, status);
2264     // status checked in loop below
2265
2266     int32_t  *ce           = strsrch->pattern.ces;
2267     int32_t   celength     = strsrch->pattern.cesLength;
2268     int       ceindex      = 0;
2269     UBool     isSafe       = TRUE; // safe zone indication flag for position
2270     int32_t   prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
2271
2272     while (ceindex < celength) {
2273         int32_t textce = ucol_next(coleiter, status);
2274         if (U_FAILURE(*status)) {
2275             if (isSafe) {
2276                 cleanUpSafeText(strsrch, safetext, safebuffer);
2277             }
2278             return USEARCH_DONE;
2279         }
2280         if (textce == UCOL_NULLORDER) {
2281             // check if we have passed the safe buffer
2282             if (coleiter == strsrch->textIter) {
2283                 cleanUpSafeText(strsrch, safetext, safebuffer);
2284                 return USEARCH_DONE;
2285             }
2286             cleanUpSafeText(strsrch, safetext, safebuffer);
2287             safetext = safebuffer;
2288             coleiter = strsrch->textIter;
2289             setColEIterOffset(coleiter, safeoffset);
2290             // status checked at the start of the loop
2291             isSafe = FALSE;
2292             continue;
2293         }
2294         textce = getCE(strsrch, textce);
2295         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
2296             // do the beginning stuff
2297             int32_t failedoffset = ucol_getOffset(coleiter);
2298             if (isSafe && failedoffset <= prefixlength) {
2299                 // alas... no hope. failed at rearranged accent set
2300                 cleanUpSafeText(strsrch, safetext, safebuffer);
2301                 return USEARCH_DONE;
2302             }
2303             else {
2304                 if (isSafe) {
2305                     failedoffset = safeoffset - failedoffset;
2306                     cleanUpSafeText(strsrch, safetext, safebuffer);
2307                 }
2308
2309                 // try rearranging the end accents
2310                 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
2311                                         textoffset, failedoffset, status);
2312                 if (result != USEARCH_DONE) {
2313                     // if status is a failure, ucol_setOffset does nothing
2314                     setColEIterOffset(strsrch->textIter, result);
2315                 }
2316                 if (U_FAILURE(*status)) {
2317                     return USEARCH_DONE;
2318                 }
2319                 return result;
2320             }
2321         }
2322         if (textce == ce[ceindex]) {
2323             ceindex ++;
2324         }
2325     }
2326     // set offset here
2327     if (isSafe) {
2328         int32_t result      = ucol_getOffset(coleiter);
2329         // sets the text iterator here with the correct expansion and offset
2330         int32_t     leftoverces = getExpansionSuffix(coleiter);
2331         cleanUpSafeText(strsrch, safetext, safebuffer);
2332         if (result <= prefixlength) {
2333             result = textoffset;
2334         }
2335         else {
2336             result = textoffset + (safeoffset - result);
2337         }
2338         setColEIterOffset(strsrch->textIter, result);
2339         setExpansionSuffix(strsrch->textIter, leftoverces);
2340         return result;
2341     }
2342
2343     return ucol_getOffset(coleiter);
2344 }
2345
2346 /**
2347 * Trying out the substring and sees if it can be a canonical match.
2348 * This will try normalizing the starting accents and arranging them into
2349 * canonical equivalents and check their corresponding ces with the pattern ce.
2350 * Prefix accents in the text will be grouped according to their combining
2351 * class and the groups will be mixed and matched to try find the perfect
2352 * match with the pattern.
2353 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2354 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2355 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2356 *         "\u0301\u0325".
2357 * step 2: check if any of the generated substrings matches the pattern.
2358 * Internal method, status assumed to be success, caller has to check status
2359 * before calling this method.
2360 * @param strsrch string search data
2361 * @param textoffset start offset in the collation element text that starts
2362 *                   with the accents to be rearranged
2363 * @param status output error status if any
2364 * @return TRUE if the match is valid, FALSE otherwise
2365 */
2366 static
2367 UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
2368                                int32_t    textoffset,
2369                                UErrorCode    *status)
2370 {
2371     const UChar       *text       = strsrch->search->text;
2372           int32_t  temp       = textoffset;
2373           int32_t      textlength = strsrch->search->textLength;
2374     if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
2375         UCollationElements *coleiter = strsrch->textIter;
2376         int32_t         offset   = ucol_getOffset(coleiter);
2377         if (strsrch->pattern.hasSuffixAccents) {
2378             offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
2379                                                     offset, status);
2380             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
2381                 setColEIterOffset(coleiter, offset);
2382                 return TRUE;
2383             }
2384         }
2385         return FALSE;
2386     }
2387
2388     if (!strsrch->pattern.hasPrefixAccents) {
2389         return FALSE;
2390     }
2391
2392     UChar       accents[INITIAL_ARRAY_SIZE_];
2393     // offset to the last base character in substring to search
2394     int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
2395     // normalizing the offensive string
2396     unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
2397                                0, accents, INITIAL_ARRAY_SIZE_, status);
2398     // status checked in loop
2399
2400     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2401     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
2402
2403     // 2 power n - 1 plus the full set of accents
2404     int32_t  count = (2 << (size - 1)) - 1;
2405     while (U_SUCCESS(*status) && count > 0) {
2406         UChar *rearrange = strsrch->canonicalPrefixAccents;
2407         // copy the base characters
2408         for (int k = 0; k < accentsindex[0]; k ++) {
2409             *rearrange ++ = accents[k];
2410         }
2411         // forming all possible canonical rearrangement by dropping
2412         // sets of accents
2413         for (int i = 0; i <= size - 1; i ++) {
2414             int32_t mask = 1 << (size - i - 1);
2415             if (count & mask) {
2416                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2417                     *rearrange ++ = accents[j];
2418                 }
2419             }
2420         }
2421         *rearrange = 0;
2422         int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
2423                                                           baseoffset, status);
2424         if (offset != USEARCH_DONE) {
2425             return TRUE; // match found
2426         }
2427         count --;
2428     }
2429     return FALSE;
2430 }
2431
2432 /**
2433 * Checks match for contraction.
2434 * If the match starts with a partial contraction we fail.
2435 * Internal method, status assumed to be success, caller has to check status
2436 * before calling this method.
2437 * @param strsrch string search data
2438 * @param start offset of potential match, to be modified if necessary
2439 * @param end offset of potential match, to be modified if necessary
2440 * @param status only error status if any
2441 * @return TRUE if match passes the contraction test, FALSE otherwise
2442 */
2443 static
2444 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
2445                                      int32_t   *start,
2446                                      int32_t   *end, UErrorCode  *status)
2447 {
2448           UCollationElements *coleiter   = strsrch->textIter;
2449           int32_t             textlength = strsrch->search->textLength;
2450           int32_t         temp       = *end;
2451     const UCollator          *collator   = strsrch->collator;
2452     const UChar              *text       = strsrch->search->text;
2453     // This part checks if either if the start of the match contains potential
2454     // contraction. If so we'll have to iterate through them
2455     // Since we used ucol_next while previously looking for the potential
2456     // match, this guarantees that our end will not be a partial contraction,
2457     // or a partial supplementary character.
2458     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2459         int32_t expansion  = getExpansionSuffix(coleiter);
2460         UBool   expandflag = expansion > 0;
2461         setColEIterOffset(coleiter, *end);
2462         while (expansion > 0) {
2463             // getting rid of the redundant ce
2464             // since forward contraction/expansion may have extra ces
2465             // if we are in the normalization buffer, hasAccentsBeforeMatch
2466             // would have taken care of it.
2467             // E.g. the character \u01FA will have an expansion of 3, but if
2468             // we are only looking for A ring A\u030A, we'll have to skip the
2469             // last ce in the expansion buffer
2470             ucol_previous(coleiter, status);
2471             if (U_FAILURE(*status)) {
2472                 return FALSE;
2473             }
2474             if (ucol_getOffset(coleiter) != temp) {
2475                 *end = temp;
2476                 temp  = ucol_getOffset(coleiter);
2477             }
2478             expansion --;
2479         }
2480
2481         int32_t  *patternce       = strsrch->pattern.ces;
2482         int32_t   patterncelength = strsrch->pattern.cesLength;
2483         int32_t   count           = patterncelength;
2484         while (count > 0) {
2485             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2486             // status checked below, note that if status is a failure
2487             // ucol_previous returns UCOL_NULLORDER
2488             if (ce == UCOL_IGNORABLE) {
2489                 continue;
2490             }
2491             if (expandflag && count == 0 &&
2492                 getColElemIterOffset(coleiter, FALSE) != temp) {
2493                 *end = temp;
2494                 temp  = ucol_getOffset(coleiter);
2495             }
2496             if (count == patterncelength &&
2497                 ce != patternce[patterncelength - 1]) {
2498                 // accents may have extra starting ces, this occurs when a
2499                 // pure accent pattern is matched without rearrangement
2500                 int32_t    expected = patternce[patterncelength - 1];
2501                 U16_BACK_1(text, 0, *end);
2502                 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
2503                     ce = getCE(strsrch, ucol_previous(coleiter, status));
2504                     while (U_SUCCESS(*status) && ce != expected &&
2505                            ce != UCOL_NULLORDER &&
2506                            ucol_getOffset(coleiter) <= *start) {
2507                         ce = getCE(strsrch, ucol_previous(coleiter, status));
2508                     }
2509                 }
2510             }
2511             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2512                 (*start) --;
2513                 *start = getPreviousBaseOffset(text, *start);
2514                 return FALSE;
2515             }
2516             count --;
2517         }
2518     }
2519     return TRUE;
2520 }
2521
2522 /**
2523 * Checks and sets the match information if found.
2524 * Checks
2525 * <ul>
2526 * <li> the potential match does not repeat the previous match
2527 * <li> boundaries are correct
2528 * <li> potential match does not end in the middle of a contraction
2529 * <li> identical matches
2530 * <\ul>
2531 * Otherwise the offset will be shifted to the next character.
2532 * Internal method, status assumed to be success, caller has to check status
2533 * before calling this method.
2534 * @param strsrch string search data
2535 * @param textoffset offset in the collation element text. the returned value
2536 *        will be the truncated start offset of the match or the new start
2537 *        search offset.
2538 * @param status only error status if any
2539 * @return TRUE if the match is valid, FALSE otherwise
2540 */
2541 static
2542 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
2543                                          int32_t   *textoffset,
2544                                          UErrorCode    *status)
2545 {
2546     // to ensure that the start and ends are not composite characters
2547     UCollationElements *coleiter = strsrch->textIter;
2548     // if we have a canonical accent match
2549     if ((strsrch->pattern.hasSuffixAccents &&
2550         strsrch->canonicalSuffixAccents[0]) ||
2551         (strsrch->pattern.hasPrefixAccents &&
2552         strsrch->canonicalPrefixAccents[0])) {
2553         strsrch->search->matchedIndex  = *textoffset;
2554         strsrch->search->matchedLength =
2555             getNextUStringSearchBaseOffset(strsrch,
2556                                       getColElemIterOffset(coleiter, FALSE))
2557             - *textoffset;
2558         return TRUE;
2559     }
2560
2561     int32_t end = ucol_getOffset(coleiter);
2562     if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
2563                                                 status) ||
2564          U_FAILURE(*status)) {
2565         return FALSE;
2566     }
2567
2568     end = getNextUStringSearchBaseOffset(strsrch, end);
2569     // this totally matches, however we need to check if it is repeating
2570     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2571         !isBreakUnit(strsrch, *textoffset, end) ||
2572         !checkIdentical(strsrch, *textoffset, end)) {
2573         (*textoffset) --;
2574         *textoffset = getPreviousBaseOffset(strsrch->search->text,
2575                                             *textoffset);
2576         return FALSE;
2577     }
2578
2579     strsrch->search->matchedIndex  = *textoffset;
2580     strsrch->search->matchedLength = end - *textoffset;
2581     return TRUE;
2582 }
2583 #endif // #if BOYER_MOORE
2584
2585 // constructors and destructor -------------------------------------------
2586
2587 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
2588                                           int32_t         patternlength,
2589                                     const UChar          *text,
2590                                           int32_t         textlength,
2591                                     const char           *locale,
2592                                           UBreakIterator *breakiter,
2593                                           UErrorCode     *status)
2594 {
2595     if (U_FAILURE(*status)) {
2596         return NULL;
2597     }
2598 #if UCONFIG_NO_BREAK_ITERATION
2599     if (breakiter != NULL) {
2600         *status = U_UNSUPPORTED_ERROR;
2601         return NULL;
2602     }
2603 #endif
2604     if (locale) {
2605         // ucol_open internally checks for status
2606         UCollator     *collator = ucol_open(locale, status);
2607         // pattern, text checks are done in usearch_openFromCollator
2608         UStringSearch *result   = usearch_openFromCollator(pattern,
2609                                               patternlength, text, textlength,
2610                                               collator, breakiter, status);
2611
2612         if (result == NULL || U_FAILURE(*status)) {
2613             if (collator) {
2614                 ucol_close(collator);
2615             }
2616             return NULL;
2617         }
2618         else {
2619             result->ownCollator = TRUE;
2620         }
2621         return result;
2622     }
2623     *status = U_ILLEGAL_ARGUMENT_ERROR;
2624     return NULL;
2625 }
2626
2627 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
2628                                   const UChar          *pattern,
2629                                         int32_t         patternlength,
2630                                   const UChar          *text,
2631                                         int32_t         textlength,
2632                                   const UCollator      *collator,
2633                                         UBreakIterator *breakiter,
2634                                         UErrorCode     *status)
2635 {
2636     if (U_FAILURE(*status)) {
2637         return NULL;
2638     }
2639 #if UCONFIG_NO_BREAK_ITERATION
2640     if (breakiter != NULL) {
2641         *status = U_UNSUPPORTED_ERROR;
2642         return NULL;
2643     }
2644 #endif
2645     if (pattern == NULL || text == NULL || collator == NULL) {
2646         *status = U_ILLEGAL_ARGUMENT_ERROR;
2647         return NULL;
2648     }
2649
2650     // string search does not really work when numeric collation is turned on
2651     if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
2652         *status = U_UNSUPPORTED_ERROR;
2653         return NULL;
2654     }
2655
2656     if (U_SUCCESS(*status)) {
2657         initializeFCD(status);
2658         if (U_FAILURE(*status)) {
2659             return NULL;
2660         }
2661
2662         UStringSearch *result;
2663         if (textlength == -1) {
2664             textlength = u_strlen(text);
2665         }
2666         if (patternlength == -1) {
2667             patternlength = u_strlen(pattern);
2668         }
2669         if (textlength <= 0 || patternlength <= 0) {
2670             *status = U_ILLEGAL_ARGUMENT_ERROR;
2671             return NULL;
2672         }
2673
2674         result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
2675         if (result == NULL) {
2676             *status = U_MEMORY_ALLOCATION_ERROR;
2677             return NULL;
2678         }
2679
2680         result->collator    = collator;
2681         result->strength    = ucol_getStrength(collator);
2682         result->ceMask      = getMask(result->strength);
2683         result->toShift     =
2684              ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2685                                                             UCOL_SHIFTED;
2686         result->variableTop = ucol_getVariableTop(collator, status);
2687
2688         result->nfd         = Normalizer2::getNFDInstance(*status);
2689
2690         if (U_FAILURE(*status)) {
2691             uprv_free(result);
2692             return NULL;
2693         }
2694
2695         result->search             = (USearch *)uprv_malloc(sizeof(USearch));
2696         if (result->search == NULL) {
2697             *status = U_MEMORY_ALLOCATION_ERROR;
2698             uprv_free(result);
2699             return NULL;
2700         }
2701
2702         result->search->text       = text;
2703         result->search->textLength = textlength;
2704
2705         result->pattern.text       = pattern;
2706         result->pattern.textLength = patternlength;
2707         result->pattern.ces         = NULL;
2708         result->pattern.pces        = NULL;
2709
2710         result->search->breakIter  = breakiter;
2711 #if !UCONFIG_NO_BREAK_ITERATION
2712         result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
2713         if (breakiter) {
2714             ubrk_setText(breakiter, text, textlength, status);
2715         }
2716 #endif
2717
2718         result->ownCollator           = FALSE;
2719         result->search->matchedLength = 0;
2720         result->search->matchedIndex  = USEARCH_DONE;
2721         result->utilIter              = NULL;
2722         result->textIter              = ucol_openElements(collator, text,
2723                                                           textlength, status);
2724         result->textProcessedIter     = NULL;
2725         if (U_FAILURE(*status)) {
2726             usearch_close(result);
2727             return NULL;
2728         }
2729
2730         result->search->isOverlap          = FALSE;
2731         result->search->isCanonicalMatch   = FALSE;
2732         result->search->elementComparisonType = 0;
2733         result->search->isForwardSearching = TRUE;
2734         result->search->reset              = TRUE;
2735
2736         initialize(result, status);
2737
2738         if (U_FAILURE(*status)) {
2739             usearch_close(result);
2740             return NULL;
2741         }
2742
2743         return result;
2744     }
2745     return NULL;
2746 }
2747
2748 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
2749 {
2750     if (strsrch) {
2751         if (strsrch->pattern.ces != strsrch->pattern.cesBuffer &&
2752             strsrch->pattern.ces) {
2753             uprv_free(strsrch->pattern.ces);
2754         }
2755
2756         if (strsrch->pattern.pces != NULL &&
2757             strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
2758             uprv_free(strsrch->pattern.pces);
2759         }
2760
2761         delete strsrch->textProcessedIter;
2762         ucol_closeElements(strsrch->textIter);
2763         ucol_closeElements(strsrch->utilIter);
2764
2765         if (strsrch->ownCollator && strsrch->collator) {
2766             ucol_close((UCollator *)strsrch->collator);
2767         }
2768
2769 #if !UCONFIG_NO_BREAK_ITERATION
2770         if (strsrch->search->internalBreakIter) {
2771             ubrk_close(strsrch->search->internalBreakIter);
2772         }
2773 #endif
2774
2775         uprv_free(strsrch->search);
2776         uprv_free(strsrch);
2777     }
2778 }
2779
2780 namespace {
2781
2782 UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) {
2783     if (U_FAILURE(*status)) { return FALSE; }
2784     if (strsrch->textProcessedIter == NULL) {
2785         strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter);
2786         if (strsrch->textProcessedIter == NULL) {
2787             *status = U_MEMORY_ALLOCATION_ERROR;
2788             return FALSE;
2789         }
2790     } else {
2791         strsrch->textProcessedIter->init(strsrch->textIter);
2792     }
2793     return TRUE;
2794 }
2795
2796 }
2797
2798 // set and get methods --------------------------------------------------
2799
2800 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
2801                                         int32_t    position,
2802                                         UErrorCode    *status)
2803 {
2804     if (U_SUCCESS(*status) && strsrch) {
2805         if (isOutOfBounds(strsrch->search->textLength, position)) {
2806             *status = U_INDEX_OUTOFBOUNDS_ERROR;
2807         }
2808         else {
2809             setColEIterOffset(strsrch->textIter, position);
2810         }
2811         strsrch->search->matchedIndex  = USEARCH_DONE;
2812         strsrch->search->matchedLength = 0;
2813         strsrch->search->reset         = FALSE;
2814     }
2815 }
2816
2817 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
2818 {
2819     if (strsrch) {
2820         int32_t result = ucol_getOffset(strsrch->textIter);
2821         if (isOutOfBounds(strsrch->search->textLength, result)) {
2822             return USEARCH_DONE;
2823         }
2824         return result;
2825     }
2826     return USEARCH_DONE;
2827 }
2828
2829 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
2830                                  USearchAttribute attribute,
2831                                  USearchAttributeValue value,
2832                                  UErrorCode *status)
2833 {
2834     if (U_SUCCESS(*status) && strsrch) {
2835         switch (attribute)
2836         {
2837         case USEARCH_OVERLAP :
2838             strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
2839             break;
2840         case USEARCH_CANONICAL_MATCH :
2841             strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
2842                                                                       FALSE);
2843             break;
2844         case USEARCH_ELEMENT_COMPARISON :
2845             if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2846                 strsrch->search->elementComparisonType = (int16_t)value;
2847             } else {
2848                 strsrch->search->elementComparisonType = 0;
2849             }
2850             break;
2851         case USEARCH_ATTRIBUTE_COUNT :
2852         default:
2853             *status = U_ILLEGAL_ARGUMENT_ERROR;
2854         }
2855     }
2856     if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
2857         *status = U_ILLEGAL_ARGUMENT_ERROR;
2858     }
2859 }
2860
2861 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
2862                                                 const UStringSearch *strsrch,
2863                                                 USearchAttribute attribute)
2864 {
2865     if (strsrch) {
2866         switch (attribute) {
2867         case USEARCH_OVERLAP :
2868             return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
2869                                                         USEARCH_OFF);
2870         case USEARCH_CANONICAL_MATCH :
2871             return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
2872                                                                USEARCH_OFF);
2873         case USEARCH_ELEMENT_COMPARISON :
2874             {
2875                 int16_t value = strsrch->search->elementComparisonType;
2876                 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2877                     return (USearchAttributeValue)value;
2878                 } else {
2879                     return USEARCH_STANDARD_ELEMENT_COMPARISON;
2880                 }
2881             }
2882         case USEARCH_ATTRIBUTE_COUNT :
2883             return USEARCH_DEFAULT;
2884         }
2885     }
2886     return USEARCH_DEFAULT;
2887 }
2888
2889 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
2890                                                 const UStringSearch *strsrch)
2891 {
2892     if (strsrch == NULL) {
2893         return USEARCH_DONE;
2894     }
2895     return strsrch->search->matchedIndex;
2896 }
2897
2898
2899 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
2900                                             UChar         *result,
2901                                             int32_t        resultCapacity,
2902                                             UErrorCode    *status)
2903 {
2904     if (U_FAILURE(*status)) {
2905         return USEARCH_DONE;
2906     }
2907     if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
2908         result == NULL)) {
2909         *status = U_ILLEGAL_ARGUMENT_ERROR;
2910         return USEARCH_DONE;
2911     }
2912
2913     int32_t     copylength = strsrch->search->matchedLength;
2914     int32_t copyindex  = strsrch->search->matchedIndex;
2915     if (copyindex == USEARCH_DONE) {
2916         u_terminateUChars(result, resultCapacity, 0, status);
2917         return USEARCH_DONE;
2918     }
2919
2920     if (resultCapacity < copylength) {
2921         copylength = resultCapacity;
2922     }
2923     if (copylength > 0) {
2924         uprv_memcpy(result, strsrch->search->text + copyindex,
2925                     copylength * sizeof(UChar));
2926     }
2927     return u_terminateUChars(result, resultCapacity,
2928                              strsrch->search->matchedLength, status);
2929 }
2930
2931 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
2932                                               const UStringSearch *strsrch)
2933 {
2934     if (strsrch) {
2935         return strsrch->search->matchedLength;
2936     }
2937     return USEARCH_DONE;
2938 }
2939
2940 #if !UCONFIG_NO_BREAK_ITERATION
2941
2942 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch  *strsrch,
2943                                                UBreakIterator *breakiter,
2944                                                UErrorCode     *status)
2945 {
2946     if (U_SUCCESS(*status) && strsrch) {
2947         strsrch->search->breakIter = breakiter;
2948         if (breakiter) {
2949             ubrk_setText(breakiter, strsrch->search->text,
2950                          strsrch->search->textLength, status);
2951         }
2952     }
2953 }
2954
2955 U_CAPI const UBreakIterator* U_EXPORT2
2956 usearch_getBreakIterator(const UStringSearch *strsrch)
2957 {
2958     if (strsrch) {
2959         return strsrch->search->breakIter;
2960     }
2961     return NULL;
2962 }
2963
2964 #endif
2965
2966 U_CAPI void U_EXPORT2 usearch_setText(      UStringSearch *strsrch,
2967                                       const UChar         *text,
2968                                             int32_t        textlength,
2969                                             UErrorCode    *status)
2970 {
2971     if (U_SUCCESS(*status)) {
2972         if (strsrch == NULL || text == NULL || textlength < -1 ||
2973             textlength == 0) {
2974             *status = U_ILLEGAL_ARGUMENT_ERROR;
2975         }
2976         else {
2977             if (textlength == -1) {
2978                 textlength = u_strlen(text);
2979             }
2980             strsrch->search->text       = text;
2981             strsrch->search->textLength = textlength;
2982             ucol_setText(strsrch->textIter, text, textlength, status);
2983             strsrch->search->matchedIndex  = USEARCH_DONE;
2984             strsrch->search->matchedLength = 0;
2985             strsrch->search->reset         = TRUE;
2986 #if !UCONFIG_NO_BREAK_ITERATION
2987             if (strsrch->search->breakIter != NULL) {
2988                 ubrk_setText(strsrch->search->breakIter, text,
2989                              textlength, status);
2990             }
2991             ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
2992 #endif
2993         }
2994     }
2995 }
2996
2997 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
2998                                                      int32_t       *length)
2999 {
3000     if (strsrch) {
3001         *length = strsrch->search->textLength;
3002         return strsrch->search->text;
3003     }
3004     return NULL;
3005 }
3006
3007 U_CAPI void U_EXPORT2 usearch_setCollator(      UStringSearch *strsrch,
3008                                           const UCollator     *collator,
3009                                                 UErrorCode    *status)
3010 {
3011     if (U_SUCCESS(*status)) {
3012         if (collator == NULL) {
3013             *status = U_ILLEGAL_ARGUMENT_ERROR;
3014             return;
3015         }
3016
3017         if (strsrch) {
3018             delete strsrch->textProcessedIter;
3019             strsrch->textProcessedIter = NULL;
3020             ucol_closeElements(strsrch->textIter);
3021             ucol_closeElements(strsrch->utilIter);
3022             strsrch->textIter = strsrch->utilIter = NULL;
3023             if (strsrch->ownCollator && (strsrch->collator != collator)) {
3024                 ucol_close((UCollator *)strsrch->collator);
3025                 strsrch->ownCollator = FALSE;
3026             }
3027             strsrch->collator    = collator;
3028             strsrch->strength    = ucol_getStrength(collator);
3029             strsrch->ceMask      = getMask(strsrch->strength);
3030 #if !UCONFIG_NO_BREAK_ITERATION
3031             ubrk_close(strsrch->search->internalBreakIter);
3032             strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
3033                                                      strsrch->search->text, strsrch->search->textLength, status);
3034 #endif
3035             // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3036             strsrch->toShift     =
3037                ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
3038                                                                 UCOL_SHIFTED;
3039             // if status is a failure, ucol_getVariableTop returns 0
3040             strsrch->variableTop = ucol_getVariableTop(collator, status);
3041             strsrch->textIter = ucol_openElements(collator,
3042                                       strsrch->search->text,
3043                                       strsrch->search->textLength,
3044                                       status);
3045             strsrch->utilIter = ucol_openElements(
3046                     collator, strsrch->pattern.text, strsrch->pattern.textLength, status);
3047             // initialize() _after_ setting the iterators for the new collator.
3048             initialize(strsrch, status);
3049         }
3050
3051         // **** are these calls needed?
3052         // **** we call uprv_init_pce in initializePatternPCETable
3053         // **** and the CEIBuffer constructor...
3054 #if 0
3055         uprv_init_pce(strsrch->textIter);
3056         uprv_init_pce(strsrch->utilIter);
3057 #endif
3058     }
3059 }
3060
3061 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
3062 {
3063     if (strsrch) {
3064         return (UCollator *)strsrch->collator;
3065     }
3066     return NULL;
3067 }
3068
3069 U_CAPI void U_EXPORT2 usearch_setPattern(      UStringSearch *strsrch,
3070                                          const UChar         *pattern,
3071                                                int32_t        patternlength,
3072                                                UErrorCode    *status)
3073 {
3074     if (U_SUCCESS(*status)) {
3075         if (strsrch == NULL || pattern == NULL) {
3076             *status = U_ILLEGAL_ARGUMENT_ERROR;
3077         }
3078         else {
3079             if (patternlength == -1) {
3080                 patternlength = u_strlen(pattern);
3081             }
3082             if (patternlength == 0) {
3083                 *status = U_ILLEGAL_ARGUMENT_ERROR;
3084                 return;
3085             }
3086             strsrch->pattern.text       = pattern;
3087             strsrch->pattern.textLength = patternlength;
3088             initialize(strsrch, status);
3089         }
3090     }
3091 }
3092
3093 U_CAPI const UChar* U_EXPORT2
3094 usearch_getPattern(const UStringSearch *strsrch,
3095                    int32_t       *length)
3096 {
3097     if (strsrch) {
3098         *length = strsrch->pattern.textLength;
3099         return strsrch->pattern.text;
3100     }
3101     return NULL;
3102 }
3103
3104 // miscellanous methods --------------------------------------------------
3105
3106 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
3107                                            UErrorCode    *status)
3108 {
3109     if (strsrch && U_SUCCESS(*status)) {
3110         strsrch->search->isForwardSearching = TRUE;
3111         usearch_setOffset(strsrch, 0, status);
3112         if (U_SUCCESS(*status)) {
3113             return usearch_next(strsrch, status);
3114         }
3115     }
3116     return USEARCH_DONE;
3117 }
3118
3119 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
3120                                                int32_t    position,
3121                                                UErrorCode    *status)
3122 {
3123     if (strsrch && U_SUCCESS(*status)) {
3124         strsrch->search->isForwardSearching = TRUE;
3125         // position checked in usearch_setOffset
3126         usearch_setOffset(strsrch, position, status);
3127         if (U_SUCCESS(*status)) {
3128             return usearch_next(strsrch, status);
3129         }
3130     }
3131     return USEARCH_DONE;
3132 }
3133
3134 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
3135                                           UErrorCode    *status)
3136 {
3137     if (strsrch && U_SUCCESS(*status)) {
3138         strsrch->search->isForwardSearching = FALSE;
3139         usearch_setOffset(strsrch, strsrch->search->textLength, status);
3140         if (U_SUCCESS(*status)) {
3141             return usearch_previous(strsrch, status);
3142         }
3143     }
3144     return USEARCH_DONE;
3145 }
3146
3147 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
3148                                                int32_t    position,
3149                                                UErrorCode    *status)
3150 {
3151     if (strsrch && U_SUCCESS(*status)) {
3152         strsrch->search->isForwardSearching = FALSE;
3153         // position checked in usearch_setOffset
3154         usearch_setOffset(strsrch, position, status);
3155         if (U_SUCCESS(*status)) {
3156             return usearch_previous(strsrch, status);
3157         }
3158     }
3159     return USEARCH_DONE;
3160 }
3161
3162 /**
3163 * If a direction switch is required, we'll count the number of ces till the
3164 * beginning of the collation element iterator and iterate forwards that
3165 * number of times. This is so that we get to the correct point within the
3166 * string to continue the search in. Imagine when we are in the middle of the
3167 * normalization buffer when the change in direction is request. arrrgghh....
3168 * After searching the offset within the collation element iterator will be
3169 * shifted to the start of the match. If a match is not found, the offset would
3170 * have been set to the end of the text string in the collation element
3171 * iterator.
3172 * Okay, here's my take on normalization buffer. The only time when there can
3173 * be 2 matches within the same normalization is when the pattern is consists
3174 * of all accents. But since the offset returned is from the text string, we
3175 * should not confuse the caller by returning the second match within the
3176 * same normalization buffer. If we do, the 2 results will have the same match
3177 * offsets, and that'll be confusing. I'll return the next match that doesn't
3178 * fall within the same normalization buffer. Note this does not affect the
3179 * results of matches spanning the text and the normalization buffer.
3180 * The position to start searching is taken from the collation element
3181 * iterator. Callers of this API would have to set the offset in the collation
3182 * element iterator before using this method.
3183 */
3184 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
3185                                           UErrorCode    *status)
3186 {
3187     if (U_SUCCESS(*status) && strsrch) {
3188         // note offset is either equivalent to the start of the previous match
3189         // or is set by the user
3190         int32_t      offset       = usearch_getOffset(strsrch);
3191         USearch     *search       = strsrch->search;
3192         search->reset             = FALSE;
3193         int32_t      textlength   = search->textLength;
3194         if (search->isForwardSearching) {
3195 #if BOYER_MOORE
3196             if (offset == textlength
3197                 || (!search->isOverlap &&
3198                     (offset + strsrch->pattern.defaultShiftSize > textlength ||
3199                     (search->matchedIndex != USEARCH_DONE &&
3200                      offset + search->matchedLength >= textlength)))) {
3201                 // not enough characters to match
3202                 setMatchNotFound(strsrch);
3203                 return USEARCH_DONE;
3204             }
3205 #else
3206             if (offset == textlength ||
3207                 (! search->isOverlap &&
3208                 (search->matchedIndex != USEARCH_DONE &&
3209                 offset + search->matchedLength > textlength))) {
3210                     // not enough characters to match
3211                     setMatchNotFound(strsrch);
3212                     return USEARCH_DONE;
3213             }
3214 #endif
3215         }
3216         else {
3217             // switching direction.
3218             // if matchedIndex == USEARCH_DONE, it means that either a
3219             // setOffset has been called or that previous ran off the text
3220             // string. the iterator would have been set to offset 0 if a
3221             // match is not found.
3222             search->isForwardSearching = TRUE;
3223             if (search->matchedIndex != USEARCH_DONE) {
3224                 // there's no need to set the collation element iterator
3225                 // the next call to next will set the offset.
3226                 return search->matchedIndex;
3227             }
3228         }
3229
3230         if (U_SUCCESS(*status)) {
3231             if (strsrch->pattern.cesLength == 0) {
3232                 if (search->matchedIndex == USEARCH_DONE) {
3233                     search->matchedIndex = offset;
3234                 }
3235                 else { // moves by codepoints
3236                     U16_FWD_1(search->text, search->matchedIndex, textlength);
3237                 }
3238
3239                 search->matchedLength = 0;
3240                 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3241                 // status checked below
3242                 if (search->matchedIndex == textlength) {
3243                     search->matchedIndex = USEARCH_DONE;
3244                 }
3245             }
3246             else {
3247                 if (search->matchedLength > 0) {
3248                     // if matchlength is 0 we are at the start of the iteration
3249                     if (search->isOverlap) {
3250                         ucol_setOffset(strsrch->textIter, offset + 1, status);
3251                     }
3252                     else {
3253                         ucol_setOffset(strsrch->textIter,
3254                                        offset + search->matchedLength, status);
3255                     }
3256                 }
3257                 else {
3258                     // for boundary check purposes. this will ensure that the
3259                     // next match will not preceed the current offset
3260                     // note search->matchedIndex will always be set to something
3261                     // in the code
3262                     search->matchedIndex = offset - 1;
3263                 }
3264
3265                 if (search->isCanonicalMatch) {
3266                     // can't use exact here since extra accents are allowed.
3267                     usearch_handleNextCanonical(strsrch, status);
3268                 }
3269                 else {
3270                     usearch_handleNextExact(strsrch, status);
3271                 }
3272             }
3273
3274             if (U_FAILURE(*status)) {
3275                 return USEARCH_DONE;
3276             }
3277
3278 #if !BOYER_MOORE
3279             if (search->matchedIndex == USEARCH_DONE) {
3280                 ucol_setOffset(strsrch->textIter, search->textLength, status);
3281             } else {
3282                 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
3283             }
3284 #endif
3285
3286             return search->matchedIndex;
3287         }
3288     }
3289     return USEARCH_DONE;
3290 }
3291
3292 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
3293                                               UErrorCode *status)
3294 {
3295     if (U_SUCCESS(*status) && strsrch) {
3296         int32_t offset;
3297         USearch *search = strsrch->search;
3298         if (search->reset) {
3299             offset                     = search->textLength;
3300             search->isForwardSearching = FALSE;
3301             search->reset              = FALSE;
3302             setColEIterOffset(strsrch->textIter, offset);
3303         }
3304         else {
3305             offset = usearch_getOffset(strsrch);
3306         }
3307
3308         int32_t matchedindex = search->matchedIndex;
3309         if (search->isForwardSearching == TRUE) {
3310             // switching direction.
3311             // if matchedIndex == USEARCH_DONE, it means that either a
3312             // setOffset has been called or that next ran off the text
3313             // string. the iterator would have been set to offset textLength if
3314             // a match is not found.
3315             search->isForwardSearching = FALSE;
3316             if (matchedindex != USEARCH_DONE) {
3317                 return matchedindex;
3318             }
3319         }
3320         else {
3321 #if BOYER_MOORE
3322             if (offset == 0 || matchedindex == 0 ||
3323                 (!search->isOverlap &&
3324                     (offset < strsrch->pattern.defaultShiftSize ||
3325                     (matchedindex != USEARCH_DONE &&
3326                     matchedindex < strsrch->pattern.defaultShiftSize)))) {
3327                 // not enough characters to match
3328                 setMatchNotFound(strsrch);
3329                 return USEARCH_DONE;
3330             }
3331 #else
3332             // Could check pattern length, but the
3333             // linear search will do the right thing
3334             if (offset == 0 || matchedindex == 0) {
3335                 setMatchNotFound(strsrch);
3336                 return USEARCH_DONE;
3337             }
3338 #endif
3339         }
3340
3341         if (U_SUCCESS(*status)) {
3342             if (strsrch->pattern.cesLength == 0) {
3343                 search->matchedIndex =
3344                       (matchedindex == USEARCH_DONE ? offset : matchedindex);
3345                 if (search->matchedIndex == 0) {
3346                     setMatchNotFound(strsrch);
3347                     // status checked below
3348                 }
3349                 else { // move by codepoints
3350                     U16_BACK_1(search->text, 0, search->matchedIndex);
3351                     setColEIterOffset(strsrch->textIter, search->matchedIndex);
3352                     // status checked below
3353                     search->matchedLength = 0;
3354                 }
3355             }
3356             else {
3357                 if (strsrch->search->isCanonicalMatch) {
3358                     // can't use exact here since extra accents are allowed.
3359                     usearch_handlePreviousCanonical(strsrch, status);
3360                     // status checked below
3361                 }
3362                 else {
3363                     usearch_handlePreviousExact(strsrch, status);
3364                     // status checked below
3365                 }
3366             }
3367
3368             if (U_FAILURE(*status)) {
3369                 return USEARCH_DONE;
3370             }
3371
3372             return search->matchedIndex;
3373         }
3374     }
3375     return USEARCH_DONE;
3376 }
3377
3378
3379
3380 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
3381 {
3382     /*
3383     reset is setting the attributes that are already in
3384     string search, hence all attributes in the collator should
3385     be retrieved without any problems
3386     */
3387     if (strsrch) {
3388         UErrorCode status            = U_ZERO_ERROR;
3389         UBool      sameCollAttribute = TRUE;
3390         uint32_t   ceMask;
3391         UBool      shift;
3392         uint32_t   varTop;
3393
3394         // **** hack to deal w/ how processed CEs encode quaternary ****
3395         UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
3396         if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
3397             (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
3398                 sameCollAttribute = FALSE;
3399         }
3400
3401         strsrch->strength    = ucol_getStrength(strsrch->collator);
3402         ceMask = getMask(strsrch->strength);
3403         if (strsrch->ceMask != ceMask) {
3404             strsrch->ceMask = ceMask;
3405             sameCollAttribute = FALSE;
3406         }
3407
3408         // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3409         shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
3410                                   &status) == UCOL_SHIFTED;
3411         if (strsrch->toShift != shift) {
3412             strsrch->toShift  = shift;
3413             sameCollAttribute = FALSE;
3414         }
3415
3416         // if status is a failure, ucol_getVariableTop returns 0
3417         varTop = ucol_getVariableTop(strsrch->collator, &status);
3418         if (strsrch->variableTop != varTop) {
3419             strsrch->variableTop = varTop;
3420             sameCollAttribute    = FALSE;
3421         }
3422         if (!sameCollAttribute) {
3423             initialize(strsrch, &status);
3424         }
3425         ucol_setText(strsrch->textIter, strsrch->search->text,
3426                               strsrch->search->textLength,
3427                               &status);
3428         strsrch->search->matchedLength      = 0;
3429         strsrch->search->matchedIndex       = USEARCH_DONE;
3430         strsrch->search->isOverlap          = FALSE;
3431         strsrch->search->isCanonicalMatch   = FALSE;
3432         strsrch->search->elementComparisonType = 0;
3433         strsrch->search->isForwardSearching = TRUE;
3434         strsrch->search->reset              = TRUE;
3435     }
3436 }
3437
3438 //
3439 //  CEI  Collation Element + source text index.
3440 //       These structs are kept in the circular buffer.
3441 //
3442 struct  CEI {
3443     int64_t ce;
3444     int32_t lowIndex;
3445     int32_t highIndex;
3446 };
3447
3448 U_NAMESPACE_BEGIN
3449
3450 namespace {
3451 //
3452 //  CEIBuffer   A circular buffer of CEs-with-index from the text being searched.
3453 //
3454 #define   DEFAULT_CEBUFFER_SIZE 96
3455 #define   CEBUFFER_EXTRA 32
3456 // Some typical max values to make buffer size more reasonable for asymmetric search.
3457 // #8694 is for a better long-term solution to allocation of this buffer.
3458 #define   MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
3459 #define   MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
3460 #define   MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
3461 struct CEIBuffer {
3462     CEI                  defBuf[DEFAULT_CEBUFFER_SIZE];
3463     CEI                 *buf;
3464     int32_t              bufSize;
3465     int32_t              firstIx;
3466     int32_t              limitIx;
3467     UCollationElements  *ceIter;
3468     UStringSearch       *strSearch;
3469
3470
3471
3472                CEIBuffer(UStringSearch *ss, UErrorCode *status);
3473                ~CEIBuffer();
3474    const CEI   *get(int32_t index);
3475    const CEI   *getPrevious(int32_t index);
3476 };
3477
3478
3479 CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) {
3480     buf = defBuf;
3481     strSearch = ss;
3482     bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA;
3483     if (ss->search->elementComparisonType != 0) {
3484         const UChar * patText = ss->pattern.text;
3485         if (patText) {
3486             const UChar * patTextLimit = patText + ss->pattern.textLength;
3487             while ( patText < patTextLimit ) {
3488                 UChar c = *patText++;
3489                 if (MIGHT_BE_JAMO_L(c)) {
3490                     bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
3491                 } else {
3492                     // No check for surrogates, we might allocate slightly more buffer than necessary.
3493                     bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
3494                 }
3495             }
3496         }
3497     }
3498     ceIter    = ss->textIter;
3499     firstIx = 0;
3500     limitIx = 0;
3501
3502     if (!initTextProcessedIter(ss, status)) { return; }
3503
3504     if (bufSize>DEFAULT_CEBUFFER_SIZE) {
3505         buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
3506         if (buf == NULL) {
3507             *status = U_MEMORY_ALLOCATION_ERROR;
3508         }
3509     }
3510 }
3511
3512 // TODO: add a reset or init function so that allocated
3513 //       buffers can be retained & reused.
3514
3515 CEIBuffer::~CEIBuffer() {
3516     if (buf != defBuf) {
3517         uprv_free(buf);
3518     }
3519 }
3520
3521
3522 // Get the CE with the specified index.
3523 //   Index must be in the range
3524 //          n-history_size < index < n+1
3525 //   where n is the largest index to have been fetched by some previous call to this function.
3526 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3527 //
3528 const CEI *CEIBuffer::get(int32_t index) {
3529     int i = index % bufSize;
3530
3531     if (index>=firstIx && index<limitIx) {
3532         // The request was for an entry already in our buffer.
3533         //  Just return it.
3534         return &buf[i];
3535     }
3536
3537     // Caller is requesting a new, never accessed before, CE.
3538     //   Verify that it is the next one in sequence, which is all
3539     //   that is allowed.
3540     if (index != limitIx) {
3541         U_ASSERT(FALSE);
3542
3543         return NULL;
3544     }
3545
3546     // Manage the circular CE buffer indexing
3547     limitIx++;
3548
3549     if (limitIx - firstIx >= bufSize) {
3550         // The buffer is full, knock out the lowest-indexed entry.
3551         firstIx++;
3552     }
3553
3554     UErrorCode status = U_ZERO_ERROR;
3555
3556     buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3557
3558     return &buf[i];
3559 }
3560
3561 // Get the CE with the specified index.
3562 //   Index must be in the range
3563 //          n-history_size < index < n+1
3564 //   where n is the largest index to have been fetched by some previous call to this function.
3565 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3566 //
3567 const CEI *CEIBuffer::getPrevious(int32_t index) {
3568     int i = index % bufSize;
3569
3570     if (index>=firstIx && index<limitIx) {
3571         // The request was for an entry already in our buffer.
3572         //  Just return it.
3573         return &buf[i];
3574     }
3575
3576     // Caller is requesting a new, never accessed before, CE.
3577     //   Verify that it is the next one in sequence, which is all
3578     //   that is allowed.
3579     if (index != limitIx) {
3580         U_ASSERT(FALSE);
3581
3582         return NULL;
3583     }
3584
3585     // Manage the circular CE buffer indexing
3586     limitIx++;
3587
3588     if (limitIx - firstIx >= bufSize) {
3589         // The buffer is full, knock out the lowest-indexed entry.
3590         firstIx++;
3591     }
3592
3593     UErrorCode status = U_ZERO_ERROR;
3594
3595     buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3596
3597     return &buf[i];
3598 }
3599
3600 }
3601
3602 U_NAMESPACE_END
3603
3604
3605 // #define USEARCH_DEBUG
3606
3607 #ifdef USEARCH_DEBUG
3608 #include <stdio.h>
3609 #include <stdlib.h>
3610 #endif
3611
3612 /*
3613  * Find the next break boundary after startIndex. If the UStringSearch object
3614  * has an external break iterator, use that. Otherwise use the internal character
3615  * break iterator.
3616  */
3617 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
3618 #if 0
3619     const UChar *text = strsrch->search->text;
3620     int32_t textLen   = strsrch->search->textLength;
3621
3622     U_ASSERT(startIndex>=0);
3623     U_ASSERT(startIndex<=textLen);
3624
3625     if (startIndex >= textLen) {
3626         return startIndex;
3627     }
3628
3629     UChar32  c;
3630     int32_t  i = startIndex;
3631     U16_NEXT(text, i, textLen, c);
3632
3633     // If we are on a control character, stop without looking for combining marks.
3634     //    Control characters do not combine.
3635     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3636     if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
3637         return i;
3638     }
3639
3640     // The initial character was not a control, and can thus accept trailing
3641     //   combining characters.  Advance over however many of them there are.
3642     int32_t  indexOfLastCharChecked;
3643     for (;;) {
3644         indexOfLastCharChecked = i;
3645         if (i>=textLen) {
3646             break;
3647         }
3648         U16_NEXT(text, i, textLen, c);
3649         gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3650         if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3651             break;
3652         }
3653     }
3654     return indexOfLastCharChecked;
3655 #elif !UCONFIG_NO_BREAK_ITERATION
3656     UBreakIterator *breakiterator = strsrch->search->breakIter;
3657
3658     if (breakiterator == NULL) {
3659         breakiterator = strsrch->search->internalBreakIter;
3660     }
3661
3662     if (breakiterator != NULL) {
3663         return ubrk_following(breakiterator, startIndex);
3664     }
3665
3666     return startIndex;
3667 #else
3668     // **** or should we use the original code? ****
3669     return startIndex;
3670 #endif
3671
3672 }
3673
3674 /*
3675  * Returns TRUE if index is on a break boundary. If the UStringSearch
3676  * has an external break iterator, test using that, otherwise test
3677  * using the internal character break iterator.
3678  */
3679 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
3680 #if 0
3681     const UChar *text = strsrch->search->text;
3682     int32_t textLen   = strsrch->search->textLength;
3683
3684     U_ASSERT(index>=0);
3685     U_ASSERT(index<=textLen);
3686
3687     if (index>=textLen || index<=0) {
3688         return TRUE;
3689     }
3690
3691     // If the character at the current index is not a GRAPHEME_EXTEND
3692     //    then we can not be within a combining sequence.
3693     UChar32  c;
3694     U16_GET(text, 0, index, textLen, c);
3695     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3696     if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3697         return TRUE;
3698     }
3699
3700     // We are at a combining mark.  If the preceding character is anything
3701     //   except a CONTROL, CR or LF, we are in a combining sequence.
3702     U16_PREV(text, 0, index, c);
3703     gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3704     UBool combining =  !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
3705     return !combining;
3706 #elif !UCONFIG_NO_BREAK_ITERATION
3707     UBreakIterator *breakiterator = strsrch->search->breakIter;
3708
3709     if (breakiterator == NULL) {
3710         breakiterator = strsrch->search->internalBreakIter;
3711     }
3712
3713     return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));
3714 #else
3715     // **** or use the original code? ****
3716     return TRUE;
3717 #endif
3718 }
3719
3720 #if 0
3721 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
3722 {
3723 #if !UCONFIG_NO_BREAK_ITERATION
3724     UBreakIterator *breakiterator = strsrch->search->breakIter;
3725
3726     if (breakiterator != NULL) {
3727         int32_t startindex = ubrk_first(breakiterator);
3728         int32_t endindex   = ubrk_last(breakiterator);
3729
3730         // out-of-range indexes are never boundary positions
3731         if (start < startindex || start > endindex ||
3732             end < startindex || end > endindex) {
3733             return FALSE;
3734         }
3735
3736         return ubrk_isBoundary(breakiterator, start) &&
3737                ubrk_isBoundary(breakiterator, end);
3738     }
3739 #endif
3740
3741     return TRUE;
3742 }
3743 #endif
3744
3745 typedef enum {
3746     U_CE_MATCH = -1,
3747     U_CE_NO_MATCH = 0,
3748     U_CE_SKIP_TARG,
3749     U_CE_SKIP_PATN
3750 } UCompareCEsResult;
3751 #define U_CE_LEVEL2_BASE 0x00000005
3752 #define U_CE_LEVEL3_BASE 0x00050000
3753
3754 static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
3755     if (targCE == patCE) {
3756         return U_CE_MATCH;
3757     }
3758     if (compareType == 0) {
3759         return U_CE_NO_MATCH;
3760     }
3761
3762     int64_t targCEshifted = targCE >> 32;
3763     int64_t patCEshifted = patCE >> 32;
3764     int64_t mask;
3765
3766     mask = 0xFFFF0000;
3767     int32_t targLev1 = (int32_t)(targCEshifted & mask);
3768     int32_t patLev1 = (int32_t)(patCEshifted & mask);
3769     if ( targLev1 != patLev1 ) {
3770         if ( targLev1 == 0 ) {
3771             return U_CE_SKIP_TARG;
3772         }
3773         if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3774             return U_CE_SKIP_PATN;
3775         }
3776         return U_CE_NO_MATCH;
3777     }
3778
3779     mask = 0x0000FFFF;
3780     int32_t targLev2 = (int32_t)(targCEshifted & mask);
3781     int32_t patLev2 = (int32_t)(patCEshifted & mask);
3782     if ( targLev2 != patLev2 ) {
3783         if ( targLev2 == 0 ) {
3784             return U_CE_SKIP_TARG;
3785         }
3786         if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3787             return U_CE_SKIP_PATN;
3788         }
3789         return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
3790             U_CE_MATCH: U_CE_NO_MATCH;
3791     }
3792
3793     mask = 0xFFFF0000;
3794     int32_t targLev3 = (int32_t)(targCE & mask);
3795     int32_t patLev3 = (int32_t)(patCE & mask);
3796     if ( targLev3 != patLev3 ) {
3797         return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
3798             U_CE_MATCH: U_CE_NO_MATCH;
3799    }
3800
3801     return U_CE_MATCH;
3802 }
3803
3804 #if BOYER_MOORE
3805 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
3806 #endif
3807
3808 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
3809                                        int32_t        startIdx,
3810                                        int32_t        *matchStart,
3811                                        int32_t        *matchLimit,
3812                                        UErrorCode     *status)
3813 {
3814     if (U_FAILURE(*status)) {
3815         return FALSE;
3816     }
3817
3818     // TODO:  reject search patterns beginning with a combining char.
3819
3820 #ifdef USEARCH_DEBUG
3821     if (getenv("USEARCH_DEBUG") != NULL) {
3822         printf("Pattern CEs\n");
3823         for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
3824             printf(" %8x", strsrch->pattern.ces[ii]);
3825         }
3826         printf("\n");
3827     }
3828
3829 #endif
3830     // Input parameter sanity check.
3831     //  TODO:  should input indicies clip to the text length
3832     //         in the same way that UText does.
3833     if(strsrch->pattern.cesLength == 0         ||
3834        startIdx < 0                           ||
3835        startIdx > strsrch->search->textLength ||
3836        strsrch->pattern.ces == NULL) {
3837            *status = U_ILLEGAL_ARGUMENT_ERROR;
3838            return FALSE;
3839     }
3840
3841     if (strsrch->pattern.pces == NULL) {
3842         initializePatternPCETable(strsrch, status);
3843     }
3844
3845     ucol_setOffset(strsrch->textIter, startIdx, status);
3846     CEIBuffer ceb(strsrch, status);
3847
3848
3849     int32_t    targetIx = 0;
3850     const CEI *targetCEI = NULL;
3851     int32_t    patIx;
3852     UBool      found;
3853
3854     int32_t  mStart = -1;
3855     int32_t  mLimit = -1;
3856     int32_t  minLimit;
3857     int32_t  maxLimit;
3858
3859
3860
3861     // Outer loop moves over match starting positions in the
3862     //      target CE space.
3863     // Here we see the target as a sequence of collation elements, resulting from the following:
3864     // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
3865     //    (for example, digraphs such as IJ may be broken into two characters).
3866     // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
3867     //    16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
3868     //    fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
3869     //    CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
3870     //    then the CE is deleted, so the following code sees only CEs that are relevant.
3871     // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
3872     // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
3873     // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
3874     //
3875     for(targetIx=0; ; targetIx++)
3876     {
3877         found = TRUE;
3878         //  Inner loop checks for a match beginning at each
3879         //  position from the outer loop.
3880         int32_t targetIxOffset = 0;
3881         int64_t patCE = 0;
3882         // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
3883         // (compared to the last CE fetched for the previous targetIx value) as we need to go
3884         // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
3885         const CEI *firstCEI = ceb.get(targetIx);
3886         if (firstCEI == NULL) {
3887             *status = U_INTERNAL_PROGRAM_ERROR;
3888             found = FALSE;
3889             break;
3890         }
3891
3892         for (patIx=0; patIx<strsrch->pattern.pcesLength; patIx++) {
3893             patCE = strsrch->pattern.pces[patIx];
3894             targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
3895             //  Compare CE from target string with CE from the pattern.
3896             //    Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
3897             //    which will fail the compare, below.
3898             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
3899             if ( ceMatch == U_CE_NO_MATCH ) {
3900                 found = FALSE;
3901                 break;
3902             } else if ( ceMatch > U_CE_NO_MATCH ) {
3903                 if ( ceMatch == U_CE_SKIP_TARG ) {
3904                     // redo with same patCE, next targCE
3905                     patIx--;
3906                     targetIxOffset++;
3907                 } else { // ceMatch == U_CE_SKIP_PATN
3908                     // redo with same targCE, next patCE
3909                     targetIxOffset--;
3910                 }
3911             }
3912         }
3913         targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far
3914
3915         if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
3916             // No match at this targetIx.  Try again at the next.
3917             continue;
3918         }
3919
3920         if (!found) {
3921             // No match at all, we have run off the end of the target text.
3922             break;
3923         }
3924
3925
3926         // We have found a match in CE space.
3927         // Now determine the bounds in string index space.
3928         //  There still is a chance of match failure if the CE range not correspond to
3929         //     an acceptable character range.
3930         //
3931         const CEI *lastCEI  = ceb.get(targetIx + targetIxOffset - 1);
3932
3933         mStart   = firstCEI->lowIndex;
3934         minLimit = lastCEI->lowIndex;
3935
3936         // Look at the CE following the match.  If it is UCOL_NULLORDER the match
3937         //   extended to the end of input, and the match is good.
3938
3939         // Look at the high and low indices of the CE following the match. If
3940         // they are the same it means one of two things:
3941         //    1. The match extended to the last CE from the target text, which is OK, or
3942         //    2. The last CE that was part of the match is in an expansion that extends
3943         //       to the first CE after the match. In this case, we reject the match.
3944         const CEI *nextCEI = 0;
3945         if (strsrch->search->elementComparisonType == 0) {
3946             nextCEI  = ceb.get(targetIx + targetIxOffset);
3947             maxLimit = nextCEI->lowIndex;
3948             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
3949                 found = FALSE;
3950             }
3951         } else {
3952             for ( ; ; ++targetIxOffset ) {
3953                 nextCEI = ceb.get(targetIx + targetIxOffset);
3954                 maxLimit = nextCEI->lowIndex;
3955                 // If we are at the end of the target too, match succeeds
3956                 if (  nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
3957                     break;
3958                 }
3959                 // As long as the next CE has primary weight of 0,
3960                 // it is part of the last target element matched by the pattern;
3961                 // make sure it can be part of a match with the last patCE
3962                 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
3963                     UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
3964                     if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {
3965                         found = FALSE;
3966                         break;
3967                     }
3968                 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
3969                 // target element, but it has non-zero primary weight => match fails
3970                 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
3971                     found = false;
3972                     break;
3973                 // Else the target CE is not part of an expansion of the last matched element, match succeeds
3974                 } else {
3975                     break;
3976                 }
3977             }
3978         }
3979
3980
3981         // Check for the start of the match being within a combining sequence.
3982         //   This can happen if the pattern itself begins with a combining char, and
3983         //   the match found combining marks in the target text that were attached
3984         //    to something else.
3985         //   This type of match should be rejected for not completely consuming a
3986         //   combining sequence.
3987         if (!isBreakBoundary(strsrch, mStart)) {
3988             found = FALSE;
3989         }
3990
3991         // Check for the start of the match being within an Collation Element Expansion,
3992         //   meaning that the first char of the match is only partially matched.
3993         //   With exapnsions, the first CE will report the index of the source
3994         //   character, and all subsequent (expansions) CEs will report the source index of the
3995         //    _following_ character.
3996         int32_t secondIx = firstCEI->highIndex;
3997         if (mStart == secondIx) {
3998             found = FALSE;
3999         }
4000
4001         //  Advance the match end position to the first acceptable match boundary.
4002         //    This advances the index over any combining charcters.
4003         mLimit = maxLimit;
4004         if (minLimit < maxLimit) {
4005             // When the last CE's low index is same with its high index, the CE is likely
4006             // a part of expansion. In this case, the index is located just after the
4007             // character corresponding to the CEs compared above. If the index is right
4008             // at the break boundary, move the position to the next boundary will result
4009             // incorrect match length when there are ignorable characters exist between
4010             // the position and the next character produces CE(s). See ticket#8482.
4011             if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {
4012                 mLimit = minLimit;
4013             } else {
4014                 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4015                 if (nba >= lastCEI->highIndex) {
4016                     mLimit = nba;
4017                 }
4018             }
4019         }
4020
4021     #ifdef USEARCH_DEBUG
4022         if (getenv("USEARCH_DEBUG") != NULL) {
4023             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4024         }
4025     #endif
4026
4027         // If default breakIter is being used, and next collation element belonging to this
4028         //    combining sequence has non-zero primary weight and corresponds to a separate
4029         //    character following the one at end of the current match, then do NOT require
4030         //    that match end position be on a breakIter boundary, or that end of the
4031         //    combining sequence not extend beyond the match in CE space. Only do those
4032         //    tests if the conditions above are not met. Added this to make prefix search
4033         //    work in Indic scripts per <rdar://problem/18063262>.
4034         UBool doLimitTests = !(strsrch->search->breakIter == NULL &&
4035                     nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4036                     nextCEI->lowIndex >= lastCEI->highIndex && nextCEI->highIndex > nextCEI->lowIndex);
4037
4038         if (doLimitTests) { // <rdar://problem/18063262>
4039             // If advancing to the end of a combining sequence in character indexing space
4040             //   advanced us beyond the end of the match in CE space, reject this match.
4041             if (mLimit > maxLimit) {
4042                 found = FALSE;
4043             }
4044
4045             if (!isBreakBoundary(strsrch, mLimit)) {
4046                 found = FALSE;
4047             }
4048         }
4049
4050         if (! checkIdentical(strsrch, mStart, mLimit)) {
4051             found = FALSE;
4052         }
4053
4054         if (found) {
4055             break;
4056         }
4057     }
4058
4059     #ifdef USEARCH_DEBUG
4060     if (getenv("USEARCH_DEBUG") != NULL) {
4061         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4062         int32_t  lastToPrint = ceb.limitIx+2;
4063         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4064             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4065         }
4066         printf("\n%s\n", found? "match found" : "no match");
4067     }
4068     #endif
4069
4070     // All Done.  Store back the match bounds to the caller.
4071     //
4072     if (found==FALSE) {
4073         mLimit = -1;
4074         mStart = -1;
4075     }
4076
4077     if (matchStart != NULL) {
4078         *matchStart= mStart;
4079     }
4080
4081     if (matchLimit != NULL) {
4082         *matchLimit = mLimit;
4083     }
4084
4085     return found;
4086 }
4087
4088 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch  *strsrch,
4089                                                 int32_t        startIdx,
4090                                                 int32_t        *matchStart,
4091                                                 int32_t        *matchLimit,
4092                                                 UErrorCode     *status)
4093 {
4094     if (U_FAILURE(*status)) {
4095         return FALSE;
4096     }
4097
4098     // TODO:  reject search patterns beginning with a combining char.
4099
4100 #ifdef USEARCH_DEBUG
4101     if (getenv("USEARCH_DEBUG") != NULL) {
4102         printf("Pattern CEs\n");
4103         for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
4104             printf(" %8x", strsrch->pattern.ces[ii]);
4105         }
4106         printf("\n");
4107     }
4108
4109 #endif
4110     // Input parameter sanity check.
4111     //  TODO:  should input indicies clip to the text length
4112     //         in the same way that UText does.
4113     if(strsrch->pattern.cesLength == 0         ||
4114        startIdx < 0                           ||
4115        startIdx > strsrch->search->textLength ||
4116        strsrch->pattern.ces == NULL) {
4117            *status = U_ILLEGAL_ARGUMENT_ERROR;
4118            return FALSE;
4119     }
4120
4121     if (strsrch->pattern.pces == NULL) {
4122         initializePatternPCETable(strsrch, status);
4123     }
4124
4125     CEIBuffer ceb(strsrch, status);
4126     int32_t    targetIx = 0;
4127
4128     /*
4129      * Pre-load the buffer with the CE's for the grapheme
4130      * after our starting position so that we're sure that
4131      * we can look at the CE following the match when we
4132      * check the match boundaries.
4133      *
4134      * This will also pre-fetch the first CE that we'll
4135      * consider for the match.
4136      */
4137     if (startIdx < strsrch->search->textLength) {
4138         UBreakIterator *bi = strsrch->search->internalBreakIter;
4139         int32_t next = ubrk_following(bi, startIdx);
4140
4141         ucol_setOffset(strsrch->textIter, next, status);
4142
4143         for (targetIx = 0; ; targetIx += 1) {
4144             if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
4145                 break;
4146             }
4147         }
4148     } else {
4149         ucol_setOffset(strsrch->textIter, startIdx, status);
4150     }
4151
4152
4153     const CEI *targetCEI = NULL;
4154     int32_t    patIx;
4155     UBool      found;
4156
4157     int32_t  limitIx = targetIx;
4158     int32_t  mStart = -1;
4159     int32_t  mLimit = -1;
4160     int32_t  minLimit;
4161     int32_t  maxLimit;
4162
4163
4164
4165     // Outer loop moves over match starting positions in the
4166     //      target CE space.
4167     // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
4168     // But  patIx is 0 at the beginning of the pattern and increases toward the end.
4169     // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
4170     // and the beginning of the base text.
4171     for(targetIx = limitIx; ; targetIx += 1)
4172     {
4173         found = TRUE;
4174         // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
4175         // (compared to the last CE fetched for the previous targetIx value) as we need to go
4176         // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
4177         const CEI *lastCEI  = ceb.getPrevious(targetIx);
4178         if (lastCEI == NULL) {
4179             *status = U_INTERNAL_PROGRAM_ERROR;
4180             found = FALSE;
4181              break;
4182         }
4183         //  Inner loop checks for a match beginning at each
4184         //  position from the outer loop.
4185         int32_t targetIxOffset = 0;
4186         for (patIx = strsrch->pattern.pcesLength - 1; patIx >= 0; patIx -= 1) {
4187             int64_t patCE = strsrch->pattern.pces[patIx];
4188
4189             targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 - patIx + targetIxOffset);
4190             //  Compare CE from target string with CE from the pattern.
4191             //    Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
4192             //    which will fail the compare, below.
4193             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
4194             if ( ceMatch == U_CE_NO_MATCH ) {
4195                 found = FALSE;
4196                 break;
4197             } else if ( ceMatch > U_CE_NO_MATCH ) {
4198                 if ( ceMatch == U_CE_SKIP_TARG ) {
4199                     // redo with same patCE, next targCE
4200                     patIx++;
4201                     targetIxOffset++;
4202                 } else { // ceMatch == U_CE_SKIP_PATN
4203                     // redo with same targCE, next patCE
4204                     targetIxOffset--;
4205                 }
4206             }
4207         }
4208
4209         if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
4210             // No match at this targetIx.  Try again at the next.
4211             continue;
4212         }
4213
4214         if (!found) {
4215             // No match at all, we have run off the end of the target text.
4216             break;
4217         }
4218
4219
4220         // We have found a match in CE space.
4221         // Now determine the bounds in string index space.
4222         //  There still is a chance of match failure if the CE range not correspond to
4223         //     an acceptable character range.
4224         //
4225         const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 + targetIxOffset);
4226         mStart   = firstCEI->lowIndex;
4227
4228         // Check for the start of the match being within a combining sequence.
4229         //   This can happen if the pattern itself begins with a combining char, and
4230         //   the match found combining marks in the target text that were attached
4231         //    to something else.
4232         //   This type of match should be rejected for not completely consuming a
4233         //   combining sequence.
4234         if (!isBreakBoundary(strsrch, mStart)) {
4235             found = FALSE;
4236         }
4237
4238         // Look at the high index of the first CE in the match. If it's the same as the
4239         // low index, the first CE in the match is in the middle of an expansion.
4240         if (mStart == firstCEI->highIndex) {
4241             found = FALSE;
4242         }
4243
4244
4245         minLimit = lastCEI->lowIndex;
4246
4247         if (targetIx > 0) {
4248             // Look at the CE following the match.  If it is UCOL_NULLORDER the match
4249             //   extended to the end of input, and the match is good.
4250
4251             // Look at the high and low indices of the CE following the match. If
4252             // they are the same it means one of two things:
4253             //    1. The match extended to the last CE from the target text, which is OK, or
4254             //    2. The last CE that was part of the match is in an expansion that extends
4255             //       to the first CE after the match. In this case, we reject the match.
4256             const CEI *nextCEI  = ceb.getPrevious(targetIx - 1);
4257
4258             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
4259                 found = FALSE;
4260             }
4261
4262             mLimit = maxLimit = nextCEI->lowIndex;
4263
4264             //  Advance the match end position to the first acceptable match boundary.
4265             //    This advances the index over any combining characters.
4266             if (minLimit < maxLimit) {
4267                 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4268
4269                 if (nba >= lastCEI->highIndex) {
4270                     mLimit = nba;
4271                 }
4272             }
4273
4274             // If default breakIter is being used, and next collation element belonging to this
4275             //    combining sequence has non-zero primary weight and corresponds to a separate
4276             //    character following the one at end of the current match, then do NOT require
4277             //    that match end position be on a breakIter boundary, or that end of the
4278             //    combining sequence not extend beyond the match in CE space. Only do those
4279             //    tests if the conditions above are not met. Added this to make prefix search
4280             //    work in Indic scripts per <rdar://problem/18063262>.
4281             UBool doLimitTests = !(strsrch->search->breakIter == NULL &&
4282                        nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4283                        nextCEI->lowIndex >= lastCEI->highIndex && nextCEI->highIndex > nextCEI->lowIndex);
4284
4285             if (doLimitTests) { // <rdar://problem/18063262>
4286                 // If advancing to the end of a combining sequence in character indexing space
4287                 //   advanced us beyond the end of the match in CE space, reject this match.
4288                 if (mLimit > maxLimit) {
4289                     found = FALSE;
4290                 }
4291
4292                 // Make sure the end of the match is on a break boundary
4293                 if (!isBreakBoundary(strsrch, mLimit)) {
4294                     found = FALSE;
4295                 }
4296             }
4297
4298         } else {
4299             // No non-ignorable CEs after this point.
4300             // The maximum position is detected by boundary after
4301             // the last non-ignorable CE. Combining sequence
4302             // across the start index will be truncated.
4303             int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4304             mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
4305         }
4306
4307     #ifdef USEARCH_DEBUG
4308         if (getenv("USEARCH_DEBUG") != NULL) {
4309             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4310         }
4311     #endif
4312
4313
4314         if (! checkIdentical(strsrch, mStart, mLimit)) {
4315             found = FALSE;
4316         }
4317
4318         if (found) {
4319             break;
4320         }
4321     }
4322
4323     #ifdef USEARCH_DEBUG
4324     if (getenv("USEARCH_DEBUG") != NULL) {
4325         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4326         int32_t  lastToPrint = ceb.limitIx+2;
4327         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4328             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4329         }
4330         printf("\n%s\n", found? "match found" : "no match");
4331     }
4332     #endif
4333
4334     // All Done.  Store back the match bounds to the caller.
4335     //
4336     if (found==FALSE) {
4337         mLimit = -1;
4338         mStart = -1;
4339     }
4340
4341     if (matchStart != NULL) {
4342         *matchStart= mStart;
4343     }
4344
4345     if (matchLimit != NULL) {
4346         *matchLimit = mLimit;
4347     }
4348
4349     return found;
4350 }
4351
4352 // internal use methods declared in usrchimp.h -----------------------------
4353
4354 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
4355 {
4356     if (U_FAILURE(*status)) {
4357         setMatchNotFound(strsrch);
4358         return FALSE;
4359     }
4360
4361 #if BOYER_MOORE
4362     UCollationElements *coleiter        = strsrch->textIter;
4363     int32_t             textlength      = strsrch->search->textLength;
4364     int32_t            *patternce       = strsrch->pattern.ces;
4365     int32_t             patterncelength = strsrch->pattern.cesLength;
4366     int32_t             textoffset      = ucol_getOffset(coleiter);
4367
4368     // status used in setting coleiter offset, since offset is checked in
4369     // shiftForward before setting the coleiter offset, status never
4370     // a failure
4371     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4372                               patterncelength);
4373     while (textoffset <= textlength)
4374     {
4375         uint32_t    patternceindex = patterncelength - 1;
4376         int32_t     targetce;
4377         UBool       found          = FALSE;
4378         int32_t    lastce          = UCOL_NULLORDER;
4379
4380         setColEIterOffset(coleiter, textoffset);
4381
4382         for (;;) {
4383             // finding the last pattern ce match, imagine composite characters
4384             // for example: search for pattern A in text \u00C0
4385             // we'll have to skip \u0300 the grave first before we get to A
4386             targetce = ucol_previous(coleiter, status);
4387             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4388                 found = FALSE;
4389                 break;
4390             }
4391             targetce = getCE(strsrch, targetce);
4392             if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
4393                 // this is for the text \u0315\u0300 that requires
4394                 // normalization and pattern \u0300, where \u0315 is ignorable
4395                 continue;
4396             }
4397             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4398                 lastce = targetce;
4399             }
4400             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4401             if (targetce == patternce[patternceindex]) {
4402                 // the first ce can be a contraction
4403                 found = TRUE;
4404                 break;
4405             }
4406             if (!hasExpansion(coleiter)) {
4407                 found = FALSE;
4408                 break;
4409             }
4410         }
4411
4412         //targetce = lastce;
4413
4414         while (found && patternceindex > 0) {
4415             lastce = targetce;
4416             targetce    = ucol_previous(coleiter, status);
4417             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4418                 found = FALSE;
4419                 break;
4420             }
4421             targetce    = getCE(strsrch, targetce);
4422             if (targetce == UCOL_IGNORABLE) {
4423                 continue;
4424             }
4425
4426             patternceindex --;
4427             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4428             found = found && targetce == patternce[patternceindex];
4429         }
4430
4431         targetce = lastce;
4432
4433         if (!found) {
4434             if (U_FAILURE(*status)) {
4435                 break;
4436             }
4437             textoffset = shiftForward(strsrch, textoffset, lastce,
4438                                       patternceindex);
4439             // status checked at loop.
4440             patternceindex = patterncelength;
4441             continue;
4442         }
4443
4444         if (checkNextExactMatch(strsrch, &textoffset, status)) {
4445             // status checked in ucol_setOffset
4446             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4447             return TRUE;
4448         }
4449     }
4450     setMatchNotFound(strsrch);
4451     return FALSE;
4452 #else
4453     int32_t textOffset = ucol_getOffset(strsrch->textIter);
4454     int32_t start = -1;
4455     int32_t end = -1;
4456
4457     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4458         strsrch->search->matchedIndex  = start;
4459         strsrch->search->matchedLength = end - start;
4460         return TRUE;
4461     } else {
4462         setMatchNotFound(strsrch);
4463         return FALSE;
4464     }
4465 #endif
4466 }
4467
4468 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
4469 {
4470     if (U_FAILURE(*status)) {
4471         setMatchNotFound(strsrch);
4472         return FALSE;
4473     }
4474
4475 #if BOYER_MOORE
4476     UCollationElements *coleiter        = strsrch->textIter;
4477     int32_t             textlength      = strsrch->search->textLength;
4478     int32_t            *patternce       = strsrch->pattern.ces;
4479     int32_t             patterncelength = strsrch->pattern.cesLength;
4480     int32_t             textoffset      = ucol_getOffset(coleiter);
4481     UBool               hasPatternAccents =
4482        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4483
4484     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4485                               patterncelength);
4486     strsrch->canonicalPrefixAccents[0] = 0;
4487     strsrch->canonicalSuffixAccents[0] = 0;
4488
4489     while (textoffset <= textlength)
4490     {
4491         int32_t     patternceindex = patterncelength - 1;
4492         int32_t     targetce;
4493         UBool       found          = FALSE;
4494         int32_t     lastce         = UCOL_NULLORDER;
4495
4496         setColEIterOffset(coleiter, textoffset);
4497
4498         for (;;) {
4499             // finding the last pattern ce match, imagine composite characters
4500             // for example: search for pattern A in text \u00C0
4501             // we'll have to skip \u0300 the grave first before we get to A
4502             targetce = ucol_previous(coleiter, status);
4503             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4504                 found = FALSE;
4505                 break;
4506             }
4507             targetce = getCE(strsrch, targetce);
4508             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4509                 lastce = targetce;
4510             }
4511             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4512             if (targetce == patternce[patternceindex]) {
4513                 // the first ce can be a contraction
4514                 found = TRUE;
4515                 break;
4516             }
4517             if (!hasExpansion(coleiter)) {
4518                 found = FALSE;
4519                 break;
4520             }
4521         }
4522
4523         while (found && patternceindex > 0) {
4524             targetce    = ucol_previous(coleiter, status);
4525             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4526                 found = FALSE;
4527                 break;
4528             }
4529             targetce    = getCE(strsrch, targetce);
4530             if (targetce == UCOL_IGNORABLE) {
4531                 continue;
4532             }
4533
4534             patternceindex --;
4535             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4536             found = found && targetce == patternce[patternceindex];
4537         }
4538
4539         // initializing the rearranged accent array
4540         if (hasPatternAccents && !found) {
4541             strsrch->canonicalPrefixAccents[0] = 0;
4542             strsrch->canonicalSuffixAccents[0] = 0;
4543             if (U_FAILURE(*status)) {
4544                 break;
4545             }
4546             found = doNextCanonicalMatch(strsrch, textoffset, status);
4547         }
4548
4549         if (!found) {
4550             if (U_FAILURE(*status)) {
4551                 break;
4552             }
4553             textoffset = shiftForward(strsrch, textoffset, lastce,
4554                                       patternceindex);
4555             // status checked at loop
4556             patternceindex = patterncelength;
4557             continue;
4558         }
4559
4560         if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
4561             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4562             return TRUE;
4563         }
4564     }
4565     setMatchNotFound(strsrch);
4566     return FALSE;
4567 #else
4568     int32_t textOffset = ucol_getOffset(strsrch->textIter);
4569     int32_t start = -1;
4570     int32_t end = -1;
4571
4572     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4573         strsrch->search->matchedIndex  = start;
4574         strsrch->search->matchedLength = end - start;
4575         return TRUE;
4576     } else {
4577         setMatchNotFound(strsrch);
4578         return FALSE;
4579     }
4580 #endif
4581 }
4582
4583 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
4584 {
4585     if (U_FAILURE(*status)) {
4586         setMatchNotFound(strsrch);
4587         return FALSE;
4588     }
4589
4590 #if BOYER_MOORE
4591     UCollationElements *coleiter        = strsrch->textIter;
4592     int32_t            *patternce       = strsrch->pattern.ces;
4593     int32_t             patterncelength = strsrch->pattern.cesLength;
4594     int32_t             textoffset      = ucol_getOffset(coleiter);
4595
4596     // shifting it check for setting offset
4597     // if setOffset is called previously or there was no previous match, we
4598     // leave the offset as it is.
4599     if (strsrch->search->matchedIndex != USEARCH_DONE) {
4600         textoffset = strsrch->search->matchedIndex;
4601     }
4602
4603     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4604                               patterncelength);
4605
4606     while (textoffset >= 0)
4607     {
4608         int32_t     patternceindex = 1;
4609         int32_t     targetce;
4610         UBool       found          = FALSE;
4611         int32_t     firstce        = UCOL_NULLORDER;
4612
4613         // if status is a failure, ucol_setOffset does nothing
4614         setColEIterOffset(coleiter, textoffset);
4615
4616         for (;;) {
4617             // finding the first pattern ce match, imagine composite
4618             // characters. for example: search for pattern \u0300 in text
4619             // \u00C0, we'll have to skip A first before we get to
4620             // \u0300 the grave accent
4621             targetce = ucol_next(coleiter, status);
4622             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4623                 found = FALSE;
4624                 break;
4625             }
4626             targetce = getCE(strsrch, targetce);
4627             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4628                 firstce = targetce;
4629             }
4630             if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
4631                 continue;
4632             }
4633             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4634             if (targetce == patternce[0]) {
4635                 found = TRUE;
4636                 break;
4637             }
4638             if (!hasExpansion(coleiter)) {
4639                 // checking for accents in composite character
4640                 found = FALSE;
4641                 break;
4642             }
4643         }
4644
4645         //targetce = firstce;
4646
4647         while (found && (patternceindex < patterncelength)) {
4648             firstce = targetce;
4649             targetce    = ucol_next(coleiter, status);
4650             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4651                 found = FALSE;
4652                 break;
4653             }
4654             targetce    = getCE(strsrch, targetce);
4655             if (targetce == UCOL_IGNORABLE) {
4656                 continue;
4657             }
4658
4659             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4660             found = found && targetce == patternce[patternceindex];
4661             patternceindex ++;
4662         }
4663
4664         targetce = firstce;
4665
4666         if (!found) {
4667             if (U_FAILURE(*status)) {
4668                 break;
4669             }
4670
4671             textoffset = reverseShift(strsrch, textoffset, targetce,
4672                                       patternceindex);
4673             patternceindex = 0;
4674             continue;
4675         }
4676
4677         if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
4678             setColEIterOffset(coleiter, textoffset);
4679             return TRUE;
4680         }
4681     }
4682     setMatchNotFound(strsrch);
4683     return FALSE;
4684 #else
4685     int32_t textOffset;
4686
4687     if (strsrch->search->isOverlap) {
4688         if (strsrch->search->matchedIndex != USEARCH_DONE) {
4689             textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4690         } else {
4691             // move the start position at the end of possible match
4692             initializePatternPCETable(strsrch, status);
4693             if (!initTextProcessedIter(strsrch, status)) {
4694                 setMatchNotFound(strsrch);
4695                 return FALSE;
4696             }
4697             for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
4698                 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4699                 if (pce == UCOL_PROCESSED_NULLORDER) {
4700                     // at the end of the text
4701                     break;
4702                 }
4703             }
4704             if (U_FAILURE(*status)) {
4705                 setMatchNotFound(strsrch);
4706                 return FALSE;
4707             }
4708             textOffset = ucol_getOffset(strsrch->textIter);
4709         }
4710     } else {
4711         textOffset = ucol_getOffset(strsrch->textIter);
4712     }
4713
4714     int32_t start = -1;
4715     int32_t end = -1;
4716
4717     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4718         strsrch->search->matchedIndex = start;
4719         strsrch->search->matchedLength = end - start;
4720         return TRUE;
4721     } else {
4722         setMatchNotFound(strsrch);
4723         return FALSE;
4724     }
4725 #endif
4726 }
4727
4728 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
4729                                       UErrorCode    *status)
4730 {
4731     if (U_FAILURE(*status)) {
4732         setMatchNotFound(strsrch);
4733         return FALSE;
4734     }
4735
4736 #if BOYER_MOORE
4737     UCollationElements *coleiter        = strsrch->textIter;
4738     int32_t            *patternce       = strsrch->pattern.ces;
4739     int32_t             patterncelength = strsrch->pattern.cesLength;
4740     int32_t             textoffset      = ucol_getOffset(coleiter);
4741     UBool               hasPatternAccents =
4742        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4743
4744     // shifting it check for setting offset
4745     // if setOffset is called previously or there was no previous match, we
4746     // leave the offset as it is.
4747     if (strsrch->search->matchedIndex != USEARCH_DONE) {
4748         textoffset = strsrch->search->matchedIndex;
4749     }
4750
4751     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4752                               patterncelength);
4753     strsrch->canonicalPrefixAccents[0] = 0;
4754     strsrch->canonicalSuffixAccents[0] = 0;
4755
4756     while (textoffset >= 0)
4757     {
4758         int32_t     patternceindex = 1;
4759         int32_t     targetce;
4760         UBool       found          = FALSE;
4761         int32_t     firstce        = UCOL_NULLORDER;
4762
4763         setColEIterOffset(coleiter, textoffset);
4764         for (;;) {
4765             // finding the first pattern ce match, imagine composite
4766             // characters. for example: search for pattern \u0300 in text
4767             // \u00C0, we'll have to skip A first before we get to
4768             // \u0300 the grave accent
4769             targetce = ucol_next(coleiter, status);
4770             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4771                 found = FALSE;
4772                 break;
4773             }
4774             targetce = getCE(strsrch, targetce);
4775             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4776                 firstce = targetce;
4777             }
4778
4779             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4780             if (targetce == patternce[0]) {
4781                 // the first ce can be a contraction
4782                 found = TRUE;
4783                 break;
4784             }
4785             if (!hasExpansion(coleiter)) {
4786                 // checking for accents in composite character
4787                 found = FALSE;
4788                 break;
4789             }
4790         }
4791
4792         targetce = firstce;
4793
4794         while (found && patternceindex < patterncelength) {
4795             targetce    = ucol_next(coleiter, status);
4796             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4797                 found = FALSE;
4798                 break;
4799             }
4800             targetce = getCE(strsrch, targetce);
4801             if (targetce == UCOL_IGNORABLE) {
4802                 continue;
4803             }
4804
4805             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4806             found = found && targetce == patternce[patternceindex];
4807             patternceindex ++;
4808         }
4809
4810         // initializing the rearranged accent array
4811         if (hasPatternAccents && !found) {
4812             strsrch->canonicalPrefixAccents[0] = 0;
4813             strsrch->canonicalSuffixAccents[0] = 0;
4814             if (U_FAILURE(*status)) {
4815                 break;
4816             }
4817             found = doPreviousCanonicalMatch(strsrch, textoffset, status);
4818         }
4819
4820         if (!found) {
4821             if (U_FAILURE(*status)) {
4822                 break;
4823             }
4824             textoffset = reverseShift(strsrch, textoffset, targetce,
4825                                       patternceindex);
4826             patternceindex = 0;
4827             continue;
4828         }
4829
4830         if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
4831             setColEIterOffset(coleiter, textoffset);
4832             return TRUE;
4833         }
4834     }
4835     setMatchNotFound(strsrch);
4836     return FALSE;
4837 #else
4838     int32_t textOffset;
4839
4840     if (strsrch->search->isOverlap) {
4841         if (strsrch->search->matchedIndex != USEARCH_DONE) {
4842             textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4843         } else {
4844             // move the start position at the end of possible match
4845             initializePatternPCETable(strsrch, status);
4846             if (!initTextProcessedIter(strsrch, status)) {
4847                 setMatchNotFound(strsrch);
4848                 return FALSE;
4849             }
4850             for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
4851                 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4852                 if (pce == UCOL_PROCESSED_NULLORDER) {
4853                     // at the end of the text
4854                     break;
4855                 }
4856             }
4857             if (U_FAILURE(*status)) {
4858                 setMatchNotFound(strsrch);
4859                 return FALSE;
4860             }
4861             textOffset = ucol_getOffset(strsrch->textIter);
4862         }
4863     } else {
4864         textOffset = ucol_getOffset(strsrch->textIter);
4865     }
4866
4867     int32_t start = -1;
4868     int32_t end = -1;
4869
4870     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4871         strsrch->search->matchedIndex = start;
4872         strsrch->search->matchedLength = end - start;
4873         return TRUE;
4874     } else {
4875         setMatchNotFound(strsrch);
4876         return FALSE;
4877     }
4878 #endif
4879 }
4880
4881 #endif /* #if !UCONFIG_NO_COLLATION */