icuSources/i18n/usearch.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2001-2006 IBM and others. All rights reserved.
   4 **********************************************************************
   5 *   Date        Name        Description
   6 *  07/02/2001   synwee      Creation.
   7 **********************************************************************
   8 */
   9
  10 #include "unicode/utypes.h"
  11
  12 #if !UCONFIG_NO_COLLATION
  13
  14 #include "unicode/usearch.h"
  15 #include "unicode/ustring.h"
  16 #include "unicode/uchar.h"
  17 #include "unormimp.h"
  18 #include "ucol_imp.h"
  19 #include "usrchimp.h"
  20 #include "cmemory.h"
  21 #include "ucln_in.h"
  22
  23 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
  24
  25 // internal definition ---------------------------------------------------
  26
  27 #define LAST_BYTE_MASK_          0xFF
  28 #define SECOND_LAST_BYTE_SHIFT_  8
  29 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
  30
  31 static const uint16_t *FCD_ = NULL;
  32
  33 // internal methods -------------------------------------------------
  34
  35 /**
  36 * Fast collation element iterator setOffset.
  37 * This function does not check for bounds.
  38 * @param coleiter collation element iterator
  39 * @param offset to set
  40 */
  41 static
  42 inline void setColEIterOffset(UCollationElements *elems,
  43                       int32_t             offset)
  44 {
  45     collIterate *ci = &(elems->iteratordata_);
  46     ci->pos         = ci->string + offset;
  47     ci->CEpos       = ci->toReturn = ci->CEs;
  48     if (ci->flags & UCOL_ITER_INNORMBUF) {
  49         ci->flags = ci->origFlags;
  50     }
  51     ci->fcdPosition = NULL;
  52 }
  53
  54 /**
  55 * Getting the mask for collation strength
  56 * @param strength collation strength
  57 * @return collation element mask
  58 */
  59 static
  60 inline uint32_t getMask(UCollationStrength strength)
  61 {
  62     switch (strength)
  63     {
  64     case UCOL_PRIMARY:
  65         return UCOL_PRIMARYORDERMASK;
  66     case UCOL_SECONDARY:
  67         return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
  68     default:
  69         return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
  70                UCOL_PRIMARYORDERMASK;
  71     }
  72 }
  73
  74 /**
  75 * This is to squeeze the 21bit ces into a 256 table
  76 * @param ce collation element
  77 * @return collapsed version of the collation element
  78 */
  79 static
  80 inline int hash(uint32_t ce)
  81 {
  82     // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
  83     // well with the new collation where most of the latin 1 characters
  84     // are of the value xx000xxx. their hashes will most of the time be 0
  85     // to be discussed on the hash algo.
  86     return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_;
  87 }
  88
  89 U_CDECL_BEGIN
  90 static UBool U_CALLCONV
  91 usearch_cleanup(void) {
  92     FCD_ = NULL;
  93     return TRUE;
  94 }
  95 U_CDECL_END
  96
  97 /**
  98 * Initializing the fcd tables.
  99 * Internal method, status assumed to be a success.
 100 * @param status output error if any, caller to check status before calling
 101 *               method, status assumed to be success when passed in.
 102 */
 103 static
 104 inline void initializeFCD(UErrorCode *status)
 105 {
 106     if (FCD_ == NULL) {
 107         FCD_ = unorm_getFCDTrie(status);
 108         ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
 109     }
 110 }
 111
 112 /**
 113 * Gets the fcd value for a character at the argument index.
 114 * This method takes into accounts of the supplementary characters.
 115 * @param str UTF16 string where character for fcd retrieval resides
 116 * @param offset position of the character whose fcd is to be retrieved, to be
 117 *               overwritten with the next character position, taking
 118 *               surrogate characters into consideration.
 119 * @param strlength length of the argument string
 120 * @return fcd value
 121 */
 122 static
 123 uint16_t getFCD(const UChar   *str, int32_t *offset,
 124                              int32_t  strlength)
 125 {
 126     int32_t temp = *offset;
 127     uint16_t    result;
 128     UChar       ch   = str[temp];
 129     result = unorm_getFCD16(FCD_, ch);
 130     temp ++;
 131
 132     if (result && temp != strlength && UTF_IS_FIRST_SURROGATE(ch)) {
 133         ch = str[temp];
 134         if (UTF_IS_SECOND_SURROGATE(ch)) {
 135             result = unorm_getFCD16FromSurrogatePair(FCD_, result, ch);
 136             temp ++;
 137         } else {
 138             result = 0;
 139         }
 140     }
 141     *offset = temp;
 142     return result;
 143 }
 144
 145 /**
 146 * Getting the modified collation elements taking into account the collation
 147 * attributes
 148 * @param strsrch string search data
 149 * @param sourcece
 150 * @return the modified collation element
 151 */
 152 static
 153 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
 154 {
 155     // note for tertiary we can't use the collator->tertiaryMask, that
 156     // is a preprocessed mask that takes into account case options. since
 157     // we are only concerned with exact matches, we don't need that.
 158     sourcece &= strsrch->ceMask;
 159
 160     if (strsrch->toShift) {
 161         // alternate handling here, since only the 16 most significant digits
 162         // is only used, we can safely do a compare without masking
 163         // if the ce is a variable, we mask and get only the primary values
 164         // no shifting to quartenary is required since all primary values
 165         // less than variabletop will need to be masked off anyway.
 166         if (strsrch->variableTop > sourcece) {
 167             if (strsrch->strength == UCOL_QUATERNARY) {
 168                 sourcece &= UCOL_PRIMARYORDERMASK;
 169             }
 170             else {
 171                 sourcece = UCOL_IGNORABLE;
 172             }
 173         }
 174     }
 175
 176     return sourcece;
 177 }
 178
 179 /**
 180 * Allocate a memory and returns NULL if it failed.
 181 * Internal method, status assumed to be a success.
 182 * @param size to allocate
 183 * @param status output error if any, caller to check status before calling
 184 *               method, status assumed to be success when passed in.
 185 * @return newly allocated array, NULL otherwise
 186 */
 187 static
 188 inline void * allocateMemory(uint32_t size, UErrorCode *status)
 189 {
 190     uint32_t *result = (uint32_t *)uprv_malloc(size);
 191     if (result == NULL) {
 192         *status = U_MEMORY_ALLOCATION_ERROR;
 193     }
 194     return result;
 195 }
 196
 197 /**
 198 * Adds a uint32_t value to a destination array.
 199 * Creates a new array if we run out of space. The caller will have to
 200 * manually deallocate the newly allocated array.
 201 * Internal method, status assumed to be success, caller has to check status
 202 * before calling this method. destination not to be NULL and has at least
 203 * size destinationlength.
 204 * @param destination target array
 205 * @param offset destination offset to add value
 206 * @param destinationlength target array size, return value for the new size
 207 * @param value to be added
 208 * @param increments incremental size expected
 209 * @param status output error if any, caller to check status before calling
 210 *               method, status assumed to be success when passed in.
 211 * @return new destination array, destination if there was no new allocation
 212 */
 213 static
 214 inline int32_t * addTouint32_tArray(int32_t    *destination,
 215                                     uint32_t    offset,
 216                                     uint32_t   *destinationlength,
 217                                     uint32_t    value,
 218                                     uint32_t    increments,
 219                                     UErrorCode *status)
 220 {
 221     uint32_t newlength = *destinationlength;
 222     if (offset + 1 == newlength) {
 223         newlength += increments;
 224         int32_t *temp = (int32_t *)allocateMemory(
 225                                          sizeof(int32_t) * newlength, status);
 226         if (U_FAILURE(*status)) {
 227             return NULL;
 228         }
 229         uprv_memcpy(temp, destination, sizeof(int32_t) * offset);
 230         *destinationlength = newlength;
 231         destination        = temp;
 232     }
 233     destination[offset] = value;
 234     return destination;
 235 }
 236
 237 /**
 238 * Initializing the ce table for a pattern.
 239 * Stores non-ignorable collation keys.
 240 * Table size will be estimated by the size of the pattern text. Table
 241 * expansion will be perform as we go along. Adding 1 to ensure that the table
 242 * size definitely increases.
 243 * Internal method, status assumed to be a success.
 244 * @param strsrch string search data
 245 * @param status output error if any, caller to check status before calling
 246 *               method, status assumed to be success when passed in.
 247 * @return total number of expansions
 248 */
 249 static
 250 inline uint16_t initializePatternCETable(UStringSearch *strsrch,
 251                                          UErrorCode    *status)
 252 {
 253     UPattern *pattern            = &(strsrch->pattern);
 254     uint32_t  cetablesize        = INITIAL_ARRAY_SIZE_;
 255     int32_t  *cetable            = pattern->CEBuffer;
 256     uint32_t  patternlength      = pattern->textLength;
 257     UCollationElements *coleiter = strsrch->utilIter;
 258
 259     if (coleiter == NULL) {
 260         coleiter = ucol_openElements(strsrch->collator, pattern->text,
 261                                      patternlength, status);
 262         // status will be checked in ucol_next(..) later and if it is an
 263         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
 264         // returned.
 265         strsrch->utilIter = coleiter;
 266     }
 267     else {
 268         uprv_init_collIterate(strsrch->collator, pattern->text,
 269                          pattern->textLength,
 270                          &coleiter->iteratordata_);
 271     }
 272
 273     if (pattern->CE != cetable && pattern->CE) {
 274         uprv_free(pattern->CE);
 275     }
 276
 277     uint16_t  offset      = 0;
 278     uint16_t  result      = 0;
 279     int32_t   ce;
 280
 281     while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
 282            U_SUCCESS(*status)) {
 283         uint32_t newce = getCE(strsrch, ce);
 284         if (newce) {
 285             int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
 286                                   newce,
 287                                   patternlength - ucol_getOffset(coleiter) + 1,
 288                                   status);
 289             if (U_FAILURE(*status)) {
 290                 return 0;
 291             }
 292             offset ++;
 293             if (cetable != temp && cetable != pattern->CEBuffer) {
 294                 uprv_free(cetable);
 295             }
 296             cetable = temp;
 297         }
 298         result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
 299     }
 300
 301     cetable[offset]   = 0;
 302     pattern->CE       = cetable;
 303     pattern->CELength = offset;
 304
 305     return result;
 306 }
 307
 308 /**
 309 * Initializes the pattern struct.
 310 * Internal method, status assumed to be success.
 311 * @param strsrch UStringSearch data storage
 312 * @param status output error if any, caller to check status before calling
 313 *               method, status assumed to be success when passed in.
 314 * @return expansionsize the total expansion size of the pattern
 315 */
 316 static
 317 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
 318 {
 319           UPattern   *pattern     = &(strsrch->pattern);
 320     const UChar      *patterntext = pattern->text;
 321           int32_t     length      = pattern->textLength;
 322           int32_t index       = 0;
 323
 324     pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
 325                                                      SECOND_LAST_BYTE_SHIFT_;
 326     index = length;
 327     UTF_BACK_1(patterntext, 0, index);
 328     pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
 329                                                              LAST_BYTE_MASK_;
 330     // since intializePattern is an internal method status is a success.
 331     return initializePatternCETable(strsrch, status);
 332 }
 333
 334 /**
 335 * Initializing shift tables, with the default values.
 336 * If a corresponding default value is 0, the shift table is not set.
 337 * @param shift table for forwards shift
 338 * @param backshift table for backwards shift
 339 * @param cetable table containing pattern ce
 340 * @param cesize size of the pattern ces
 341 * @param expansionsize total size of the expansions
 342 * @param defaultforward the default forward value
 343 * @param defaultbackward the default backward value
 344 */
 345 static
 346 inline void setShiftTable(int16_t   shift[], int16_t backshift[],
 347                           int32_t  *cetable, int32_t cesize,
 348                           int16_t   expansionsize,
 349                           int16_t   defaultforward,
 350                           int16_t   defaultbackward)
 351 {
 352     // estimate the value to shift. to do that we estimate the smallest
 353     // number of characters to give the relevant ces, ie approximately
 354     // the number of ces minus their expansion, since expansions can come
 355     // from a character.
 356     int32_t count;
 357     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
 358         shift[count] = defaultforward;
 359     }
 360     cesize --; // down to the last index
 361     for (count = 0; count < cesize; count ++) {
 362         // number of ces from right of array to the count
 363         int temp = defaultforward - count - 1;
 364         shift[hash(cetable[count])] = temp > 1 ? temp : 1;
 365     }
 366     shift[hash(cetable[cesize])] = 1;
 367     // for ignorables we just shift by one. see test examples.
 368     shift[hash(0)] = 1;
 369
 370     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
 371         backshift[count] = defaultbackward;
 372     }
 373     for (count = cesize; count > 0; count --) {
 374         // the original value count does not seem to work
 375         backshift[hash(cetable[count])] = count > expansionsize ?
 376                                           (int16_t)(count - expansionsize) : 1;
 377     }
 378     backshift[hash(cetable[0])] = 1;
 379     backshift[hash(0)] = 1;
 380 }
 381
 382 /**
 383 * Building of the pattern collation element list and the boyer moore strsrch
 384 * table.
 385 * The canonical match will only be performed after the default match fails.
 386 * For both cases we need to remember the size of the composed and decomposed
 387 * versions of the string. Since the Boyer-Moore shift calculations shifts by
 388 * a number of characters in the text and tries to match the pattern from that
 389 * offset, the shift value can not be too large in case we miss some
 390 * characters. To choose a right shift size, we estimate the NFC form of the
 391 * and use its size as a shift guide. The NFC form should be the small
 392 * possible representation of the pattern. Anyways, we'll err on the smaller
 393 * shift size. Hence the calculation for minlength.
 394 * Canonical match will be performed slightly differently. We'll split the
 395 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
 396 * the first and last base character (MS), the ending accents (EA). Matches
 397 * will be done on MS first, and only when we match MS then some processing
 398 * will be required for the prefix and end accents in order to determine if
 399 * they match PA and EA. Hence the default shift values
 400 * for the canonical match will take the size of either end's accent into
 401 * consideration. Forwards search will take the end accents into consideration
 402 * for the default shift values and the backwards search will take the prefix
 403 * accents into consideration.
 404 * If pattern has no non-ignorable ce, we return a illegal argument error.
 405 * Internal method, status assumed to be success.
 406 * @param strsrch UStringSearch data storage
 407 * @param status  for output errors if it occurs, status is assumed to be a
 408 *                success when it is passed in.
 409 */
 410 static
 411 inline void initialize(UStringSearch *strsrch, UErrorCode *status)
 412 {
 413     int16_t expandlength  = initializePattern(strsrch, status);
 414     if (U_SUCCESS(*status) && strsrch->pattern.CELength > 0) {
 415         UPattern *pattern = &strsrch->pattern;
 416         int32_t   cesize  = pattern->CELength;
 417
 418         int16_t minlength = cesize > expandlength
 419                             ? (int16_t)cesize - expandlength : 1;
 420         pattern->defaultShiftSize    = minlength;
 421         setShiftTable(pattern->shift, pattern->backShift, pattern->CE,
 422                       cesize, expandlength, minlength, minlength);
 423         return;
 424     }
 425     strsrch->pattern.defaultShiftSize = 0;
 426 }
 427
 428 /**
 429 * Determine whether the target text in UStringSearch bounded by the offset
 430 * start and end is one or more whole units of text as
 431 * determined by the breakiterator in UStringSearch.
 432 * @param strsrch string search data
 433 * @param start target text start offset
 434 * @param end target text end offset
 435 */
 436 static
 437 UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
 438                                int32_t    end)
 439 {
 440 #if !UCONFIG_NO_BREAK_ITERATION
 441     UBreakIterator *breakiterator = strsrch->search->breakIter;
 442     if (breakiterator) {
 443         int32_t startindex = ubrk_first(breakiterator);
 444         int32_t endindex   = ubrk_last(breakiterator);
 445
 446         // out-of-range indexes are never boundary positions
 447         if (start < startindex || start > endindex ||
 448             end < startindex || end > endindex) {
 449             return FALSE;
 450         }
 451         // otherwise, we can use following() on the position before the
 452         // specified one and return true of the position we get back is the
 453         // one the user specified
 454         UBool result = (start == startindex ||
 455                 ubrk_following(breakiterator, start - 1) == start) &&
 456                (end == endindex ||
 457                 ubrk_following(breakiterator, end - 1) == end);
 458         if (result) {
 459             // iterates the individual ces
 460                   UCollationElements *coleiter  = strsrch->utilIter;
 461             const UChar              *text      = strsrch->search->text +
 462                                                                       start;
 463                   UErrorCode          status    = U_ZERO_ERROR;
 464             ucol_setText(coleiter, text, end - start, &status);
 465             for (int32_t count = 0; count < strsrch->pattern.CELength;
 466                  count ++) {
 467                 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
 468                 if (ce == UCOL_IGNORABLE) {
 469                     count --;
 470                     continue;
 471                 }
 472                 if (U_FAILURE(status) || ce != strsrch->pattern.CE[count]) {
 473                     return FALSE;
 474                 }
 475             }
 476             int32_t nextce = ucol_next(coleiter, &status);
 477             while (ucol_getOffset(coleiter) == (end - start)
 478                    && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
 479                 nextce = ucol_next(coleiter, &status);
 480             }
 481             if (ucol_getOffset(coleiter) == (end - start)
 482                 && nextce != UCOL_NULLORDER) {
 483                 // extra collation elements at the end of the match
 484                 return FALSE;
 485             }
 486         }
 487         return result;
 488     }
 489 #endif
 490     return TRUE;
 491 }
 492
 493 /**
 494 * Getting the next base character offset if current offset is an accent,
 495 * or the current offset if the current character contains a base character.
 496 * accents the following base character will be returned
 497 * @param text string
 498 * @param textoffset current offset
 499 * @param textlength length of text string
 500 * @return the next base character or the current offset
 501 *         if the current character is contains a base character.
 502 */
 503 static
 504 inline int32_t getNextBaseOffset(const UChar       *text,
 505                                            int32_t  textoffset,
 506                                            int32_t      textlength)
 507 {
 508     if (textoffset < textlength) {
 509         int32_t temp = textoffset;
 510         if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
 511             while (temp < textlength) {
 512                 int32_t result = temp;
 513                 if ((getFCD(text, &temp, textlength) >>
 514                      SECOND_LAST_BYTE_SHIFT_) == 0) {
 515                     return result;
 516                 }
 517             }
 518             return textlength;
 519         }
 520     }
 521     return textoffset;
 522 }
 523
 524 /**
 525 * Gets the next base character offset depending on the string search pattern
 526 * data
 527 * @param strsrch string search data
 528 * @param textoffset current offset, one offset away from the last character
 529 *                   to search for.
 530 * @return start index of the next base character or the current offset
 531 *         if the current character is contains a base character.
 532 */
 533 static
 534 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
 535                                                   int32_t    textoffset)
 536 {
 537     int32_t textlength = strsrch->search->textLength;
 538     if (strsrch->pattern.hasSuffixAccents &&
 539         textoffset < textlength) {
 540               int32_t  temp       = textoffset;
 541         const UChar       *text       = strsrch->search->text;
 542         UTF_BACK_1(text, 0, temp);
 543         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
 544             return getNextBaseOffset(text, textoffset, textlength);
 545         }
 546     }
 547     return textoffset;
 548 }
 549
 550 /**
 551 * Shifting the collation element iterator position forward to prepare for
 552 * a following match. If the last character is a unsafe character, we'll only
 553 * shift by 1 to capture contractions, normalization etc.
 554 * Internal method, status assumed to be success.
 555 * @param text strsrch string search data
 556 * @param textoffset start text position to do search
 557 * @param ce the text ce which failed the match.
 558 * @param patternceindex index of the ce within the pattern ce buffer which
 559 *        failed the match
 560 * @return final offset
 561 */
 562 static
 563 inline int32_t shiftForward(UStringSearch *strsrch,
 564                                 int32_t    textoffset,
 565                                 int32_t       ce,
 566                                 int32_t        patternceindex)
 567 {
 568     UPattern *pattern = &(strsrch->pattern);
 569     if (ce != UCOL_NULLORDER) {
 570         int32_t shift = pattern->shift[hash(ce)];
 571         // this is to adjust for characters in the middle of the
 572         // substring for matching that failed.
 573         int32_t adjust = pattern->CELength - patternceindex;
 574         if (adjust > 1 && shift >= adjust) {
 575             shift -= adjust - 1;
 576         }
 577         textoffset += shift;
 578     }
 579     else {
 580         textoffset += pattern->defaultShiftSize;
 581     }
 582
 583     textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
 584     // check for unsafe characters
 585     // * if it is the start or middle of a contraction: to be done after
 586     //   a initial match is found
 587     // * thai or lao base consonant character: similar to contraction
 588     // * high surrogate character: similar to contraction
 589     // * next character is a accent: shift to the next base character
 590     return textoffset;
 591 }
 592
 593 /**
 594 * sets match not found
 595 * @param strsrch string search data
 596 */
 597 static
 598 inline void setMatchNotFound(UStringSearch *strsrch)
 599 {
 600     // this method resets the match result regardless of the error status.
 601     strsrch->search->matchedIndex = USEARCH_DONE;
 602     strsrch->search->matchedLength = 0;
 603     if (strsrch->search->isForwardSearching) {
 604         setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
 605     }
 606     else {
 607         setColEIterOffset(strsrch->textIter, 0);
 608     }
 609 }
 610
 611 /**
 612 * Gets the offset to the next safe point in text.
 613 * ie. not the middle of a contraction, swappable characters or supplementary
 614 * characters.
 615 * @param collator collation sata
 616 * @param text string to work with
 617 * @param textoffset offset in string
 618 * @param textlength length of text string
 619 * @return offset to the next safe character
 620 */
 621 static
 622 inline int32_t getNextSafeOffset(const UCollator   *collator,
 623                                      const UChar       *text,
 624                                            int32_t  textoffset,
 625                                            int32_t      textlength)
 626 {
 627     int32_t result = textoffset; // first contraction character
 628     while (result != textlength && ucol_unsafeCP(text[result], collator)) {
 629         result ++;
 630     }
 631     return result;
 632 }
 633
 634 /**
 635 * This checks for accents in the potential match started with a .
 636 * composite character.
 637 * This is really painful... we have to check that composite character do not
 638 * have any extra accents. We have to normalize the potential match and find
 639 * the immediate decomposed character before the match.
 640 * The first composite character would have been taken care of by the fcd
 641 * checks in checkForwardExactMatch.
 642 * This is the slow path after the fcd of the first character and
 643 * the last character has been checked by checkForwardExactMatch and we
 644 * determine that the potential match has extra non-ignorable preceding
 645 * ces.
 646 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
 647 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
 648 * Note here that accents checking are slow and cautioned in the API docs.
 649 * Internal method, status assumed to be a success, caller should check status
 650 * before calling this method
 651 * @param strsrch string search data
 652 * @param start index of the potential unfriendly composite character
 653 * @param end index of the potential unfriendly composite character
 654 * @param status output error status if any.
 655 * @return TRUE if there is non-ignorable accents before at the beginning
 656 *              of the match, FALSE otherwise.
 657 */
 658
 659 static
 660 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
 661                                    int32_t    end,
 662                                    UErrorCode    *status)
 663 {
 664     UBool result = FALSE;
 665     if (strsrch->pattern.hasPrefixAccents) {
 666               int32_t  length = end - start;
 667               int32_t  offset = 0;
 668         const UChar       *text   = strsrch->search->text + start;
 669
 670         UTF_FWD_1(text, offset, length);
 671         // we are only concerned with the first composite character
 672         if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
 673             int32_t safeoffset = getNextSafeOffset(strsrch->collator,
 674                                                        text, 0, length);
 675             if (safeoffset != length) {
 676                 safeoffset ++;
 677             }
 678             UChar   *norm = NULL;
 679             UChar    buffer[INITIAL_ARRAY_SIZE_];
 680             int32_t  size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
 681                                             buffer, INITIAL_ARRAY_SIZE_,
 682                                             status);
 683             if (U_FAILURE(*status)) {
 684                 return FALSE;
 685             }
 686             if (size >= INITIAL_ARRAY_SIZE_) {
 687                 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
 688                                                status);
 689                 // if allocation failed, status will be set to
 690                 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
 691                 // checks for it.
 692                 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
 693                                        size, status);
 694                 if (U_FAILURE(*status) && norm != NULL) {
 695                     uprv_free(norm);
 696                     return FALSE;
 697                 }
 698             }
 699             else {
 700                 norm = buffer;
 701             }
 702
 703             UCollationElements *coleiter  = strsrch->utilIter;
 704             ucol_setText(coleiter, norm, size, status);
 705             uint32_t            firstce   = strsrch->pattern.CE[0];
 706             UBool               ignorable = TRUE;
 707             uint32_t            ce        = UCOL_IGNORABLE;
 708             while (U_SUCCESS(*status) && ce != firstce) {
 709                 offset = ucol_getOffset(coleiter);
 710                 if (ce != firstce && ce != UCOL_IGNORABLE) {
 711                     ignorable = FALSE;
 712                 }
 713                 ce = ucol_next(coleiter, status);
 714             }
 715             UChar32 codepoint;
 716             UTF_PREV_CHAR(norm, 0, offset, codepoint);
 717             result = !ignorable && (u_getCombiningClass(codepoint) != 0);
 718
 719             if (norm != buffer) {
 720                 uprv_free(norm);
 721             }
 722         }
 723     }
 724
 725     return result;
 726 }
 727
 728 /**
 729 * Used by exact matches, checks if there are accents before the match.
 730 * This is really painful... we have to check that composite characters at
 731 * the start of the matches have to not have any extra accents.
 732 * We check the FCD of the character first, if it starts with an accent and
 733 * the first pattern ce does not match the first ce of the character, we bail.
 734 * Otherwise we try normalizing the first composite
 735 * character and find the immediate decomposed character before the match to
 736 * see if it is an non-ignorable accent.
 737 * Now normalizing the first composite character is enough because we ensure
 738 * that when the match is passed in here with extra beginning ces, the
 739 * first or last ce that match has to occur within the first character.
 740 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
 741 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
 742 * Note here that accents checking are slow and cautioned in the API docs.
 743 * @param strsrch string search data
 744 * @param start offset
 745 * @param end offset
 746 * @return TRUE if there are accents on either side of the match,
 747 *         FALSE otherwise
 748 */
 749 static
 750 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
 751                                   int32_t    end)
 752 {
 753     if (strsrch->pattern.hasPrefixAccents) {
 754         UCollationElements *coleiter  = strsrch->textIter;
 755         UErrorCode          status    = U_ZERO_ERROR;
 756         // we have been iterating forwards previously
 757         uint32_t            ignorable = TRUE;
 758         int32_t             firstce   = strsrch->pattern.CE[0];
 759
 760         setColEIterOffset(coleiter, start);
 761         int32_t ce  = getCE(strsrch, ucol_next(coleiter, &status));
 762         if (U_FAILURE(status)) {
 763             return TRUE;
 764         }
 765         while (ce != firstce) {
 766             if (ce != UCOL_IGNORABLE) {
 767                 ignorable = FALSE;
 768             }
 769             ce = getCE(strsrch, ucol_next(coleiter, &status));
 770             if (U_FAILURE(status)) {
 771                 return TRUE;
 772             }
 773         }
 774         if (!ignorable && inNormBuf(coleiter)) {
 775             // within normalization buffer, discontiguous handled here
 776             return TRUE;
 777         }
 778
 779         // within text
 780         int32_t temp = start;
 781         // original code
 782         // accent = (getFCD(strsrch->search->text, &temp,
 783         //                  strsrch->search->textLength)
 784         //            >> SECOND_LAST_BYTE_SHIFT_);
 785         // however this code does not work well with VC7 .net in release mode.
 786         // maybe the inlines for getFCD combined with shifting has bugs in
 787         // VC7. anyways this is a work around.
 788         UBool accent = getFCD(strsrch->search->text, &temp,
 789                               strsrch->search->textLength) > 0xFF;
 790         if (!accent) {
 791             return checkExtraMatchAccents(strsrch, start, end, &status);
 792         }
 793         if (!ignorable) {
 794             return TRUE;
 795         }
 796         if (start > 0) {
 797             temp = start;
 798             UTF_BACK_1(strsrch->search->text, 0, temp);
 799             if (getFCD(strsrch->search->text, &temp,
 800                        strsrch->search->textLength) & LAST_BYTE_MASK_) {
 801                 setColEIterOffset(coleiter, start);
 802                 ce = ucol_previous(coleiter, &status);
 803                 if (U_FAILURE(status) ||
 804                     (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
 805                     return TRUE;
 806                 }
 807             }
 808         }
 809     }
 810
 811     return FALSE;
 812 }
 813
 814 /**
 815 * Used by exact matches, checks if there are accents bounding the match.
 816 * Note this is the initial boundary check. If the potential match
 817 * starts or ends with composite characters, the accents in those
 818 * characters will be determined later.
 819 * Not doing backwards iteration here, since discontiguos contraction for
 820 * backwards collation element iterator, use up too many characters.
 821 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
 822 * should fail since there is a acute at the end of \u01FA
 823 * Note here that accents checking are slow and cautioned in the API docs.
 824 * @param strsrch string search data
 825 * @param start offset of match
 826 * @param end end offset of the match
 827 * @return TRUE if there are accents on either side of the match,
 828 *         FALSE otherwise
 829 */
 830 static
 831 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
 832                                  int32_t    end)
 833 {
 834     if (strsrch->pattern.hasSuffixAccents) {
 835         const UChar       *text       = strsrch->search->text;
 836               int32_t  temp       = end;
 837               int32_t      textlength = strsrch->search->textLength;
 838         UTF_BACK_1(text, 0, temp);
 839         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
 840             int32_t             firstce  = strsrch->pattern.CE[0];
 841             UCollationElements *coleiter = strsrch->textIter;
 842             UErrorCode          status   = U_ZERO_ERROR;
 843             setColEIterOffset(coleiter, start);
 844             while (getCE(strsrch, ucol_next(coleiter, &status)) != firstce) {
 845                 if (U_FAILURE(status)) {
 846                     return TRUE;
 847                 }
 848             }
 849             int32_t count = 1;
 850             while (count < strsrch->pattern.CELength) {
 851                 if (getCE(strsrch, ucol_next(coleiter, &status))
 852                     == UCOL_IGNORABLE) {
 853                     // Thai can give an ignorable here.
 854                     count --;
 855                 }
 856                 if (U_FAILURE(status)) {
 857                     return TRUE;
 858                 }
 859                 count ++;
 860             }
 861             int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
 862             if (U_FAILURE(status)) {
 863                 return TRUE;
 864             }
 865             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
 866                 if (ucol_getOffset(coleiter) <= end) {
 867                     return TRUE;
 868                 }
 869                 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
 870                     return TRUE;
 871                 }
 872             }
 873         }
 874     }
 875     return FALSE;
 876 }
 877
 878 /**
 879 * Checks if the offset runs out of the text string
 880 * @param offset
 881 * @param textlength of the text string
 882 * @return TRUE if offset is out of bounds, FALSE otherwise
 883 */
 884 static
 885 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
 886 {
 887     return offset < 0 || offset > textlength;
 888 }
 889
 890 /**
 891 * Checks for identical match
 892 * @param strsrch string search data
 893 * @param start offset of possible match
 894 * @param end offset of possible match
 895 * @return TRUE if identical match is found
 896 */
 897 static
 898 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
 899                                   int32_t    end)
 900 {
 901     UChar t2[32], p2[32];
 902     int32_t length = end - start;
 903     if (strsrch->strength != UCOL_IDENTICAL) {
 904         return TRUE;
 905     }
 906
 907     UErrorCode status = U_ZERO_ERROR, status2 = U_ZERO_ERROR;
 908     int32_t decomplength = unorm_decompose(t2, LENGTHOF(t2),
 909                                        strsrch->search->text + start, length,
 910                                        FALSE, 0, &status);
 911     // use separate status2 in case of buffer overflow
 912     if (decomplength != unorm_decompose(p2, LENGTHOF(p2),
 913                                         strsrch->pattern.text,
 914                                         strsrch->pattern.textLength,
 915                                         FALSE, 0, &status2)) {
 916         return FALSE; // lengths are different
 917     }
 918
 919     // compare contents
 920     UChar *text, *pattern;
 921     if(U_SUCCESS(status)) {
 922         text = t2;
 923         pattern = p2;
 924     } else if(status==U_BUFFER_OVERFLOW_ERROR) {
 925         status = U_ZERO_ERROR;
 926         // allocate one buffer for both decompositions
 927         text = (UChar *)uprv_malloc(decomplength * 2 * U_SIZEOF_UCHAR);
 928         pattern = text + decomplength;
 929         unorm_decompose(text, decomplength, strsrch->search->text + start,
 930                         length, FALSE, 0, &status);
 931         unorm_decompose(pattern, decomplength, strsrch->pattern.text,
 932                         strsrch->pattern.textLength, FALSE, 0, &status);
 933     } else {
 934         // NFD failed, make sure that u_memcmp() does not overrun t2 & p2
 935         // and that we don't uprv_free() an undefined text pointer
 936         text = pattern = t2;
 937         decomplength = 0;
 938     }
 939     UBool result = (UBool)(u_memcmp(pattern, text, decomplength) == 0);
 940     if(text != t2) {
 941         uprv_free(text);
 942     }
 943     // return FALSE if NFD failed
 944     return U_SUCCESS(status) && result;
 945 }
 946
 947 /**
 948 * Checks to see if the match is repeated
 949 * @param strsrch string search data
 950 * @param start new match start index
 951 * @param end new match end index
 952 * @return TRUE if the the match is repeated, FALSE otherwise
 953 */
 954 static
 955 inline UBool checkRepeatedMatch(UStringSearch *strsrch,
 956                                 int32_t    start,
 957                                 int32_t    end)
 958 {
 959     int32_t lastmatchindex = strsrch->search->matchedIndex;
 960     UBool       result;
 961     if (lastmatchindex == USEARCH_DONE) {
 962         return FALSE;
 963     }
 964     if (strsrch->search->isForwardSearching) {
 965         result = start <= lastmatchindex;
 966     }
 967     else {
 968         result = start >= lastmatchindex;
 969     }
 970     if (!result && !strsrch->search->isOverlap) {
 971         if (strsrch->search->isForwardSearching) {
 972             result = start < lastmatchindex + strsrch->search->matchedLength;
 973         }
 974         else {
 975             result = end > lastmatchindex;
 976         }
 977     }
 978     return result;
 979 }
 980
 981 /**
 982 * Gets the collation element iterator's current offset.
 983 * @param coleiter collation element iterator
 984 * @param forwards flag TRUE if we are moving in th forwards direction
 985 * @return current offset
 986 */
 987 static
 988 inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
 989                                               UBool               forwards)
 990 {
 991     int32_t result = ucol_getOffset(coleiter);
 992     // intricacies of the the backwards collation element iterator
 993     if (!forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
 994         result ++;
 995     }
 996     return result;
 997 }
 998
 999 /**
1000 * Checks match for contraction.
1001 * If the match ends with a partial contraction we fail.
1002 * If the match starts too far off (because of backwards iteration) we try to
1003 * chip off the extra characters depending on whether a breakiterator has
1004 * been used.
1005 * Internal method, error assumed to be success, caller has to check status
1006 * before calling this method.
1007 * @param strsrch string search data
1008 * @param start offset of potential match, to be modified if necessary
1009 * @param end offset of potential match, to be modified if necessary
1010 * @param status output error status if any
1011 * @return TRUE if match passes the contraction test, FALSE otherwise
1012 */
1013
1014 static
1015 UBool checkNextExactContractionMatch(UStringSearch *strsrch,
1016                                      int32_t   *start,
1017                                      int32_t   *end, UErrorCode  *status)
1018 {
1019           UCollationElements *coleiter   = strsrch->textIter;
1020           int32_t             textlength = strsrch->search->textLength;
1021           int32_t         temp       = *start;
1022     const UCollator          *collator   = strsrch->collator;
1023     const UChar              *text       = strsrch->search->text;
1024     // This part checks if either ends of the match contains potential
1025     // contraction. If so we'll have to iterate through them
1026     // The start contraction needs to be checked since ucol_previous dumps
1027     // all characters till the first safe character into the buffer.
1028     // *start + 1 is used to test for the unsafe characters instead of *start
1029     // because ucol_prev takes all unsafe characters till the first safe
1030     // character ie *start. so by testing *start + 1, we can estimate if
1031     // excess prefix characters has been included in the potential search
1032     // results.
1033     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1034         (*start + 1 < textlength
1035          && ucol_unsafeCP(text[*start + 1], collator))) {
1036         int32_t expansion  = getExpansionPrefix(coleiter);
1037         UBool   expandflag = expansion > 0;
1038         setColEIterOffset(coleiter, *start);
1039         while (expansion > 0) {
1040             // getting rid of the redundant ce, caused by setOffset.
1041             // since backward contraction/expansion may have extra ces if we
1042             // are in the normalization buffer, hasAccentsBeforeMatch would
1043             // have taken care of it.
1044             // E.g. the character \u01FA will have an expansion of 3, but if
1045             // we are only looking for acute and ring \u030A and \u0301, we'll
1046             // have to skip the first ce in the expansion buffer.
1047             ucol_next(coleiter, status);
1048             if (U_FAILURE(*status)) {
1049                 return FALSE;
1050             }
1051             if (ucol_getOffset(coleiter) != temp) {
1052                 *start = temp;
1053                 temp  = ucol_getOffset(coleiter);
1054             }
1055             expansion --;
1056         }
1057
1058         int32_t  *patternce       = strsrch->pattern.CE;
1059         int32_t   patterncelength = strsrch->pattern.CELength;
1060         int32_t   count           = 0;
1061         while (count < patterncelength) {
1062             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1063             if (ce == UCOL_IGNORABLE) {
1064                 continue;
1065             }
1066             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1067                 *start = temp;
1068                 temp   = ucol_getOffset(coleiter);
1069             }
1070             if (U_FAILURE(*status) || ce != patternce[count]) {
1071                 (*end) ++;
1072                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1073                 return FALSE;
1074             }
1075             count ++;
1076         }
1077     }
1078     return TRUE;
1079 }
1080
1081 /**
1082 * Checks and sets the match information if found.
1083 * Checks
1084 * <ul>
1085 * <li> the potential match does not repeat the previous match
1086 * <li> boundaries are correct
1087 * <li> exact matches has no extra accents
1088 * <li> identical matchesb
1089 * <li> potential match does not end in the middle of a contraction
1090 * <\ul>
1091 * Otherwise the offset will be shifted to the next character.
1092 * Internal method, status assumed to be success, caller has to check status
1093 * before calling this method.
1094 * @param strsrch string search data
1095 * @param textoffset offset in the collation element text. the returned value
1096 *        will be the truncated end offset of the match or the new start
1097 *        search offset.
1098 * @param status output error status if any
1099 * @return TRUE if the match is valid, FALSE otherwise
1100 */
1101 static
1102 inline UBool checkNextExactMatch(UStringSearch *strsrch,
1103                                  int32_t   *textoffset, UErrorCode *status)
1104 {
1105     UCollationElements *coleiter = strsrch->textIter;
1106     int32_t         start    = getColElemIterOffset(coleiter, FALSE);
1107
1108     if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
1109         return FALSE;
1110     }
1111
1112     // this totally matches, however we need to check if it is repeating
1113     if (!isBreakUnit(strsrch, start, *textoffset) ||
1114         checkRepeatedMatch(strsrch, start, *textoffset) ||
1115         hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
1116         !checkIdentical(strsrch, start, *textoffset) ||
1117         hasAccentsAfterMatch(strsrch, start, *textoffset)) {
1118
1119         (*textoffset) ++;
1120         *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
1121         return FALSE;
1122     }
1123
1124     // totally match, we will get rid of the ending ignorables.
1125     strsrch->search->matchedIndex  = start;
1126     strsrch->search->matchedLength = *textoffset - start;
1127     return TRUE;
1128 }
1129
1130 /**
1131 * Getting the previous base character offset, or the current offset if the
1132 * current character is a base character
1133 * @param text string
1134 * @param textoffset one offset after the current character
1135 * @return the offset of the next character after the base character or the first
1136 *         composed character with accents
1137 */
1138 static
1139 inline int32_t getPreviousBaseOffset(const UChar       *text,
1140                                                int32_t  textoffset)
1141 {
1142     if (textoffset > 0) {
1143         while (TRUE) {
1144             int32_t result = textoffset;
1145             UTF_BACK_1(text, 0, textoffset);
1146             int32_t temp = textoffset;
1147             uint16_t fcd = getFCD(text, &temp, result);
1148             if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1149                 if (fcd & LAST_BYTE_MASK_) {
1150                     return textoffset;
1151                 }
1152                 return result;
1153             }
1154             if (textoffset == 0) {
1155                 return 0;
1156             }
1157         }
1158     }
1159     return textoffset;
1160 }
1161
1162 /**
1163 * Getting the indexes of the accents that are not blocked in the argument
1164 * accent array
1165 * @param accents array of accents in nfd terminated by a 0.
1166 * @param accentsindex array of indexes of the accents that are not blocked
1167 */
1168 static
1169 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
1170 {
1171     int32_t index     = 0;
1172     int32_t     length    = u_strlen(accents);
1173     UChar32     codepoint = 0;
1174     int         cclass    = 0;
1175     int         result    = 0;
1176     int32_t temp;
1177     while (index < length) {
1178         temp = index;
1179         UTF_NEXT_CHAR(accents, index, length, codepoint);
1180         if (u_getCombiningClass(codepoint) != cclass) {
1181             cclass        = u_getCombiningClass(codepoint);
1182             accentsindex[result] = temp;
1183             result ++;
1184         }
1185     }
1186     accentsindex[result] = length;
1187     return result;
1188 }
1189
1190 /**
1191 * Appends 3 UChar arrays to a destination array.
1192 * Creates a new array if we run out of space. The caller will have to
1193 * manually deallocate the newly allocated array.
1194 * Internal method, status assumed to be success, caller has to check status
1195 * before calling this method. destination not to be NULL and has at least
1196 * size destinationlength.
1197 * @param destination target array
1198 * @param destinationlength target array size, returning the appended length
1199 * @param source1 null-terminated first array
1200 * @param source2 second array
1201 * @param source2length length of seond array
1202 * @param source3 null-terminated third array
1203 * @param status error status if any
1204 * @return new destination array, destination if there was no new allocation
1205 */
1206 static
1207 inline UChar * addToUCharArray(      UChar      *destination,
1208                                      int32_t    *destinationlength,
1209                                const UChar      *source1,
1210                                const UChar      *source2,
1211                                      int32_t     source2length,
1212                                const UChar      *source3,
1213                                      UErrorCode *status)
1214 {
1215     int32_t source1length = source1 ? u_strlen(source1) : 0;
1216     int32_t source3length = source3 ? u_strlen(source3) : 0;
1217     if (*destinationlength < source1length + source2length + source3length +
1218                                                                            1)
1219     {
1220         destination = (UChar *)allocateMemory(
1221           (source1length + source2length + source3length + 1) * sizeof(UChar),
1222           status);
1223         // if error allocating memory, status will be
1224         // U_MEMORY_ALLOCATION_ERROR
1225         if (U_FAILURE(*status)) {
1226             *destinationlength = 0;
1227             return NULL;
1228         }
1229     }
1230     if (source1length != 0) {
1231         uprv_memcpy(destination, source1, sizeof(UChar) * source1length);
1232     }
1233     if (source2length != 0) {
1234         uprv_memcpy(destination + source1length, source2,
1235                     sizeof(UChar) * source2length);
1236     }
1237     if (source3length != 0) {
1238         uprv_memcpy(destination + source1length + source2length, source3,
1239                     sizeof(UChar) * source3length);
1240     }
1241     *destinationlength = source1length + source2length + source3length;
1242     return destination;
1243 }
1244
1245 /**
1246 * Running through a collation element iterator to see if the contents matches
1247 * pattern in string search data
1248 * @param strsrch string search data
1249 * @param coleiter collation element iterator
1250 * @return TRUE if a match if found, FALSE otherwise
1251 */
1252 static
1253 inline UBool checkCollationMatch(const UStringSearch      *strsrch,
1254                                        UCollationElements *coleiter)
1255 {
1256     int         patternceindex = strsrch->pattern.CELength;
1257     int32_t    *patternce      = strsrch->pattern.CE;
1258     UErrorCode  status = U_ZERO_ERROR;
1259     while (patternceindex > 0) {
1260         int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
1261         if (ce == UCOL_IGNORABLE) {
1262             continue;
1263         }
1264         if (U_FAILURE(status) || ce != *patternce) {
1265             return FALSE;
1266         }
1267         patternce ++;
1268         patternceindex --;
1269     }
1270     return TRUE;
1271 }
1272
1273 /**
1274 * Rearranges the front accents to try matching.
1275 * Prefix accents in the text will be grouped according to their combining
1276 * class and the groups will be mixed and matched to try find the perfect
1277 * match with the pattern.
1278 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1279 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1280 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1281 *         "\u0301\u0325".
1282 * step 2: check if any of the generated substrings matches the pattern.
1283 * Internal method, status is assumed to be success, caller has to check status
1284 * before calling this method.
1285 * @param strsrch string search match
1286 * @param start first offset of the accents to start searching
1287 * @param end start of the last accent set
1288 * @param status output error status if any
1289 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1290 *         offset of the match. Note this start includes all preceding accents.
1291 */
1292 static
1293 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
1294                                        int32_t    start,
1295                                        int32_t    end,
1296                                        UErrorCode    *status)
1297 {
1298     const UChar       *text       = strsrch->search->text;
1299           int32_t      textlength = strsrch->search->textLength;
1300           int32_t  tempstart  = start;
1301
1302     if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
1303         // die... failed at a base character
1304         return USEARCH_DONE;
1305     }
1306
1307     int32_t offset = getNextBaseOffset(text, tempstart, textlength);
1308     start = getPreviousBaseOffset(text, tempstart);
1309
1310     UChar       accents[INITIAL_ARRAY_SIZE_];
1311     // normalizing the offensive string
1312     unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
1313                     INITIAL_ARRAY_SIZE_, status);
1314     if (U_FAILURE(*status)) {
1315         return USEARCH_DONE;
1316     }
1317
1318     int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
1319     int32_t         accentsize = getUnblockedAccentIndex(accents,
1320                                                                  accentsindex);
1321     int32_t         count      = (2 << (accentsize - 1)) - 1;
1322     UChar               buffer[INITIAL_ARRAY_SIZE_];
1323     UCollationElements *coleiter   = strsrch->utilIter;
1324     while (U_SUCCESS(*status) && count > 0) {
1325         UChar *rearrange = strsrch->canonicalPrefixAccents;
1326         // copy the base characters
1327         for (int k = 0; k < accentsindex[0]; k ++) {
1328             *rearrange ++ = accents[k];
1329         }
1330         // forming all possible canonical rearrangement by dropping
1331         // sets of accents
1332         for (int i = 0; i <= accentsize - 1; i ++) {
1333             int32_t mask = 1 << (accentsize - i - 1);
1334             if (count & mask) {
1335                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1336                     *rearrange ++ = accents[j];
1337                 }
1338             }
1339         }
1340         *rearrange = 0;
1341         int32_t  matchsize = INITIAL_ARRAY_SIZE_;
1342         UChar   *match     = addToUCharArray(buffer, &matchsize,
1343                                            strsrch->canonicalPrefixAccents,
1344                                            strsrch->search->text + offset,
1345                                            end - offset,
1346                                            strsrch->canonicalSuffixAccents,
1347                                            status);
1348
1349         // if status is a failure, ucol_setText does nothing.
1350         // run the collator iterator through this match
1351         ucol_setText(coleiter, match, matchsize, status);
1352         if (U_SUCCESS(*status)) {
1353             if (checkCollationMatch(strsrch, coleiter)) {
1354                 if (match != buffer) {
1355                     uprv_free(match);
1356                 }
1357                 return start;
1358             }
1359         }
1360         count --;
1361     }
1362     return USEARCH_DONE;
1363 }
1364
1365 /**
1366 * Gets the offset to the safe point in text before textoffset.
1367 * ie. not the middle of a contraction, swappable characters or supplementary
1368 * characters.
1369 * @param collator collation sata
1370 * @param text string to work with
1371 * @param textoffset offset in string
1372 * @param textlength length of text string
1373 * @return offset to the previous safe character
1374 */
1375 static
1376 inline uint32_t getPreviousSafeOffset(const UCollator   *collator,
1377                                       const UChar       *text,
1378                                             int32_t  textoffset)
1379 {
1380     int32_t result = textoffset; // first contraction character
1381     while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
1382         result --;
1383     }
1384     if (result != 0) {
1385         // the first contraction character is consider unsafe here
1386         result --;
1387     }
1388     return result;
1389 }
1390
1391 /**
1392 * Cleaning up after we passed the safe zone
1393 * @param strsrch string search data
1394 * @param safetext safe text array
1395 * @param safebuffer safe text buffer
1396 * @param coleiter collation element iterator for safe text
1397 */
1398 static
1399 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
1400                                   UChar         *safebuffer)
1401 {
1402     if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
1403     {
1404        uprv_free(safetext);
1405     }
1406 }
1407
1408 /**
1409 * Take the rearranged end accents and tries matching. If match failed at
1410 * a seperate preceding set of accents (seperated from the rearranged on by
1411 * at least a base character) then we rearrange the preceding accents and
1412 * tries matching again.
1413 * We allow skipping of the ends of the accent set if the ces do not match.
1414 * However if the failure is found before the accent set, it fails.
1415 * Internal method, status assumed to be success, caller has to check status
1416 * before calling this method.
1417 * @param strsrch string search data
1418 * @param textoffset of the start of the rearranged accent
1419 * @param status output error status if any
1420 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1421 *         offset of the match. Note this start includes all preceding accents.
1422 */
1423 static
1424 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
1425                                        int32_t    textoffset,
1426                                        UErrorCode    *status)
1427 {
1428     const UChar              *text           = strsrch->search->text;
1429     const UCollator          *collator       = strsrch->collator;
1430           int32_t             safelength     = 0;
1431           UChar              *safetext;
1432           int32_t             safetextlength;
1433           UChar               safebuffer[INITIAL_ARRAY_SIZE_];
1434           UCollationElements *coleiter       = strsrch->utilIter;
1435           int32_t         safeoffset     = textoffset;
1436
1437     if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
1438                                          collator)) {
1439         safeoffset     = getPreviousSafeOffset(collator, text, textoffset);
1440         safelength     = textoffset - safeoffset;
1441         safetextlength = INITIAL_ARRAY_SIZE_;
1442         safetext       = addToUCharArray(safebuffer, &safetextlength, NULL,
1443                                          text + safeoffset, safelength,
1444                                          strsrch->canonicalSuffixAccents,
1445                                          status);
1446     }
1447     else {
1448         safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
1449         safetext       = strsrch->canonicalSuffixAccents;
1450     }
1451
1452     // if status is a failure, ucol_setText does nothing
1453     ucol_setText(coleiter, safetext, safetextlength, status);
1454     // status checked in loop below
1455
1456     int32_t  *ce        = strsrch->pattern.CE;
1457     int32_t   celength  = strsrch->pattern.CELength;
1458     int       ceindex   = celength - 1;
1459     UBool     isSafe    = TRUE; // indication flag for position in safe zone
1460
1461     while (ceindex >= 0) {
1462         int32_t textce = ucol_previous(coleiter, status);
1463         if (U_FAILURE(*status)) {
1464             if (isSafe) {
1465                 cleanUpSafeText(strsrch, safetext, safebuffer);
1466             }
1467             return USEARCH_DONE;
1468         }
1469         if (textce == UCOL_NULLORDER) {
1470             // check if we have passed the safe buffer
1471             if (coleiter == strsrch->textIter) {
1472                 cleanUpSafeText(strsrch, safetext, safebuffer);
1473                 return USEARCH_DONE;
1474             }
1475             cleanUpSafeText(strsrch, safetext, safebuffer);
1476             safetext = safebuffer;
1477             coleiter = strsrch->textIter;
1478             setColEIterOffset(coleiter, safeoffset);
1479             // status checked at the start of the loop
1480             isSafe = FALSE;
1481             continue;
1482         }
1483         textce = getCE(strsrch, textce);
1484         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
1485             // do the beginning stuff
1486             int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
1487             if (isSafe && failedoffset >= safelength) {
1488                 // alas... no hope. failed at rearranged accent set
1489                 cleanUpSafeText(strsrch, safetext, safebuffer);
1490                 return USEARCH_DONE;
1491             }
1492             else {
1493                 if (isSafe) {
1494                     failedoffset += safeoffset;
1495                     cleanUpSafeText(strsrch, safetext, safebuffer);
1496                 }
1497
1498                 // try rearranging the front accents
1499                 int32_t result = doNextCanonicalPrefixMatch(strsrch,
1500                                         failedoffset, textoffset, status);
1501                 if (result != USEARCH_DONE) {
1502                     // if status is a failure, ucol_setOffset does nothing
1503                     setColEIterOffset(strsrch->textIter, result);
1504                 }
1505                 if (U_FAILURE(*status)) {
1506                     return USEARCH_DONE;
1507                 }
1508                 return result;
1509             }
1510         }
1511         if (textce == ce[ceindex]) {
1512             ceindex --;
1513         }
1514     }
1515     // set offset here
1516     if (isSafe) {
1517         int32_t result     = getColElemIterOffset(coleiter, FALSE);
1518         // sets the text iterator here with the correct expansion and offset
1519         int32_t    leftoverces = getExpansionPrefix(coleiter);
1520         cleanUpSafeText(strsrch, safetext, safebuffer);
1521         if (result >= safelength) {
1522             result = textoffset;
1523         }
1524         else {
1525             result += safeoffset;
1526         }
1527         setColEIterOffset(strsrch->textIter, result);
1528         strsrch->textIter->iteratordata_.toReturn =
1529                        setExpansionPrefix(strsrch->textIter, leftoverces);
1530         return result;
1531     }
1532
1533     return ucol_getOffset(coleiter);
1534 }
1535
1536 /**
1537 * Tries to turn the text ending at textoffset into a canonical match.
1538 * This will try normalizing the end accents and arranging them into canonical
1539 * equivalents and check their corresponding ces with the pattern ce.
1540 * Suffix accents in the text will be grouped according to their combining
1541 * class and the groups will be mixed and matched to try to find the perfect
1542 * match with the pattern.
1543 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1544 * step 1: split "\u030A\u0301\u0325" into 6 other potential accent substrings
1545 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1546 *         "\u0301\u0325".
1547 * step 2: check if any of the generated substrings matches the pattern.
1548 * Internal method, status assumed to be success, caller has to check status
1549 * before calling this method.
1550 * @param strsrch string search data; canonicalSuffixAccents may be rewritten
1551 * @param textoffset end offset in the collation element text that ends with
1552 *                   the accents to be rearranged
1553 * @param status error status if any
1554 * @return TRUE if a rearrangement produced a match, FALSE otherwise
1555 */
1556 static
1557 UBool doNextCanonicalMatch(UStringSearch *strsrch,
1558                            int32_t    textoffset,
1559                            UErrorCode    *status)
1560 {
1561     const UChar       *text = strsrch->search->text;
1562           int32_t  temp = textoffset;
1563     UTF_BACK_1(text, 0, temp); // temp: the code point just before textoffset
1564     if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) { // low FCD byte is 0: only the prefix is tried
1565         UCollationElements *coleiter = strsrch->textIter;
1566         int32_t         offset   = getColElemIterOffset(coleiter, FALSE);
1567         if (strsrch->pattern.hasPrefixAccents) {
1568             offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
1569                                                 status);
1570             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
1571                 setColEIterOffset(coleiter, offset); // reposition textIter at the returned offset
1572                 return TRUE;
1573             }
1574         }
1575         return FALSE;
1576     }
1577
1578     if (!strsrch->pattern.hasSuffixAccents) {
1579         return FALSE;
1580     }
1581
1582     UChar       accents[INITIAL_ARRAY_SIZE_];
1583     // offset to the last base character in substring to search
1584     int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
1585     // decompose (NFD) the text from that base character up to textoffset
1586     unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
1587                                0, accents, INITIAL_ARRAY_SIZE_, status);
1588     // status checked in loop below
1589
1590     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1591     int32_t size = getUnblockedAccentIndex(accents, accentsindex); // NOTE(review): runs before status is checked - confirm accents[] is usable on failure
1592
1593     // count is a bitmask, one bit per accent group; starts with all size bits set
1594     int32_t  count = (2 << (size - 1)) - 1; // NOTE(review): assumes size >= 1; size == 0 would shift by -1 - confirm
1595     while (U_SUCCESS(*status) && count > 0) {
1596         UChar *rearrange = strsrch->canonicalSuffixAccents;
1597         // copy the base characters
1598         for (int k = 0; k < accentsindex[0]; k ++) {
1599             *rearrange ++ = accents[k];
1600         }
1601         // forming all possible canonical rearrangement by dropping
1602         // sets of accents
1603         for (int i = 0; i <= size - 1; i ++) {
1604             int32_t mask = 1 << (size - i - 1); // bit that selects accent group i
1605             if (count & mask) {
1606                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1607                     *rearrange ++ = accents[j];
1608                 }
1609             }
1610         }
1611         *rearrange = 0; // terminate the candidate written to canonicalSuffixAccents
1612         int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
1613                                                         status);
1614         if (offset != USEARCH_DONE) {
1615             return TRUE; // match found
1616         }
1617         count --; // move on to the next combination of accent groups
1618     }
1619     return FALSE;
1620 }
1621
1622 /**
1623 * Moves an offset back to the previous base character, but only when the
1624 * pattern starts with accents.
1625 * @param strsrch string search data
1626 * @param textoffset current offset in the search text
1627 * @return getPreviousBaseOffset(text, textoffset) when hasPrefixAccents is set,
1628 *         textoffset > 0 and the FCD value's upper byte is non-zero; else textoffset
1629 */
1630 static
1631 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
1632                                                       int32_t textoffset)
1633 {
1634     // guard: nothing to adjust without prefix accents or at the text start
1635     if (!strsrch->pattern.hasPrefixAccents || textoffset <= 0) {
1636         return textoffset;
1637     }
1638     const UChar *source = strsrch->search->text;
1639     int32_t      cursor = textoffset; // getFCD takes a pointer; use a copy
1640     return (getFCD(source, &cursor, strsrch->search->textLength) >>
1641                                                    SECOND_LAST_BYTE_SHIFT_)
1642            ? getPreviousBaseOffset(source, textoffset) : textoffset;
1643 }
1644
1645 /**
1646 * Checks match for contraction.
1647 * If the match ends with a partial contraction we fail.
1648 * If the match starts too far off (because of backwards iteration) we try to
1649 * chip off the extra characters.
1650 * Internal method, status assumed to be success, caller has to check status
1651 * before calling this method.
1652 * @param strsrch string search data; its textIter is repositioned and advanced
1653 * @param start offset of potential match, to be modified if necessary
1654 * @param end offset of potential match; on failure it is incremented and then
1655 *        replaced by getNextUStringSearchBaseOffset of that value
1656 * @param status output error status if any
1657 * @return TRUE if match passes the contraction test (or needs none), FALSE otherwise */
1658 static
1659 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
1660                                          int32_t   *start,
1661                                          int32_t   *end,
1662                                          UErrorCode    *status)
1663 {
1664           UCollationElements *coleiter   = strsrch->textIter;
1665           int32_t             textlength = strsrch->search->textLength;
1666           int32_t         temp       = *start;
1667     const UCollator          *collator   = strsrch->collator;
1668     const UChar              *text       = strsrch->search->text;
1669     // This part checks if either ends of the match contains potential
1670     // contraction. If so we'll have to iterate through them
1671     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1672         (*start + 1 < textlength
1673          && ucol_unsafeCP(text[*start + 1], collator))) {
1674         int32_t expansion  = getExpansionPrefix(coleiter); // read before the iterator is reset below
1675         UBool   expandflag = expansion > 0;
1676         setColEIterOffset(coleiter, *start);
1677         while (expansion > 0) {
1678             // getting rid of the redundant ce, caused by setOffset.
1679             // since backward contraction/expansion may have extra ces if we
1680             // are in the normalization buffer, hasAccentsBeforeMatch would
1681             // have taken care of it.
1682             // E.g. the character \u01FA will have an expansion of 3, but if
1683             // we are only looking for acute and ring \u030A and \u0301, we'll
1684             // have to skip the first ce in the expansion buffer.
1685             ucol_next(coleiter, status);
1686             if (U_FAILURE(*status)) {
1687                 return FALSE;
1688             }
1689             if (ucol_getOffset(coleiter) != temp) { // iterator offset moved
1690                 *start = temp;
1691                 temp  = ucol_getOffset(coleiter);
1692             }
1693             expansion --;
1694         }
1695
1696         int32_t  *patternce       = strsrch->pattern.CE;
1697         int32_t   patterncelength = strsrch->pattern.CELength;
1698         int32_t   count           = 0; // number of pattern ces matched so far
1699         int32_t   textlength      = strsrch->search->textLength; // shadows the outer textlength with the same value
1700         while (count < patterncelength) {
1701             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1702             // status checked below, note that if status is a failure
1703             // ucol_next returns UCOL_NULLORDER
1704             if (ce == UCOL_IGNORABLE) {
1705                 continue;
1706             }
1707             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1708                 *start = temp;
1709                 temp   = ucol_getOffset(coleiter);
1710             }
1711
1712             if (count == 0 && ce != patternce[0]) {
1713                 // accents may have extra starting ces, this occurs when a
1714                 // pure accent pattern is matched without rearrangement
1715                 // text \u0325\u0300 and looking for \u0300
1716                 int32_t expected = patternce[0];
1717                 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) { // note: getFCD is handed start itself, not a copy
1718                     ce = getCE(strsrch, ucol_next(coleiter, status));
1719                     while (U_SUCCESS(*status) && ce != expected &&
1720                            ce != UCOL_NULLORDER &&
1721                            ucol_getOffset(coleiter) <= *end) {
1722                         ce = getCE(strsrch, ucol_next(coleiter, status));
1723                     }
1724                 }
1725             }
1726             if (U_FAILURE(*status) || ce != patternce[count]) {
1727                 (*end) ++; // failed: move the end forward for the caller
1728                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1729                 return FALSE;
1730             }
1731             count ++;
1732         }
1733     }
1734     return TRUE;
1735 }
1736
1737 /**
1738 * Validates a forward canonical candidate and records it on success.
1739 * A candidate flagged as a canonical accent match is accepted directly;
1740 * otherwise it must pass, in order:
1741 * <ul>
1742 * <li> the contraction test (checkNextCanonicalContractionMatch)
1743 * <li> not repeating the previous match (checkRepeatedMatch)
1744 * <li> break boundaries (isBreakUnit)
1745 * <li> the identical check (checkIdentical)
1746 * </ul>
1747 * Internal method, status assumed to be success, caller has to check the
1748 * status before calling this method.
1749 * @param strsrch string search data; matchedIndex/matchedLength set on TRUE
1750 * @param textoffset in: end offset of the candidate. out: possibly adjusted
1751 *        by the contraction test, or moved to the next base offset when one
1752 *        of the later checks rejects the candidate.
1753 * @param status output error status if any
1754 * @return TRUE if the match is valid, FALSE otherwise
1755 */
1756 static
1757 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
1758                                      int32_t   *textoffset,
1759                                      UErrorCode    *status)
1760 {
1761     UCollationElements *iter = strsrch->textIter;
1762     // non-empty canonical accent buffers mark a canonical accent match
1763     UBool rearranged = (strsrch->pattern.hasSuffixAccents &&
1764                         strsrch->canonicalSuffixAccents[0] != 0) ||
1765                        (strsrch->pattern.hasPrefixAccents &&
1766                         strsrch->canonicalPrefixAccents[0] != 0);
1767     if (rearranged) {
1768         strsrch->search->matchedIndex  =
1769             getPreviousUStringSearchBaseOffset(strsrch, ucol_getOffset(iter));
1770         strsrch->search->matchedLength =
1771             *textoffset - strsrch->search->matchedIndex;
1772         return TRUE;
1773     }
1774
1775     // the candidate starts where the text iterator currently is
1776     int32_t begin  = getColElemIterOffset(iter, FALSE);
1777     UBool   passed = checkNextCanonicalContractionMatch(strsrch, &begin,
1778                                                         textoffset, status);
1779     if (!passed || U_FAILURE(*status)) {
1780         return FALSE;
1781     }
1782     begin = getPreviousUStringSearchBaseOffset(strsrch, begin);
1783
1784     UBool rejected = checkRepeatedMatch(strsrch, begin, *textoffset) ||
1785                      !isBreakUnit(strsrch, begin, *textoffset) ||
1786                      !checkIdentical(strsrch, begin, *textoffset);
1787     if (rejected) {
1788         // resume the search from the base offset after the failed end
1789         *textoffset = getNextBaseOffset(strsrch->search->text,
1790                           *textoffset + 1, strsrch->search->textLength);
1791         return FALSE;
1792     }
1793
1794     strsrch->search->matchedIndex  = begin;
1795     strsrch->search->matchedLength = *textoffset - begin;
1796     return TRUE; }
1797
1798 /**
1799 * Computes the text offset from which a backward search resumes after a
1800 * failed comparison. The offset is moved towards the start of the text and
1801 * then adjusted by getPreviousUStringSearchBaseOffset.
1802 * Internal method, status assumed to be success, caller has to check status
1803 * before calling this method.
1804 * @param strsrch string search data
1805 * @param textoffset start text position to do search
1806 * @param ce the text ce which failed the match.
1807 * @param patternceindex index of the ce within the pattern ce buffer which
1808 *        failed the match
1809 * @return final offset
1810 */
1811 static
1812 inline int32_t reverseShift(UStringSearch *strsrch,
1813                                 int32_t    textoffset,
1814                                 int32_t       ce,
1815                                 int32_t        patternceindex)
1816 {
1817     int32_t step; // how far textoffset moves back
1818
1819     if (strsrch->search->isOverlap) {
1820         // overlapping search: back up by one, except when starting at
1821         // the end of the text where the default shift is used
1822         step = (textoffset != strsrch->search->textLength)
1823                    ? 1
1824                    : strsrch->pattern.defaultShiftSize;
1825     }
1826     else if (ce == UCOL_NULLORDER) {
1827         // no ce to look up in the shift table
1828         step = strsrch->pattern.defaultShiftSize;
1829     }
1830     else {
1831         step = strsrch->pattern.backShift[hash(ce)];
1832
1833         // the failure happened patternceindex ces into the pattern, so a
1834         // table shift larger than that is reduced by patternceindex - 1
1835         if (patternceindex > 1 && step > patternceindex) {
1836             step -= patternceindex - 1;
1837         }
1838     }
1839
1840     // the shifted position is finally passed through
1841     // getPreviousUStringSearchBaseOffset, exactly once
1842     return getPreviousUStringSearchBaseOffset(strsrch, textoffset - step);
1843 }
1844
1845 /**
1846 * Checks match for contraction.
1847 * If the match starts with a partial contraction we fail.
1848 * Internal method, status assumed to be success, caller has to check status
1849 * before calling this method.
1850 * @param strsrch string search data; its textIter is repositioned and moved back
1851 * @param start offset of potential match; on failure it is decremented and then
1852 *        replaced by getPreviousBaseOffset of that value
1853 * @param end offset of potential match, to be modified if necessary
1854 * @param status output error status if any
1855 * @return TRUE if match passes the contraction test (or needs none), FALSE otherwise */
1856 static
1857 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
1858                                      int32_t   *start,
1859                                      int32_t   *end, UErrorCode  *status)
1860 {
1861           UCollationElements *coleiter   = strsrch->textIter;
1862           int32_t             textlength = strsrch->search->textLength;
1863           int32_t             temp       = *end;
1864     const UCollator          *collator   = strsrch->collator;
1865     const UChar              *text       = strsrch->search->text;
1866     // This part checks if the start of the match contains a potential
1867     // contraction. If so we'll have to iterate through them
1868     // Since we used ucol_next while previously looking for the potential
1869     // match, this guarantees that our end will not be a partial contraction,
1870     // or a partial supplementary character.
1871     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
1872         int32_t expansion  = getExpansionSuffix(coleiter); // read before the iterator is reset below
1873         UBool   expandflag = expansion > 0;
1874         setColEIterOffset(coleiter, *end);
1875         while (U_SUCCESS(*status) && expansion > 0) {
1876             // getting rid of the redundant ce
1877             // since forward contraction/expansion may have extra ces
1878             // if we are in the normalization buffer, hasAccentsBeforeMatch
1879             // would have taken care of it.
1880             // E.g. the character \u01FA will have an expansion of 3, but if
1881             // we are only looking for A ring A\u030A, we'll have to skip the
1882             // last ce in the expansion buffer
1883             ucol_previous(coleiter, status);
1884             if (U_FAILURE(*status)) {
1885                 return FALSE;
1886             }
1887             if (ucol_getOffset(coleiter) != temp) { // iterator offset moved
1888                 *end = temp;
1889                 temp  = ucol_getOffset(coleiter);
1890             }
1891             expansion --;
1892         }
1893
1894         int32_t  *patternce       = strsrch->pattern.CE;
1895         int32_t   patterncelength = strsrch->pattern.CELength;
1896         int32_t   count           = patterncelength; // pattern ces still to match, walking backwards
1897         while (count > 0) {
1898             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
1899             // status checked below, note that if status is a failure
1900             // ucol_previous returns UCOL_NULLORDER
1901             if (ce == UCOL_IGNORABLE) {
1902                 continue;
1903             }
1904             if (expandflag && count == 0 && // NOTE(review): count == 0 is never true inside while (count > 0), so this body cannot run - confirm intent
1905                 getColElemIterOffset(coleiter, FALSE) != temp) {
1906                 *end = temp;
1907                 temp  = ucol_getOffset(coleiter);
1908             }
1909             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
1910                 (*start) --; // failed: move the start back for the caller
1911                 *start = getPreviousBaseOffset(text, *start);
1912                 return FALSE;
1913             }
1914             count --;
1915         }
1916     }
1917     return TRUE;
1918 }
1919
1920 /**
1921 * Validates a backward exact candidate and records it on success.
1922 * The candidate spans from *textoffset to the current offset of the text
1923 * iterator and must pass, in order:
1924 * <ul>
1925 * <li> the contraction test (checkPreviousExactContractionMatch)
1926 * <li> not repeating the last match (checkRepeatedMatch)
1927 * <li> break boundaries (isBreakUnit)
1928 * <li> no accents before the match (hasAccentsBeforeMatch)
1929 * <li> the identical check (checkIdentical)
1930 * <li> no accents after the match (hasAccentsAfterMatch)
1931 * </ul>
1932 * Internal method, status assumed to be success, caller has to check status
1933 * before calling this method.
1934 * @param strsrch string search data; matchedIndex/matchedLength set on TRUE.
1935 *        The iterator and text used here are read from it.
1936 * @param textoffset in: start offset of the candidate. out: possibly
1937 *        adjusted by the contraction test, or moved to the previous base
1938 *        offset when one of the later checks rejects the candidate.
1939 * @param status output error status if any
1940 * @return TRUE if the match is valid, FALSE otherwise
1941 */
1942 static
1943 inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
1944                                      int32_t   *textoffset,
1945                                      UErrorCode    *status)
1946 {
1947     // the candidate ends at the text iterator's current offset
1948     int32_t matchend = ucol_getOffset(strsrch->textIter);
1949     UBool   passed   = checkPreviousExactContractionMatch(strsrch, textoffset,
1950                                                           &matchend, status);
1951     if (!passed || U_FAILURE(*status)) {
1952         return FALSE;
1953     }
1954
1955     UBool rejected = checkRepeatedMatch(strsrch, *textoffset, matchend) ||
1956                      !isBreakUnit(strsrch, *textoffset, matchend) ||
1957                      hasAccentsBeforeMatch(strsrch, *textoffset, matchend) ||
1958                      !checkIdentical(strsrch, *textoffset, matchend) ||
1959                      hasAccentsAfterMatch(strsrch, *textoffset, matchend);
1960     if (rejected) {
1961         // resume the search from the base offset before the failed start
1962         *textoffset = getPreviousBaseOffset(strsrch->search->text,
1963                                             *textoffset - 1);
1964         return FALSE;
1965     }
1966     strsrch->search->matchedIndex  = *textoffset;
1967     strsrch->search->matchedLength = matchend - *textoffset;
1968     return TRUE;
1969 }
1970
1971 /**
1972 * Rearranges the end accents to try matching.
1973 * Suffix accents in the text will be grouped according to their combining
1974 * class and the groups will be mixed and matched to try to find the perfect
1975 * match with the pattern.
1976 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1977 * step 1: split "\u030A\u0301\u0325" into 6 other potential accent substrings
1978 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1979 *         "\u0301\u0325".
1980 * step 2: check if any of the generated substrings matches the pattern.
1981 * Internal method, status assumed to be success, user has to check status
1982 * before calling this method.
1983 * @param strsrch string search data; canonicalSuffixAccents is rewritten
1984 * @param start offset of the first base character
1985 * @param end start of the last accent set
1986 * @param status only error status if any
1987 * @return USEARCH_DONE if a match is not found, otherwise return the ending
1988 *         offset of the match. Note this end includes all following accents.
1989 */
1990 static
1991 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
1992                                            int32_t    start,
1993                                            int32_t    end,
1994                                            UErrorCode    *status)
1995 {
1996     const UChar       *text       = strsrch->search->text;
1997           int32_t  tempend    = end;
1998
1999     UTF_BACK_1(text, 0, tempend);
2000     if (!(getFCD(text, &tempend, strsrch->search->textLength) &
2001                                                            LAST_BYTE_MASK_)) {
2002         // die... failed at a base character
2003         return USEARCH_DONE;
2004     }
2005     end = getNextBaseOffset(text, end, strsrch->search->textLength);
2006
2007     if (U_SUCCESS(*status)) {
2008         UChar       accents[INITIAL_ARRAY_SIZE_];
2009         int32_t offset = getPreviousBaseOffset(text, end);
2010         // decompose (NFD) the last base character and its accents
2011         unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
2012                         INITIAL_ARRAY_SIZE_, status);
2013
2014         int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
2015         int32_t         accentsize = getUnblockedAccentIndex(accents,
2016                                                          accentsindex);
2017         int32_t         count      = (2 << (accentsize - 1)) - 1; // bitmask, one bit per accent group
2018         UChar               buffer[INITIAL_ARRAY_SIZE_];
2019         UCollationElements *coleiter = strsrch->utilIter;
2020         while (U_SUCCESS(*status) && count > 0) {
2021             UChar *rearrange = strsrch->canonicalSuffixAccents;
2022             // copy the base characters
2023             for (int k = 0; k < accentsindex[0]; k ++) {
2024                 *rearrange ++ = accents[k];
2025             }
2026             // forming all possible canonical rearrangement by dropping
2027             // sets of accents
2028             for (int i = 0; i <= accentsize - 1; i ++) {
2029                 int32_t mask = 1 << (accentsize - i - 1);
2030                 if (count & mask) {
2031                     for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2032                         *rearrange ++ = accents[j];
2033                     }
2034                 }
2035             }
2036             *rearrange = 0;
2037             int32_t  matchsize = INITIAL_ARRAY_SIZE_;
2038             UChar   *match     = addToUCharArray(buffer, &matchsize,
2039                                            strsrch->canonicalPrefixAccents,
2040                                            strsrch->search->text + start,
2041                                            offset - start,
2042                                            strsrch->canonicalSuffixAccents,
2043                                            status);
2044
2045             // run the collator iterator through this match
2046             // if status is a failure ucol_setText does nothing
2047             ucol_setText(coleiter, match, matchsize, status);
2048             UBool found = U_SUCCESS(*status) &&
2049                           checkCollationMatch(strsrch, coleiter);
2050             // release a heap-allocated candidate on every pass, not only on
2051             // success, so rejected rearrangements do not leak
2052             if (match != buffer && match != NULL) {
2053                 uprv_free(match);
2054             }
2055             if (found) { return end; }
2056             count --;
2057         }
2058     }
2059     return USEARCH_DONE;
2060 }
2061
2062 /**
2063 * Take the rearranged start accents and tries matching. If match failed at
2064 * a separate following set of accents (separated from the rearranged one by
2065 * at least a base character) then we rearrange those end accents
2066 * (doPreviousCanonicalSuffixMatch) and try matching again.
2067 * We allow skipping of the ends of the accent set if the ces do not match.
2068 * However if the failure is found within the rearranged accent set, it fails.
2069 * Internal method, status assumed to be success, caller has to check status
2070 * before calling this method.
2071 * @param strsrch string search data; reads canonicalPrefixAccents, moves textIter
2072 * @param textoffset text offset at which the rearranged accents are followed by the text
2073 * @param status output error status if any
2074 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2075 *         offset of the match. Note this end includes all following accents.
2076 */
2077 static
2078 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
2079                                            int32_t    textoffset,
2080                                            UErrorCode    *status)
2081 {
2082     const UChar       *text       = strsrch->search->text;
2083     const UCollator   *collator   = strsrch->collator;
2084           int32_t      safelength = 0;
2085           UChar       *safetext;
2086           int32_t      safetextlength;
2087           UChar        safebuffer[INITIAL_ARRAY_SIZE_];
2088           int32_t  safeoffset = textoffset;
2089
2090     if (textoffset &&
2091         ucol_unsafeCP(strsrch->canonicalPrefixAccents[
2092                                  u_strlen(strsrch->canonicalPrefixAccents) - 1 // NOTE(review): index is -1 if the buffer is empty - confirm callers fill it
2093                                          ], collator)) {
2094         safeoffset     = getNextSafeOffset(collator, text, textoffset,
2095                                            strsrch->search->textLength);
2096         safelength     = safeoffset - textoffset;
2097         safetextlength = INITIAL_ARRAY_SIZE_;
2098         safetext       = addToUCharArray(safebuffer, &safetextlength, // prefix accents + text[textoffset, safeoffset)
2099                                          strsrch->canonicalPrefixAccents,
2100                                          text + textoffset, safelength,
2101                                          NULL, status);
2102     }
2103     else {
2104         safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
2105         safetext       = strsrch->canonicalPrefixAccents;
2106     }
2107
2108     UCollationElements *coleiter = strsrch->utilIter;
2109      // if status is a failure, ucol_setText does nothing
2110     ucol_setText(coleiter, safetext, safetextlength, status);
2111     // status checked in loop below
2112
2113     int32_t  *ce           = strsrch->pattern.CE;
2114     int32_t   celength     = strsrch->pattern.CELength;
2115     int       ceindex      = 0;
2116     UBool     isSafe       = TRUE; // safe zone indication flag for position
2117     int32_t   prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
2118
2119     while (ceindex < celength) {
2120         int32_t textce = ucol_next(coleiter, status);
2121         if (U_FAILURE(*status)) {
2122             if (isSafe) {
2123                 cleanUpSafeText(strsrch, safetext, safebuffer);
2124             }
2125             return USEARCH_DONE;
2126         }
2127         if (textce == UCOL_NULLORDER) {
2128             // check if we have passed the safe buffer
2129             if (coleiter == strsrch->textIter) {
2130                 cleanUpSafeText(strsrch, safetext, safebuffer);
2131                 return USEARCH_DONE;
2132             }
2133             cleanUpSafeText(strsrch, safetext, safebuffer);
2134             safetext = safebuffer; // later cleanUpSafeText calls now see safetext == safebuffer
2135             coleiter = strsrch->textIter; // continue in the real text from safeoffset
2136             setColEIterOffset(coleiter, safeoffset);
2137             // status checked at the start of the loop
2138             isSafe = FALSE;
2139             continue;
2140         }
2141         textce = getCE(strsrch, textce);
2142         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
2143             // do the beginning stuff
2144             int32_t failedoffset = ucol_getOffset(coleiter);
2145             if (isSafe && failedoffset <= prefixlength) {
2146                 // alas... no hope. failed at rearranged accent set
2147                 cleanUpSafeText(strsrch, safetext, safebuffer);
2148                 return USEARCH_DONE;
2149             }
2150             else {
2151                 if (isSafe) {
2152                     failedoffset = safeoffset - failedoffset; // convert the safe-buffer offset before the suffix retry
2153                     cleanUpSafeText(strsrch, safetext, safebuffer);
2154                 }
2155
2156                 // try rearranging the end accents
2157                 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
2158                                         textoffset, failedoffset, status);
2159                 if (result != USEARCH_DONE) {
2160                     // textIter is moved to result before status is checked below
2161                     setColEIterOffset(strsrch->textIter, result);
2162                 }
2163                 if (U_FAILURE(*status)) {
2164                     return USEARCH_DONE;
2165                 }
2166                 return result;
2167             }
2168         }
2169         if (textce == ce[ceindex]) {
2170             ceindex ++;
2171         }
2172     }
2173     // set offset here
2174     if (isSafe) { // the whole pattern matched inside the safe buffer
2175         int32_t result      = ucol_getOffset(coleiter);
2176         // sets the text iterator here with the correct expansion and offset
2177         int32_t     leftoverces = getExpansionSuffix(coleiter);
2178         cleanUpSafeText(strsrch, safetext, safebuffer);
2179         if (result <= prefixlength) {
2180             result = textoffset;
2181         }
2182         else {
2183             result = textoffset + (safeoffset - result);
2184         }
2185         setColEIterOffset(strsrch->textIter, result);
2186         setExpansionSuffix(strsrch->textIter, leftoverces);
2187         return result;
2188     }
2189
2190     return ucol_getOffset(coleiter); // matched while iterating the real text
2191 }
2192
2193 /**
2194 * Trying out the substring and sees if it can be a canonical match.
2195 * This will try normalizing the starting accents and arranging them into
2196 * canonical equivalents and check their corresponding ces with the pattern ce.
2197 * Prefix accents in the text will be grouped according to their combining
2198 * class and the groups will be mixed and matched to try find the perfect
2199 * match with the pattern.
2200 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2201 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2202 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2203 *         "\u0301\u0325".
2204 * step 2: check if any of the generated substrings matches the pattern.
2205 * Internal method, status assumed to be success, caller has to check status
2206 * before calling this method.
2207 * @param strsrch string search data
2208 * @param textoffset start offset in the collation element text that starts
2209 *                   with the accents to be rearranged
2210 * @param status output error status if any
2211 * @return TRUE if the match is valid, FALSE otherwise
2212 */
2213 static
2214 UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
2215                                int32_t    textoffset,
2216                                UErrorCode    *status)
2217 {
2218     const UChar       *text       = strsrch->search->text;
2219           int32_t  temp       = textoffset;
2220           int32_t      textlength = strsrch->search->textLength;
2221     if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
2222         UCollationElements *coleiter = strsrch->textIter;
2223         int32_t         offset   = ucol_getOffset(coleiter);
2224         if (strsrch->pattern.hasSuffixAccents) {
2225             offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
2226                                                     offset, status);
2227             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
2228                 setColEIterOffset(coleiter, offset);
2229                 return TRUE;
2230             }
2231         }
2232         return FALSE;
2233     }
2234
2235     if (!strsrch->pattern.hasPrefixAccents) {
2236         return FALSE;
2237     }
2238
2239     UChar       accents[INITIAL_ARRAY_SIZE_];
2240     // offset to the last base character in substring to search
2241     int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
2242     // normalizing the offensive string
2243     unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
2244                                0, accents, INITIAL_ARRAY_SIZE_, status);
2245     // status checked in loop
2246
2247     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2248     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
2249
2250     // 2 power n - 1 plus the full set of accents
2251     int32_t  count = (2 << (size - 1)) - 1;
2252     while (U_SUCCESS(*status) && count > 0) {
2253         UChar *rearrange = strsrch->canonicalPrefixAccents;
2254         // copy the base characters
2255         for (int k = 0; k < accentsindex[0]; k ++) {
2256             *rearrange ++ = accents[k];
2257         }
2258         // forming all possible canonical rearrangement by dropping
2259         // sets of accents
2260         for (int i = 0; i <= size - 1; i ++) {
2261             int32_t mask = 1 << (size - i - 1);
2262             if (count & mask) {
2263                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2264                     *rearrange ++ = accents[j];
2265                 }
2266             }
2267         }
2268         *rearrange = 0;
2269         int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
2270                                                           baseoffset, status);
2271         if (offset != USEARCH_DONE) {
2272             return TRUE; // match found
2273         }
2274         count --;
2275     }
2276     return FALSE;
2277 }
2278
/**
* Checks match for contraction.
* If the match starts with a partial contraction we fail.
* The check is only performed when the character at *start is flagged by
* ucol_unsafeCP for the collator; otherwise the match is accepted as is.
* Internal method, status assumed to be success, caller has to check status
* before calling this method.
* @param strsrch string search data
* @param start offset of potential match, to be modified if necessary. On a
*              failed check it is moved back to the previous base offset.
* @param end offset of potential match, to be modified if necessary
* @param status output error status if any
* @return TRUE if match passes the contraction test, FALSE otherwise
*/
static
UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
                                     int32_t   *start,
                                     int32_t   *end, UErrorCode  *status)
{
          UCollationElements *coleiter   = strsrch->textIter;
          int32_t             textlength = strsrch->search->textLength;
          int32_t         temp       = *end;
    const UCollator          *collator   = strsrch->collator;
    const UChar              *text       = strsrch->search->text;
    // This part checks whether the start of the match contains a potential
    // contraction. If so we'll have to iterate through them.
    // Since we used ucol_next while previously looking for the potential
    // match, this guarantees that our end will not be a partial contraction,
    // or a partial supplementary character.
    if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
        int32_t expansion  = getExpansionSuffix(coleiter);
        UBool   expandflag = expansion > 0;
        // restart the backwards iteration from the end of the potential match
        setColEIterOffset(coleiter, *end);
        while (expansion > 0) {
            // getting rid of the redundant ce
            // since forward contraction/expansion may have extra ces
            // if we are in the normalization buffer, hasAccentsBeforeMatch
            // would have taken care of it.
            // E.g. the character \u01FA will have an expansion of 3, but if
            // we are only looking for A ring A\u030A, we'll have to skip the
            // last ce in the expansion buffer
            ucol_previous(coleiter, status);
            if (U_FAILURE(*status)) {
                return FALSE;
            }
            // the iterator moved to a different text offset: shift the end
            // of the match and remember the new offset
            if (ucol_getOffset(coleiter) != temp) {
                *end = temp;
                temp  = ucol_getOffset(coleiter);
            }
            expansion --;
        }

        int32_t  *patternce       = strsrch->pattern.CE;
        int32_t   patterncelength = strsrch->pattern.CELength;
        int32_t   count           = patterncelength;
        // walk the text backwards, comparing against the pattern ces from
        // the last one to the first; count is the number of pattern ces
        // still to be matched
        while (count > 0) {
            int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
            // status checked below, note that if status is a failure
            // ucol_previous returns UCOL_NULLORDER
            if (ce == UCOL_IGNORABLE) {
                // ignorable text ces do not consume a pattern ce
                continue;
            }
            // NOTE(review): count is always > 0 inside this loop, so the
            // count == 0 condition below can never hold and this branch
            // looks unreachable - confirm the intended condition.
            if (expandflag && count == 0 &&
                getColElemIterOffset(coleiter, FALSE) != temp) {
                *end = temp;
                temp  = ucol_getOffset(coleiter);
            }
            // only on the first comparison (last pattern ce)
            if (count == patterncelength &&
                ce != patternce[patterncelength - 1]) {
                // accents may have extra starting ces, this occurs when a
                // pure accent pattern is matched without rearrangement
                int32_t    expected = patternce[patterncelength - 1];
                UTF_BACK_1(text, 0, *end);
                // note that getFCD receives end itself and may modify *end
                if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
                    // skip backwards until the expected ce shows up, the
                    // iterator runs out, or the offset condition fails
                    ce = getCE(strsrch, ucol_previous(coleiter, status));
                    while (U_SUCCESS(*status) && ce != expected &&
                           ce != UCOL_NULLORDER &&
                           ucol_getOffset(coleiter) <= *start) {
                        ce = getCE(strsrch, ucol_previous(coleiter, status));
                    }
                }
            }
            if (U_FAILURE(*status) || ce != patternce[count - 1]) {
                // mismatch: move the start back to the previous base
                // character so the caller resumes the search from there
                (*start) --;
                *start = getPreviousBaseOffset(text, *start);
                return FALSE;
            }
            count --;
        }
    }
    return TRUE;
}
2368
2369 /**
2370 * Checks and sets the match information if found.
2371 * Checks
2372 * <ul>
2373 * <li> the potential match does not repeat the previous match
2374 * <li> boundaries are correct
2375 * <li> potential match does not end in the middle of a contraction
2376 * <li> identical matches
2377 * <\ul>
2378 * Otherwise the offset will be shifted to the next character.
2379 * Internal method, status assumed to be success, caller has to check status
2380 * before calling this method.
2381 * @param strsrch string search data
2382 * @param textoffset offset in the collation element text. the returned value
2383 *        will be the truncated start offset of the match or the new start
2384 *        search offset.
2385 * @param status only error status if any
2386 * @return TRUE if the match is valid, FALSE otherwise
2387 */
2388 static
2389 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
2390                                          int32_t   *textoffset,
2391                                          UErrorCode    *status)
2392 {
2393     // to ensure that the start and ends are not composite characters
2394     UCollationElements *coleiter = strsrch->textIter;
2395     // if we have a canonical accent match
2396     if ((strsrch->pattern.hasSuffixAccents &&
2397         strsrch->canonicalSuffixAccents[0]) ||
2398         (strsrch->pattern.hasPrefixAccents &&
2399         strsrch->canonicalPrefixAccents[0])) {
2400         strsrch->search->matchedIndex  = *textoffset;
2401         strsrch->search->matchedLength =
2402             getNextUStringSearchBaseOffset(strsrch,
2403                                       getColElemIterOffset(coleiter, FALSE))
2404             - *textoffset;
2405         return TRUE;
2406     }
2407
2408     int32_t end = ucol_getOffset(coleiter);
2409     if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
2410                                                 status) ||
2411          U_FAILURE(*status)) {
2412         return FALSE;
2413     }
2414
2415     end = getNextUStringSearchBaseOffset(strsrch, end);
2416     // this totally matches, however we need to check if it is repeating
2417     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2418         !isBreakUnit(strsrch, *textoffset, end) ||
2419         !checkIdentical(strsrch, *textoffset, end)) {
2420         (*textoffset) --;
2421         *textoffset = getPreviousBaseOffset(strsrch->search->text,
2422                                             *textoffset);
2423         return FALSE;
2424     }
2425
2426     strsrch->search->matchedIndex  = *textoffset;
2427     strsrch->search->matchedLength = end - *textoffset;
2428     return TRUE;
2429 }
2430
2431 // constructors and destructor -------------------------------------------
2432
2433 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
2434                                           int32_t         patternlength,
2435                                     const UChar          *text,
2436                                           int32_t         textlength,
2437                                     const char           *locale,
2438                                           UBreakIterator *breakiter,
2439                                           UErrorCode     *status)
2440 {
2441     if (U_FAILURE(*status)) {
2442         return NULL;
2443     }
2444 #if UCONFIG_NO_BREAK_ITERATION
2445     if (breakiter != NULL) {
2446         *status = U_UNSUPPORTED_ERROR;
2447         return NULL;
2448     }
2449 #endif
2450     if (locale) {
2451         // ucol_open internally checks for status
2452         UCollator     *collator = ucol_open(locale, status);
2453         // pattern, text checks are done in usearch_openFromCollator
2454         UStringSearch *result   = usearch_openFromCollator(pattern,
2455                                               patternlength, text, textlength,
2456                                               collator, breakiter, status);
2457
2458         if (result == NULL || U_FAILURE(*status)) {
2459             if (collator) {
2460                 ucol_close(collator);
2461             }
2462             return NULL;
2463         }
2464         else {
2465             result->ownCollator = TRUE;
2466         }
2467         return result;
2468     }
2469     *status = U_ILLEGAL_ARGUMENT_ERROR;
2470     return NULL;
2471 }
2472
2473 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
2474                                   const UChar          *pattern,
2475                                         int32_t         patternlength,
2476                                   const UChar          *text,
2477                                         int32_t         textlength,
2478                                   const UCollator      *collator,
2479                                         UBreakIterator *breakiter,
2480                                         UErrorCode     *status)
2481 {
2482     if (U_FAILURE(*status)) {
2483         return NULL;
2484     }
2485 #if UCONFIG_NO_BREAK_ITERATION
2486     if (breakiter != NULL) {
2487         *status = U_UNSUPPORTED_ERROR;
2488         return NULL;
2489     }
2490 #endif
2491     if (pattern == NULL || text == NULL || collator == NULL) {
2492         *status = U_ILLEGAL_ARGUMENT_ERROR;
2493         return NULL;
2494     }
2495
2496     // string search does not really work when numeric collation is turned on
2497     if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
2498         *status = U_UNSUPPORTED_ERROR;
2499         return NULL;
2500     }
2501
2502     if (U_SUCCESS(*status)) {
2503         initializeFCD(status);
2504         if (U_FAILURE(*status)) {
2505             return NULL;
2506         }
2507
2508         UStringSearch *result;
2509         if (textlength == -1) {
2510             textlength = u_strlen(text);
2511         }
2512         if (patternlength == -1) {
2513             patternlength = u_strlen(pattern);
2514         }
2515         if (textlength <= 0 || patternlength <= 0) {
2516             *status = U_ILLEGAL_ARGUMENT_ERROR;
2517             return NULL;
2518         }
2519
2520         result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
2521         if (result == NULL) {
2522             *status = U_MEMORY_ALLOCATION_ERROR;
2523             return NULL;
2524         }
2525
2526         result->collator    = collator;
2527         result->strength    = ucol_getStrength(collator);
2528         result->ceMask      = getMask(result->strength);
2529         result->toShift     =
2530              ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2531                                                             UCOL_SHIFTED;
2532         result->variableTop = ucol_getVariableTop(collator, status);
2533
2534         if (U_FAILURE(*status)) {
2535             uprv_free(result);
2536             return NULL;
2537         }
2538
2539         result->search             = (USearch *)uprv_malloc(sizeof(USearch));
2540         if (result->search == NULL) {
2541             *status = U_MEMORY_ALLOCATION_ERROR;
2542             uprv_free(result);
2543             return NULL;
2544         }
2545
2546         result->search->text       = text;
2547         result->search->textLength = textlength;
2548
2549         result->pattern.text       = pattern;
2550         result->pattern.textLength = patternlength;
2551         result->pattern.CE         = NULL;
2552
2553         result->search->breakIter  = breakiter;
2554 #if !UCONFIG_NO_BREAK_ITERATION
2555         if (breakiter) {
2556             ubrk_setText(breakiter, text, textlength, status);
2557         }
2558 #endif
2559
2560         result->ownCollator           = FALSE;
2561         result->search->matchedLength = 0;
2562         result->search->matchedIndex  = USEARCH_DONE;
2563         result->textIter              = ucol_openElements(collator, text,
2564                                                           textlength, status);
2565         if (U_FAILURE(*status)) {
2566             usearch_close(result);
2567             return NULL;
2568         }
2569
2570         result->utilIter              = NULL;
2571
2572         result->search->isOverlap          = FALSE;
2573         result->search->isCanonicalMatch   = FALSE;
2574         result->search->isForwardSearching = TRUE;
2575         result->search->reset              = TRUE;
2576
2577         initialize(result, status);
2578
2579         if (U_FAILURE(*status)) {
2580             usearch_close(result);
2581             return NULL;
2582         }
2583
2584         return result;
2585     }
2586     return NULL;
2587 }
2588
2589 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
2590 {
2591     if (strsrch) {
2592         if (strsrch->pattern.CE != strsrch->pattern.CEBuffer &&
2593             strsrch->pattern.CE) {
2594             uprv_free(strsrch->pattern.CE);
2595         }
2596         ucol_closeElements(strsrch->textIter);
2597         ucol_closeElements(strsrch->utilIter);
2598         if (strsrch->ownCollator && strsrch->collator) {
2599             ucol_close((UCollator *)strsrch->collator);
2600         }
2601         uprv_free(strsrch->search);
2602         uprv_free(strsrch);
2603     }
2604 }
2605
2606 // set and get methods --------------------------------------------------
2607
2608 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
2609                                         int32_t    position,
2610                                         UErrorCode    *status)
2611 {
2612     if (U_SUCCESS(*status) && strsrch) {
2613         if (isOutOfBounds(strsrch->search->textLength, position)) {
2614             *status = U_INDEX_OUTOFBOUNDS_ERROR;
2615         }
2616         else {
2617             setColEIterOffset(strsrch->textIter, position);
2618         }
2619         strsrch->search->matchedIndex  = USEARCH_DONE;
2620         strsrch->search->matchedLength = 0;
2621         strsrch->search->reset         = FALSE;
2622     }
2623 }
2624
2625 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
2626 {
2627     if (strsrch) {
2628         int32_t result = ucol_getOffset(strsrch->textIter);
2629         if (isOutOfBounds(strsrch->search->textLength, result)) {
2630             return USEARCH_DONE;
2631         }
2632         return result;
2633     }
2634     return USEARCH_DONE;
2635 }
2636
2637 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
2638                                  USearchAttribute attribute,
2639                                  USearchAttributeValue value,
2640                                  UErrorCode *status)
2641 {
2642     if (U_SUCCESS(*status) && strsrch) {
2643         switch (attribute)
2644         {
2645         case USEARCH_OVERLAP :
2646             strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
2647             break;
2648         case USEARCH_CANONICAL_MATCH :
2649             strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
2650                                                                       FALSE);
2651             break;
2652         case USEARCH_ATTRIBUTE_COUNT :
2653         default:
2654             *status = U_ILLEGAL_ARGUMENT_ERROR;
2655         }
2656     }
2657     if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
2658         *status = U_ILLEGAL_ARGUMENT_ERROR;
2659     }
2660 }
2661
2662 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
2663                                                 const UStringSearch *strsrch,
2664                                                 USearchAttribute attribute)
2665 {
2666     if (strsrch) {
2667         switch (attribute) {
2668         case USEARCH_OVERLAP :
2669             return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
2670                                                         USEARCH_OFF);
2671         case USEARCH_CANONICAL_MATCH :
2672             return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
2673                                                                USEARCH_OFF);
2674         case USEARCH_ATTRIBUTE_COUNT :
2675             return USEARCH_DEFAULT;
2676         }
2677     }
2678     return USEARCH_DEFAULT;
2679 }
2680
2681 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
2682                                                 const UStringSearch *strsrch)
2683 {
2684     if (strsrch == NULL) {
2685         return USEARCH_DONE;
2686     }
2687     return strsrch->search->matchedIndex;
2688 }
2689
2690
2691 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
2692                                             UChar         *result,
2693                                             int32_t        resultCapacity,
2694                                             UErrorCode    *status)
2695 {
2696     if (U_FAILURE(*status)) {
2697         return USEARCH_DONE;
2698     }
2699     if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
2700         result == NULL)) {
2701         *status = U_ILLEGAL_ARGUMENT_ERROR;
2702         return USEARCH_DONE;
2703     }
2704
2705     int32_t     copylength = strsrch->search->matchedLength;
2706     int32_t copyindex  = strsrch->search->matchedIndex;
2707     if (copyindex == USEARCH_DONE) {
2708         u_terminateUChars(result, resultCapacity, 0, status);
2709         return USEARCH_DONE;
2710     }
2711
2712     if (resultCapacity < copylength) {
2713         copylength = resultCapacity;
2714     }
2715     if (copylength > 0) {
2716         uprv_memcpy(result, strsrch->search->text + copyindex,
2717                     copylength * sizeof(UChar));
2718     }
2719     return u_terminateUChars(result, resultCapacity,
2720                              strsrch->search->matchedLength, status);
2721 }
2722
2723 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
2724                                               const UStringSearch *strsrch)
2725 {
2726     if (strsrch) {
2727         return strsrch->search->matchedLength;
2728     }
2729     return USEARCH_DONE;
2730 }
2731
2732 #if !UCONFIG_NO_BREAK_ITERATION
2733
2734 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch  *strsrch,
2735                                                UBreakIterator *breakiter,
2736                                                UErrorCode     *status)
2737 {
2738     if (U_SUCCESS(*status) && strsrch) {
2739         strsrch->search->breakIter = breakiter;
2740         if (breakiter) {
2741             ubrk_setText(breakiter, strsrch->search->text,
2742                          strsrch->search->textLength, status);
2743         }
2744     }
2745 }
2746
2747 U_CAPI const UBreakIterator* U_EXPORT2
2748 usearch_getBreakIterator(const UStringSearch *strsrch)
2749 {
2750     if (strsrch) {
2751         return strsrch->search->breakIter;
2752     }
2753     return NULL;
2754 }
2755
2756 #endif
2757
2758 U_CAPI void U_EXPORT2 usearch_setText(      UStringSearch *strsrch,
2759                                       const UChar         *text,
2760                                             int32_t        textlength,
2761                                             UErrorCode    *status)
2762 {
2763     if (U_SUCCESS(*status)) {
2764         if (strsrch == NULL || text == NULL || textlength < -1 ||
2765             textlength == 0) {
2766             *status = U_ILLEGAL_ARGUMENT_ERROR;
2767         }
2768         else {
2769             if (textlength == -1) {
2770                 textlength = u_strlen(text);
2771             }
2772             strsrch->search->text       = text;
2773             strsrch->search->textLength = textlength;
2774             ucol_setText(strsrch->textIter, text, textlength, status);
2775             strsrch->search->matchedIndex  = USEARCH_DONE;
2776             strsrch->search->matchedLength = 0;
2777             strsrch->search->reset         = TRUE;
2778 #if !UCONFIG_NO_BREAK_ITERATION
2779             if (strsrch->search->breakIter != NULL) {
2780                 ubrk_setText(strsrch->search->breakIter, text,
2781                              textlength, status);
2782             }
2783 #endif
2784         }
2785     }
2786 }
2787
2788 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
2789                                                      int32_t       *length)
2790 {
2791     if (strsrch) {
2792         *length = strsrch->search->textLength;
2793         return strsrch->search->text;
2794     }
2795     return NULL;
2796 }
2797
2798 U_CAPI void U_EXPORT2 usearch_setCollator(      UStringSearch *strsrch,
2799                                           const UCollator     *collator,
2800                                                 UErrorCode    *status)
2801 {
2802     if (U_SUCCESS(*status)) {
2803         if (collator == NULL) {
2804             *status = U_ILLEGAL_ARGUMENT_ERROR;
2805             return;
2806         }
2807         if (strsrch) {
2808             if (strsrch->ownCollator && (strsrch->collator != collator)) {
2809                 ucol_close((UCollator *)strsrch->collator);
2810                 strsrch->ownCollator = FALSE;
2811             }
2812             strsrch->collator    = collator;
2813             strsrch->strength    = ucol_getStrength(collator);
2814             strsrch->ceMask      = getMask(strsrch->strength);
2815             // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
2816             strsrch->toShift     =
2817                ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2818                                                                 UCOL_SHIFTED;
2819             // if status is a failure, ucol_getVariableTop returns 0
2820             strsrch->variableTop = ucol_getVariableTop(collator, status);
2821             if (U_SUCCESS(*status)) {
2822                 initialize(strsrch, status);
2823                 if (U_SUCCESS(*status)) {
2824                     uprv_init_collIterate(collator, strsrch->search->text,
2825                                           strsrch->search->textLength,
2826                                           &(strsrch->textIter->iteratordata_));
2827                     strsrch->utilIter->iteratordata_.coll = collator;
2828                 }
2829             }
2830         }
2831     }
2832 }
2833
2834 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
2835 {
2836     if (strsrch) {
2837         return (UCollator *)strsrch->collator;
2838     }
2839     return NULL;
2840 }
2841
2842 U_CAPI void U_EXPORT2 usearch_setPattern(      UStringSearch *strsrch,
2843                                          const UChar         *pattern,
2844                                                int32_t        patternlength,
2845                                                UErrorCode    *status)
2846 {
2847     if (U_SUCCESS(*status)) {
2848         if (strsrch == NULL || pattern == NULL) {
2849             *status = U_ILLEGAL_ARGUMENT_ERROR;
2850         }
2851         else {
2852             if (patternlength == -1) {
2853                 patternlength = u_strlen(pattern);
2854             }
2855             if (patternlength == 0) {
2856                 *status = U_ILLEGAL_ARGUMENT_ERROR;
2857                 return;
2858             }
2859             strsrch->pattern.text       = pattern;
2860             strsrch->pattern.textLength = patternlength;
2861             initialize(strsrch, status);
2862         }
2863     }
2864 }
2865
2866 U_CAPI const UChar* U_EXPORT2
2867 usearch_getPattern(const UStringSearch *strsrch,
2868                    int32_t       *length)
2869 {
2870     if (strsrch) {
2871         *length = strsrch->pattern.textLength;
2872         return strsrch->pattern.text;
2873     }
2874     return NULL;
2875 }
2876
2877 // miscellanous methods --------------------------------------------------
2878
2879 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
2880                                            UErrorCode    *status)
2881 {
2882     if (strsrch && U_SUCCESS(*status)) {
2883         strsrch->search->isForwardSearching = TRUE;
2884         usearch_setOffset(strsrch, 0, status);
2885         if (U_SUCCESS(*status)) {
2886             return usearch_next(strsrch, status);
2887         }
2888     }
2889     return USEARCH_DONE;
2890 }
2891
2892 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
2893                                                int32_t    position,
2894                                                UErrorCode    *status)
2895 {
2896     if (strsrch && U_SUCCESS(*status)) {
2897         strsrch->search->isForwardSearching = TRUE;
2898         // position checked in usearch_setOffset
2899         usearch_setOffset(strsrch, position, status);
2900         if (U_SUCCESS(*status)) {
2901             return usearch_next(strsrch, status);
2902         }
2903     }
2904     return USEARCH_DONE;
2905 }
2906
2907 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
2908                                           UErrorCode    *status)
2909 {
2910     if (strsrch && U_SUCCESS(*status)) {
2911         strsrch->search->isForwardSearching = FALSE;
2912         usearch_setOffset(strsrch, strsrch->search->textLength, status);
2913         if (U_SUCCESS(*status)) {
2914             return usearch_previous(strsrch, status);
2915         }
2916     }
2917     return USEARCH_DONE;
2918 }
2919
2920 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
2921                                                int32_t    position,
2922                                                UErrorCode    *status)
2923 {
2924     if (strsrch && U_SUCCESS(*status)) {
2925         strsrch->search->isForwardSearching = FALSE;
2926         // position checked in usearch_setOffset
2927         usearch_setOffset(strsrch, position, status);
2928         if (U_SUCCESS(*status)) {
2929             return usearch_previous(strsrch, status);
2930         }
2931     }
2932     return USEARCH_DONE;
2933 }
2934
2935 /**
2936 * If a direction switch is required, we'll count the number of ces till the
2937 * beginning of the collation element iterator and iterate forwards that
2938 * number of times. This is so that we get to the correct point within the
2939 * string to continue the search in. Imagine when we are in the middle of the
2940 * normalization buffer when the change in direction is requested.
2941 * After searching the offset within the collation element iterator will be
2942 * shifted to the start of the match. If a match is not found, the offset would
2943 * have been set to the end of the text string in the collation element
2944 * iterator.
2945 * Okay, here's my take on the normalization buffer. The only time when there can
2946 * be 2 matches within the same normalization is when the pattern consists
2947 * of all accents. But since the offset returned is from the text string, we
2948 * should not confuse the caller by returning the second match within the
2949 * same normalization buffer. If we do, the 2 results will have the same match
2950 * offsets, and that'll be confusing. I'll return the next match that doesn't
2951 * fall within the same normalization buffer. Note this does not affect the
2952 * results of matches spanning the text and the normalization buffer.
2953 * The position to start searching is taken from the collation element
2954 * iterator. Callers of this API would have to set the offset in the collation
2955 * element iterator before using this method.
2956 */
2957 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
2958                                           UErrorCode    *status)
2959 {
         // Returns search->matchedIndex after one forward step, or
         // USEARCH_DONE when strsrch is NULL, when *status is a failure on
         // entry or after the step, or when the pre-check below decides there
         // is no room left for a match.
         // NOTE(review): the header comment describes counting CEs on a
         // direction switch; the code below instead re-reports the stored
         // match (see the else branch) - confirm which is intended.
2960     if (U_SUCCESS(*status) && strsrch) {
2961         // note offset is either equivalent to the start of the previous match
2962         // or is set by the user
2963         int32_t      offset       = usearch_getOffset(strsrch);
2964         USearch     *search       = strsrch->search;
2965         search->reset             = FALSE;
2966         int32_t      textlength   = search->textLength;
2967         if (search->isForwardSearching) {
                 // Already moving forward: give up early when the offset is at
                 // the end of the text or, for non-overlapping searches, when
                 // neither a default shift nor the previous match length fits
                 // before the end of the text.
2968             if (offset == textlength
2969                 || (!search->isOverlap &&
2970                     (offset + strsrch->pattern.defaultShiftSize > textlength ||
2971                     (search->matchedIndex != USEARCH_DONE &&
2972                      offset + search->matchedLength >= textlength)))) {
2973                 // not enough characters to match
2974                 setMatchNotFound(strsrch);
2975                 return USEARCH_DONE;
2976             }
2977         }
2978         else {
2979             // switching direction.
2980             // if matchedIndex == USEARCH_DONE, it means that either a
2981             // setOffset has been called or that previous ran off the text
2982             // string. the iterator would have been set to offset 0 if a
2983             // match is not found.
2984             search->isForwardSearching = TRUE;
2985             if (search->matchedIndex != USEARCH_DONE) {
2986                 // there's no need to set the collation element iterator
2987                 // the next call to next will set the offset.
                     // i.e. the first forward call after a backward match
                     // re-reports that same match without searching.
2988                 return search->matchedIndex;
2989             }
2990         }
2991
             // NOTE(review): no call between the entry check and this point
             // receives status, so this re-check looks redundant - confirm.
2992         if (U_SUCCESS(*status)) {
2993             if (strsrch->pattern.CELength == 0) {
                     // No pattern CEs to compare: report zero-length matches,
                     // starting at the current offset and then advancing one
                     // code point per call.
2994                 if (search->matchedIndex == USEARCH_DONE) {
2995                     search->matchedIndex = offset;
2996                 }
2997                 else { // moves by codepoints
2998                     UTF_FWD_1(search->text, search->matchedIndex, textlength);
2999                 }
3000
3001                 search->matchedLength = 0;
3002                 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3003                 // status checked below
                     // reaching the end of the text ends the iteration
3004                 if (search->matchedIndex == textlength) {
3005                     search->matchedIndex = USEARCH_DONE;
3006                 }
3007             }
3008             else {
3009                 if (search->matchedLength > 0) {
3010                     // if matchlength is 0 we are at the start of the iteration
                         // overlapping searches resume one unit past the start
                         // of the previous match, others resume after its end
3011                     if (search->isOverlap) {
3012                         ucol_setOffset(strsrch->textIter, offset + 1, status);
3013                     }
3014                     else {
3015                         ucol_setOffset(strsrch->textIter,
3016                                        offset + search->matchedLength, status);
3017                     }
3018                 }
3019                 else {
3020                     // for boundary check purposes. this will ensure that the
3021                     // next match will not precede the current offset
3022                     // note search->matchedIndex will always be set to something
3023                     // in the code
3024                     search->matchedIndex = offset - 1;
3025                 }
3026
3027                 if (search->isCanonicalMatch) {
3028                     // can't use exact here since extra accents are allowed.
3029                     usearch_handleNextCanonical(strsrch, status);
3030                 }
3031                 else {
3032                     usearch_handleNextExact(strsrch, status);
3033                 }
3034             }
3035
3036             if (U_FAILURE(*status)) {
3037                 return USEARCH_DONE;
3038             }
3039
3040             return search->matchedIndex;
3041         }
3042     }
3043     return USEARCH_DONE;
3044 }
3045
3046 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
3047                                               UErrorCode *status)
3048 {
         // Returns search->matchedIndex after one backward step, or
         // USEARCH_DONE when strsrch is NULL, when *status is a failure on
         // entry or after the step, or when the pre-check below decides there
         // is no room left for a match.
3049     if (U_SUCCESS(*status) && strsrch) {
3050         int32_t offset;
3051         USearch *search = strsrch->search;
3052         if (search->reset) {
                 // first call since the reset flag was set: start from the
                 // end of the text and search backward
3053             offset                     = search->textLength;
3054             search->isForwardSearching = FALSE;
3055             search->reset              = FALSE;
3056             setColEIterOffset(strsrch->textIter, offset);
3057         }
3058         else {
3059             offset = usearch_getOffset(strsrch);
3060         }
3061
3062         int32_t matchedindex = search->matchedIndex;
3063         if (search->isForwardSearching == TRUE) {
3064             // switching direction.
3065             // if matchedIndex == USEARCH_DONE, it means that either a
3066             // setOffset has been called or that next ran off the text
3067             // string. the iterator would have been set to offset textLength if
3068             // a match is not found.
3069             search->isForwardSearching = FALSE;
3070             if (matchedindex != USEARCH_DONE) {
                     // the first backward call after a forward match
                     // re-reports that same match without searching
3071                 return matchedindex;
3072             }
3073         }
3074         else {
                 // Already moving backward: give up early when the offset or
                 // the previous match is at 0 or, for non-overlapping searches,
                 // when fewer than defaultShiftSize units precede the offset or
                 // the previous match.
3075             if (offset == 0 || matchedindex == 0 ||
3076                 (!search->isOverlap &&
3077                     (offset < strsrch->pattern.defaultShiftSize ||
3078                     (matchedindex != USEARCH_DONE &&
3079                     matchedindex < strsrch->pattern.defaultShiftSize)))) {
3080                 // not enough characters to match
3081                 setMatchNotFound(strsrch);
3082                 return USEARCH_DONE;
3083             }
3084         }
3085
             // NOTE(review): no call between the entry check and this point
             // receives status, so this re-check looks redundant - confirm.
3086         if (U_SUCCESS(*status)) {
3087             if (strsrch->pattern.CELength == 0) {
                     // No pattern CEs to compare: report zero-length matches,
                     // stepping back one code point per call from the previous
                     // match (or from the current offset if there is none).
3088                 search->matchedIndex =
3089                       (matchedindex == USEARCH_DONE ? offset : matchedindex);
3090                 if (search->matchedIndex == 0) {
3091                     setMatchNotFound(strsrch);
3092                     // status checked below
3093                 }
3094                 else { // move by codepoints
3095                     UTF_BACK_1(search->text, 0, search->matchedIndex);
3096                     setColEIterOffset(strsrch->textIter, search->matchedIndex);
3097                     // status checked below
3098                     search->matchedLength = 0;
3099                 }
3100             }
3101             else {
3102                 if (strsrch->search->isCanonicalMatch) {
3103                     // can't use exact here since extra accents are allowed.
3104                     usearch_handlePreviousCanonical(strsrch, status);
3105                     // status checked below
3106                 }
3107                 else {
3108                     usearch_handlePreviousExact(strsrch, status);
3109                     // status checked below
3110                 }
3111             }
3112
3113             if (U_FAILURE(*status)) {
3114                 return USEARCH_DONE;
3115             }
3116
3117             return search->matchedIndex;
3118         }
3119     }
3120     return USEARCH_DONE;
3121 }
3122
3123
3124
3125 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
3126 {
         // Re-reads strength, alternate handling and variable top from the
         // collator, calls initialize() when any of them differs from the
         // value cached in strsrch, re-initialises the text iterator over
         // search->text and then sets the search state fields listed at the
         // end. A NULL strsrch is ignored.
         // The local status is never reported: the function returns void.
3127     /*
3128     reset is setting the attributes that are already in
3129     string search, hence all attributes in the collator should
3130     be retrieved without any problems
3131     */
3132     if (strsrch) {
3133         UErrorCode status            = U_ZERO_ERROR;
3134         UBool      sameCollAttribute = TRUE;
3135         uint32_t   ceMask;
3136         UBool      shift;
3137         uint32_t   varTop;
3138
             // strength is refreshed unconditionally; only a change in the
             // mask derived from it counts as an attribute change
3139         strsrch->strength    = ucol_getStrength(strsrch->collator);
3140         ceMask = getMask(strsrch->strength);
3141         if (strsrch->ceMask != ceMask) {
3142             strsrch->ceMask = ceMask;
3143             sameCollAttribute = FALSE;
3144         }
3145         // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3146         shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
3147                                   &status) == UCOL_SHIFTED;
3148         if (strsrch->toShift != shift) {
3149             strsrch->toShift  = shift;
3150             sameCollAttribute = FALSE;
3151         }
3152
3153         // if status is a failure, ucol_getVariableTop returns 0
3154         varTop = ucol_getVariableTop(strsrch->collator, &status);
3155         if (strsrch->variableTop != varTop) {
3156             strsrch->variableTop = varTop;
3157             sameCollAttribute    = FALSE;
3158         }
3159         if (!sameCollAttribute) {
                 // presumably rebuilds the pattern data that depends on the
                 // attributes above - initialize() is defined outside this
                 // view, TODO confirm
3160             initialize(strsrch, &status);
3161         }
3162         uprv_init_collIterate(strsrch->collator, strsrch->search->text,
3163                               strsrch->search->textLength,
3164                               &(strsrch->textIter->iteratordata_));
             // note that overlap and canonical matching are switched off here,
             // and that reset = TRUE is what usearch_previous tests to start
             // from the end of the text
3165         strsrch->search->matchedLength      = 0;
3166         strsrch->search->matchedIndex       = USEARCH_DONE;
3167         strsrch->search->isOverlap          = FALSE;
3168         strsrch->search->isCanonicalMatch   = FALSE;
3169         strsrch->search->isForwardSearching = TRUE;
3170         strsrch->search->reset              = TRUE;
3171     }
3172 }
3173
3174 // internal use methods declared in usrchimp.h -----------------------------
3175
3176 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
3177 {
         // Forward exact-match step. Starting from the text iterator's current
         // offset, candidate offsets (textoffset) are produced by shiftForward;
         // at each candidate the pattern CEs are compared, last CE first,
         // against text CEs read backwards with ucol_previous.
         // Returns TRUE with the text iterator set to
         // strsrch->search->matchedIndex when checkNextExactMatch accepts a
         // candidate; otherwise calls setMatchNotFound and returns FALSE,
         // including when *status is a failure on entry or becomes one.
3178     if (U_FAILURE(*status)) {
3179         setMatchNotFound(strsrch);
3180         return FALSE;
3181     }
3182
3183     UCollationElements *coleiter        = strsrch->textIter;
3184     int32_t             textlength      = strsrch->search->textLength;
3185     int32_t            *patternce       = strsrch->pattern.CE;
3186     int32_t             patterncelength = strsrch->pattern.CELength;
3187     int32_t             textoffset      = ucol_getOffset(coleiter);
3188
3189     // status used in setting coleiter offset, since offset is checked in
3190     // shiftForward before setting the coleiter offset, status never
3191     // a failure
3192     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
3193                               patterncelength);
3194     while (textoffset <= textlength)
3195     {
             // NOTE(review): declared uint32_t here but int32_t in
             // usearch_handleNextCanonical; with patterncelength == 0 this
             // would wrap instead of going negative - confirm callers never
             // reach here with an empty pattern (usearch_next does not).
3196         uint32_t    patternceindex = patterncelength - 1;
3197         int32_t     targetce;
3198         UBool       found          = FALSE;
3199         int32_t    lastce          = UCOL_NULLORDER;
3200
3201         setColEIterOffset(coleiter, textoffset);
3202
3203         while (TRUE) {
3204             // finding the last pattern ce match, imagine composite characters
3205             // for example: search for pattern A in text \u00C0
3206             // we'll have to skip \u0300 the grave first before we get to A
3207             targetce = ucol_previous(coleiter, status);
3208             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3209                 found = FALSE;
3210                 break;
3211             }
3212             targetce = getCE(strsrch, targetce);
3213             if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
3214                 // this is for the text \u0315\u0300 that requires
3215                 // normalization and pattern \u0300, where \u0315 is ignorable
3216                 continue;
3217             }
                 // lastce keeps the first non-ignorable CE seen from this
                 // candidate; it is what shiftForward receives on a mismatch
3218             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
3219                 lastce = targetce;
3220             }
3221             if (targetce == patternce[patternceindex]) {
3222                 // the first ce can be a contraction
3223                 found = TRUE;
3224                 break;
3225             }
3226             if (!hasExpansion(coleiter)) {
3227                 found = FALSE;
3228                 break;
3229             }
3230         }
3231
             // NOTE(review): the value stored here is not read again in this
             // function (the loop below overwrites it and the mismatch path
             // passes lastce) - looks like a leftover.
3232         targetce = lastce;
3233
             // compare the remaining pattern CEs right-to-left, skipping
             // ignorable text CEs
3234         while (found && patternceindex > 0) {
3235             targetce    = ucol_previous(coleiter, status);
3236             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3237                 found = FALSE;
3238                 break;
3239             }
3240             targetce    = getCE(strsrch, targetce);
3241             if (targetce == UCOL_IGNORABLE) {
3242                 continue;
3243             }
3244
3245             patternceindex --;
3246             found = found && targetce == patternce[patternceindex];
3247         }
3248
3249         if (!found) {
3250             if (U_FAILURE(*status)) {
3251                 break;
3252             }
3253             textoffset = shiftForward(strsrch, textoffset, lastce,
3254                                       patternceindex);
3255             // status checked at loop.
                 // NOTE(review): patternceindex is re-declared at the top of
                 // the loop body, so this assignment has no effect.
3256             patternceindex = patterncelength;
3257             continue;
3258         }
3259
             // textoffset is passed by address; presumably the helper moves it
             // on rejection so the loop makes progress - TODO confirm
3260         if (checkNextExactMatch(strsrch, &textoffset, status)) {
3261             // status checked in ucol_setOffset
                 // NOTE(review): the call below is setColEIterOffset, which
                 // takes no status and does no bounds check.
3262             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
3263             return TRUE;
3264         }
3265     }
3266     setMatchNotFound(strsrch);
3267     return FALSE;
3268 }
3269
3270 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
3271 {
         // Forward canonical-match step. Same candidate loop as
         // usearch_handleNextExact, with two visible differences: the first
         // scan has no "ignorable CE inside the normalization buffer" skip,
         // and when the CE comparison fails and the pattern has prefix or
         // suffix accents, doNextCanonicalMatch gets a chance to accept the
         // candidate.
         // Returns TRUE with the text iterator set to
         // strsrch->search->matchedIndex when checkNextCanonicalMatch accepts
         // a candidate; otherwise calls setMatchNotFound and returns FALSE,
         // including when *status is a failure on entry or becomes one.
3272     if (U_FAILURE(*status)) {
3273         setMatchNotFound(strsrch);
3274         return FALSE;
3275     }
3276
3277     UCollationElements *coleiter        = strsrch->textIter;
3278     int32_t             textlength      = strsrch->search->textLength;
3279     int32_t            *patternce       = strsrch->pattern.CE;
3280     int32_t             patterncelength = strsrch->pattern.CELength;
3281     int32_t             textoffset      = ucol_getOffset(coleiter);
3282     UBool               hasPatternAccents =
3283        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
3284
3285     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
3286                               patterncelength);
         // start with empty rearranged-accent buffers
3287     strsrch->canonicalPrefixAccents[0] = 0;
3288     strsrch->canonicalSuffixAccents[0] = 0;
3289
3290     while (textoffset <= textlength)
3291     {
3292         int32_t     patternceindex = patterncelength - 1;
3293         int32_t     targetce;
3294         UBool       found          = FALSE;
3295         int32_t     lastce         = UCOL_NULLORDER;
3296
3297         setColEIterOffset(coleiter, textoffset);
3298
3299         for (;;) {
3300             // finding the last pattern ce match, imagine composite characters
3301             // for example: search for pattern A in text \u00C0
3302             // we'll have to skip \u0300 the grave first before we get to A
3303             targetce = ucol_previous(coleiter, status);
3304             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3305                 found = FALSE;
3306                 break;
3307             }
3308             targetce = getCE(strsrch, targetce);
                 // lastce keeps the first non-ignorable CE seen from this
                 // candidate; it is what shiftForward receives on a mismatch
3309             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
3310                 lastce = targetce;
3311             }
3312             if (targetce == patternce[patternceindex]) {
3313                 // the first ce can be a contraction
3314                 found = TRUE;
3315                 break;
3316             }
3317             if (!hasExpansion(coleiter)) {
3318                 found = FALSE;
3319                 break;
3320             }
3321         }
3322
             // compare the remaining pattern CEs right-to-left, skipping
             // ignorable text CEs
3323         while (found && patternceindex > 0) {
3324             targetce    = ucol_previous(coleiter, status);
3325             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3326                 found = FALSE;
3327                 break;
3328             }
3329             targetce    = getCE(strsrch, targetce);
3330             if (targetce == UCOL_IGNORABLE) {
3331                 continue;
3332             }
3333
3334             patternceindex --;
3335             found = found && targetce == patternce[patternceindex];
3336         }
3337
3338         // initializing the rearranged accent array
3339         if (hasPatternAccents && !found) {
3340             strsrch->canonicalPrefixAccents[0] = 0;
3341             strsrch->canonicalSuffixAccents[0] = 0;
3342             if (U_FAILURE(*status)) {
3343                 break;
3344             }
3345             found = doNextCanonicalMatch(strsrch, textoffset, status);
3346         }
3347
3348         if (!found) {
3349             if (U_FAILURE(*status)) {
3350                 break;
3351             }
3352             textoffset = shiftForward(strsrch, textoffset, lastce,
3353                                       patternceindex);
3354             // status checked at loop
                 // NOTE(review): patternceindex is re-declared at the top of
                 // the loop body, so this assignment has no effect.
3355             patternceindex = patterncelength;
3356             continue;
3357         }
3358
             // textoffset is passed by address; presumably the helper moves it
             // on rejection so the loop makes progress - TODO confirm
3359         if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
3360             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
3361             return TRUE;
3362         }
3363     }
3364     setMatchNotFound(strsrch);
3365     return FALSE;
3366 }
3367
3368 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
3369 {
         // Backward exact-match step. Candidate offsets (textoffset) are
         // produced by reverseShift, starting from the previous match index
         // if there is one and from the text iterator's offset otherwise; at
         // each candidate the pattern CEs are compared, first CE first,
         // against text CEs read forwards with ucol_next.
         // Returns TRUE with the text iterator set to textoffset when
         // checkPreviousExactMatch accepts a candidate; otherwise calls
         // setMatchNotFound and returns FALSE, including when *status is a
         // failure on entry or becomes one.
3370     if (U_FAILURE(*status)) {
3371         setMatchNotFound(strsrch);
3372         return FALSE;
3373     }
3374
3375     UCollationElements *coleiter        = strsrch->textIter;
3376     int32_t            *patternce       = strsrch->pattern.CE;
3377     int32_t             patterncelength = strsrch->pattern.CELength;
3378     int32_t             textoffset      = ucol_getOffset(coleiter);
3379
3380     // shifting it check for setting offset
3381     // if setOffset is called previously or there was no previous match, we
3382     // leave the offset as it is.
3383     if (strsrch->search->matchedIndex != USEARCH_DONE) {
3384         textoffset = strsrch->search->matchedIndex;
3385     }
3386
3387     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
3388                               patterncelength);
3389
3390     while (textoffset >= 0)
3391     {
3392         int32_t     patternceindex = 1;
3393         int32_t     targetce;
3394         UBool       found          = FALSE;
3395         int32_t     firstce        = UCOL_NULLORDER;
3396
3397         // if status is a failure, ucol_setOffset does nothing
             // NOTE(review): the call below is setColEIterOffset, which takes
             // no status and does no bounds check.
3398         setColEIterOffset(coleiter, textoffset);
3399
3400         for (;;) {
3401             // finding the first pattern ce match, imagine composite
3402             // characters. for example: search for pattern \u0300 in text
3403             // \u00C0, we'll have to skip A first before we get to
3404             // \u0300 the grave accent
3405             targetce = ucol_next(coleiter, status);
3406             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3407                 found = FALSE;
3408                 break;
3409             }
3410             targetce = getCE(strsrch, targetce);
                 // firstce keeps the first non-ignorable CE seen from this
                 // candidate
3411             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
3412                 firstce = targetce;
3413             }
                 // ignorable text CEs are skipped before comparing
3414             if (targetce == UCOL_IGNORABLE) {
3415                 continue;
3416             }
3417             if (targetce == patternce[0]) {
3418                 found = TRUE;
3419                 break;
3420             }
3421             if (!hasExpansion(coleiter)) {
3422                 // checking for accents in composite character
3423                 found = FALSE;
3424                 break;
3425             }
3426         }
3427
             // if the loop below never assigns targetce (found is FALSE or the
             // pattern has a single CE), reverseShift receives firstce;
             // otherwise it receives the last CE read below
3428         targetce = firstce;
3429
             // compare the remaining pattern CEs left-to-right, skipping
             // ignorable text CEs
3430         while (found && (patternceindex < patterncelength)) {
3431             targetce    = ucol_next(coleiter, status);
3432             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3433                 found = FALSE;
3434                 break;
3435             }
3436             targetce    = getCE(strsrch, targetce);
3437             if (targetce == UCOL_IGNORABLE) {
3438                 continue;
3439             }
3440
3441             found = found && targetce == patternce[patternceindex];
3442             patternceindex ++;
3443         }
3444
3445         if (!found) {
3446             if (U_FAILURE(*status)) {
3447                 break;
3448             }
3449             textoffset = reverseShift(strsrch, textoffset, targetce,
3450                                       patternceindex);
                 // NOTE(review): patternceindex is re-declared at the top of
                 // the loop body, so this assignment has no effect.
3451             patternceindex = 0;
3452             continue;
3453         }
3454
             // textoffset is passed by address; presumably the helper moves it
             // on rejection so the loop makes progress - TODO confirm
3455         if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
3456             setColEIterOffset(coleiter, textoffset);
3457             return TRUE;
3458         }
3459     }
3460     setMatchNotFound(strsrch);
3461     return FALSE;
3462 }
3463
3464 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
3465                                       UErrorCode    *status)
3466 {
         // Backward canonical-match step. Same candidate loop as
         // usearch_handlePreviousExact, with two visible differences: the
         // first scan does not skip ignorable text CEs before comparing, and
         // when the CE comparison fails and the pattern has prefix or suffix
         // accents, doPreviousCanonicalMatch gets a chance to accept the
         // candidate.
         // Returns TRUE with the text iterator set to textoffset when
         // checkPreviousCanonicalMatch accepts a candidate; otherwise calls
         // setMatchNotFound and returns FALSE, including when *status is a
         // failure on entry or becomes one.
3467     if (U_FAILURE(*status)) {
3468         setMatchNotFound(strsrch);
3469         return FALSE;
3470     }
3471
3472     UCollationElements *coleiter        = strsrch->textIter;
3473     int32_t            *patternce       = strsrch->pattern.CE;
3474     int32_t             patterncelength = strsrch->pattern.CELength;
3475     int32_t             textoffset      = ucol_getOffset(coleiter);
3476     UBool               hasPatternAccents =
3477        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
3478
3479     // shifting it check for setting offset
3480     // if setOffset is called previously or there was no previous match, we
3481     // leave the offset as it is.
3482     if (strsrch->search->matchedIndex != USEARCH_DONE) {
3483         textoffset = strsrch->search->matchedIndex;
3484     }
3485
3486     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
3487                               patterncelength);
         // start with empty rearranged-accent buffers
3488     strsrch->canonicalPrefixAccents[0] = 0;
3489     strsrch->canonicalSuffixAccents[0] = 0;
3490
3491     while (textoffset >= 0)
3492     {
3493         int32_t     patternceindex = 1;
3494         int32_t     targetce;
3495         UBool       found          = FALSE;
3496         int32_t     firstce        = UCOL_NULLORDER;
3497
3498         setColEIterOffset(coleiter, textoffset);
3499         while (TRUE) {
3500             // finding the first pattern ce match, imagine composite
3501             // characters. for example: search for pattern \u0300 in text
3502             // \u00C0, we'll have to skip A first before we get to
3503             // \u0300 the grave accent
3504             targetce = ucol_next(coleiter, status);
3505             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3506                 found = FALSE;
3507                 break;
3508             }
3509             targetce = getCE(strsrch, targetce);
                 // firstce keeps the first non-ignorable CE seen from this
                 // candidate
3510             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
3511                 firstce = targetce;
3512             }
3513
3514             if (targetce == patternce[0]) {
3515                 // the first ce can be a contraction
3516                 found = TRUE;
3517                 break;
3518             }
3519             if (!hasExpansion(coleiter)) {
3520                 // checking for accents in composite character
3521                 found = FALSE;
3522                 break;
3523             }
3524         }
3525
             // if the loop below never assigns targetce (found is FALSE or the
             // pattern has a single CE), reverseShift receives firstce;
             // otherwise it receives the last CE read below
3526         targetce = firstce;
3527
             // compare the remaining pattern CEs left-to-right, skipping
             // ignorable text CEs
3528         while (found && patternceindex < patterncelength) {
3529             targetce    = ucol_next(coleiter, status);
3530             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3531                 found = FALSE;
3532                 break;
3533             }
3534             targetce = getCE(strsrch, targetce);
3535             if (targetce == UCOL_IGNORABLE) {
3536                 continue;
3537             }
3538
3539             found = found && targetce == patternce[patternceindex];
3540             patternceindex ++;
3541         }
3542
3543         // initializing the rearranged accent array
3544         if (hasPatternAccents && !found) {
3545             strsrch->canonicalPrefixAccents[0] = 0;
3546             strsrch->canonicalSuffixAccents[0] = 0;
3547             if (U_FAILURE(*status)) {
3548                 break;
3549             }
3550             found = doPreviousCanonicalMatch(strsrch, textoffset, status);
3551         }
3552
3553         if (!found) {
3554             if (U_FAILURE(*status)) {
3555                 break;
3556             }
3557             textoffset = reverseShift(strsrch, textoffset, targetce,
3558                                       patternceindex);
                 // NOTE(review): patternceindex is re-declared at the top of
                 // the loop body, so this assignment has no effect.
3559             patternceindex = 0;
3560             continue;
3561         }
3562
             // textoffset is passed by address; presumably the helper moves it
             // on rejection so the loop makes progress - TODO confirm
3563         if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
3564             setColEIterOffset(coleiter, textoffset);
3565             return TRUE;
3566         }
3567     }
3568     setMatchNotFound(strsrch);
3569     return FALSE;
3570 }
3571
3572 #endif /* #if !UCONFIG_NO_COLLATION */