icuSources/i18n/usearch.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2001-2004 IBM and others. All rights reserved.
   4 **********************************************************************
   5 *   Date        Name        Description
   6 *  07/02/2001   synwee      Creation.
   7 **********************************************************************
   8 */
   9
  10 #include "unicode/utypes.h"
  11
  12 #if !UCONFIG_NO_COLLATION
  13
  14 #include "unicode/usearch.h"
  15 #include "unicode/ustring.h"
  16 #include "unicode/uchar.h"
  17 #include "unormimp.h"
  18 #include "ucol_imp.h"
  19 #include "usrchimp.h"
  20 #include "cmemory.h"
  21 #include "ucln_in.h"
  22
  23 // internal definition ---------------------------------------------------
  24
// Selects the low byte of a 16-bit FCD value; used below to test whether a
// character ends with an accent (see hasSuffixAccents).
#define LAST_BYTE_MASK_          0xFF
// Shift that moves the high byte of a 16-bit FCD value into the low byte;
// used below to test whether a character starts with an accent
// (see hasPrefixAccents).
#define SECOND_LAST_BYTE_SHIFT_  8
// First code point above the Basic Multilingual Plane.
#define SUPPLEMENTARY_MIN_VALUE_ 0x10000

// Cached FCD data: fetched on demand by initializeFCD() and reset to NULL by
// usearch_cleanup().
static const uint16_t *FCD_ = NULL;
  30
  31 // internal methods -------------------------------------------------
  32
  33 /**
  34 * Fast collation element iterator setOffset.
  35 * This function does not check for bounds.
  36 * @param coleiter collation element iterator
  37 * @param offset to set
  38 */
  39 static
  40 inline void setColEIterOffset(UCollationElements *elems,
  41                       int32_t             offset)
  42 {
  43     collIterate *ci = &(elems->iteratordata_);
  44     ci->pos         = ci->string + offset;
  45     ci->CEpos       = ci->toReturn = ci->CEs;
  46     if (ci->flags & UCOL_ITER_INNORMBUF) {
  47         ci->flags = ci->origFlags;
  48     }
  49     ci->fcdPosition = NULL;
  50 }
  51
  52 /**
  53 * Getting the mask for collation strength
  54 * @param strength collation strength
  55 * @return collation element mask
  56 */
  57 static
  58 inline uint32_t getMask(UCollationStrength strength)
  59 {
  60     switch (strength)
  61     {
  62     case UCOL_PRIMARY:
  63         return UCOL_PRIMARYORDERMASK;
  64     case UCOL_SECONDARY:
  65         return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
  66     default:
  67         return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
  68                UCOL_PRIMARYORDERMASK;
  69     }
  70 }
  71
  72 /**
  73 * This is to squeeze the 21bit ces into a 256 table
  74 * @param ce collation element
  75 * @return collapsed version of the collation element
  76 */
  77 static
  78 inline int hash(uint32_t ce)
  79 {
  80     // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
  81     // well with the new collation where most of the latin 1 characters
  82     // are of the value xx000xxx. their hashes will most of the time be 0
  83     // to be discussed on the hash algo.
  84     return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_;
  85 }
  86
U_CDECL_BEGIN
/**
* Library cleanup callback, registered by initializeFCD() through
* ucln_i18n_registerCleanup().
* Forgets the cached FCD data pointer so that the next initializeFCD() call
* fetches it again.
* @return always TRUE
*/
static UBool U_CALLCONV
usearch_cleanup(void) {
    FCD_ = NULL;
    return TRUE;
}
U_CDECL_END
  94
  95 /**
  96 * Initializing the fcd tables.
  97 * Internal method, status assumed to be a success.
  98 * @param status output error if any, caller to check status before calling
  99 *               method, status assumed to be success when passed in.
 100 */
 101 static
 102 inline void initializeFCD(UErrorCode *status)
 103 {
 104     if (FCD_ == NULL) {
 105         FCD_ = unorm_getFCDTrie(status);
 106         ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
 107     }
 108 }
 109
 110 /**
 111 * Gets the fcd value for a character at the argument index.
 112 * This method takes into accounts of the supplementary characters.
 113 * @param str UTF16 string where character for fcd retrieval resides
 114 * @param offset position of the character whose fcd is to be retrieved, to be
 115 *               overwritten with the next character position, taking
 116 *               surrogate characters into consideration.
 117 * @param strlength length of the argument string
 118 * @return fcd value
 119 */
 120 static
 121 inline uint16_t getFCD(const UChar   *str, int32_t *offset,
 122                              int32_t  strlength)
 123 {
 124     int32_t temp = *offset;
 125     uint16_t    result;
 126     UChar       ch   = str[temp];
 127     result = unorm_getFCD16(FCD_, ch);
 128     temp ++;
 129
 130     if (result && temp != strlength && UTF_IS_FIRST_SURROGATE(ch)) {
 131         ch = str[temp];
 132         if (UTF_IS_SECOND_SURROGATE(ch)) {
 133             result = unorm_getFCD16FromSurrogatePair(FCD_, result, ch);
 134             temp ++;
 135         } else {
 136             result = 0;
 137         }
 138     }
 139     *offset = temp;
 140     return result;
 141 }
 142
 143 /**
 144 * Getting the modified collation elements taking into account the collation
 145 * attributes
 146 * @param strsrch string search data
 147 * @param sourcece
 148 * @return the modified collation element
 149 */
 150 static
 151 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
 152 {
 153     // note for tertiary we can't use the collator->tertiaryMask, that
 154     // is a preprocessed mask that takes into account case options. since
 155     // we are only concerned with exact matches, we don't need that.
 156     sourcece &= strsrch->ceMask;
 157
 158     if (strsrch->toShift) {
 159         // alternate handling here, since only the 16 most significant digits
 160         // is only used, we can safely do a compare without masking
 161         // if the ce is a variable, we mask and get only the primary values
 162         // no shifting to quartenary is required since all primary values
 163         // less than variabletop will need to be masked off anyway.
 164         if (strsrch->variableTop > sourcece) {
 165             if (strsrch->strength == UCOL_QUATERNARY) {
 166                 sourcece &= UCOL_PRIMARYORDERMASK;
 167             }
 168             else {
 169                 sourcece = UCOL_IGNORABLE;
 170             }
 171         }
 172     }
 173
 174     return sourcece;
 175 }
 176
 177 /**
 178 * Allocate a memory and returns NULL if it failed.
 179 * Internal method, status assumed to be a success.
 180 * @param size to allocate
 181 * @param status output error if any, caller to check status before calling
 182 *               method, status assumed to be success when passed in.
 183 * @return newly allocated array, NULL otherwise
 184 */
 185 static
 186 inline void * allocateMemory(uint32_t size, UErrorCode *status)
 187 {
 188     uint32_t *result = (uint32_t *)uprv_malloc(size);
 189     if (result == NULL) {
 190         *status = U_MEMORY_ALLOCATION_ERROR;
 191     }
 192     return result;
 193 }
 194
 195 /**
 196 * Adds a uint32_t value to a destination array.
 197 * Creates a new array if we run out of space. The caller will have to
 198 * manually deallocate the newly allocated array.
 199 * Internal method, status assumed to be success, caller has to check status
 200 * before calling this method. destination not to be NULL and has at least
 201 * size destinationlength.
 202 * @param destination target array
 203 * @param offset destination offset to add value
 204 * @param destinationlength target array size, return value for the new size
 205 * @param value to be added
 206 * @param increments incremental size expected
 207 * @param status output error if any, caller to check status before calling
 208 *               method, status assumed to be success when passed in.
 209 * @return new destination array, destination if there was no new allocation
 210 */
 211 static
 212 inline int32_t * addTouint32_tArray(int32_t    *destination,
 213                                     uint32_t    offset,
 214                                     uint32_t   *destinationlength,
 215                                     uint32_t    value,
 216                                     uint32_t    increments,
 217                                     UErrorCode *status)
 218 {
 219     uint32_t newlength = *destinationlength;
 220     if (offset + 1 == newlength) {
 221         newlength += increments;
 222         int32_t *temp = (int32_t *)allocateMemory(
 223                                          sizeof(int32_t) * newlength, status);
 224         if (U_FAILURE(*status)) {
 225             return NULL;
 226         }
 227         uprv_memcpy(temp, destination, sizeof(int32_t) * offset);
 228         *destinationlength = newlength;
 229         destination        = temp;
 230     }
 231     destination[offset] = value;
 232     return destination;
 233 }
 234
 235 /**
 236 * Initializing the ce table for a pattern.
 237 * Stores non-ignorable collation keys.
 238 * Table size will be estimated by the size of the pattern text. Table
 239 * expansion will be perform as we go along. Adding 1 to ensure that the table
 240 * size definitely increases.
 241 * Internal method, status assumed to be a success.
 242 * @param strsrch string search data
 243 * @param status output error if any, caller to check status before calling
 244 *               method, status assumed to be success when passed in.
 245 * @return total number of expansions
 246 */
 247 static
 248 inline uint16_t initializePatternCETable(UStringSearch *strsrch,
 249                                          UErrorCode    *status)
 250 {
 251     UPattern *pattern            = &(strsrch->pattern);
 252     uint32_t  cetablesize        = INITIAL_ARRAY_SIZE_;
 253     int32_t  *cetable            = pattern->CEBuffer;
 254     uint32_t  patternlength      = pattern->textLength;
 255     UCollationElements *coleiter = strsrch->utilIter;
 256
 257     if (coleiter == NULL) {
 258         coleiter = ucol_openElements(strsrch->collator, pattern->text,
 259                                      patternlength, status);
 260         // status will be checked in ucol_next(..) later and if it is an
 261         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
 262         // returned.
 263         strsrch->utilIter = coleiter;
 264     }
 265     else {
 266         uprv_init_collIterate(strsrch->collator, pattern->text,
 267                          pattern->textLength,
 268                          &coleiter->iteratordata_);
 269     }
 270
 271     if (pattern->CE != cetable && pattern->CE) {
 272         uprv_free(pattern->CE);
 273     }
 274
 275     uint16_t  offset      = 0;
 276     uint16_t  result      = 0;
 277     int32_t   ce;
 278
 279     while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
 280            U_SUCCESS(*status)) {
 281         uint32_t newce = getCE(strsrch, ce);
 282         if (newce) {
 283             int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
 284                                   newce,
 285                                   patternlength - ucol_getOffset(coleiter) + 1,
 286                                   status);
 287             if (U_FAILURE(*status)) {
 288                 return 0;
 289             }
 290             offset ++;
 291             if (cetable != temp && cetable != pattern->CEBuffer) {
 292                 uprv_free(cetable);
 293             }
 294             cetable = temp;
 295         }
 296         result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
 297     }
 298
 299     cetable[offset]   = 0;
 300     pattern->CE       = cetable;
 301     pattern->CELength = offset;
 302
 303     return result;
 304 }
 305
 306 /**
 307 * Initializes the pattern struct.
 308 * Internal method, status assumed to be success.
 309 * @param strsrch UStringSearch data storage
 310 * @param status output error if any, caller to check status before calling
 311 *               method, status assumed to be success when passed in.
 312 * @return expansionsize the total expansion size of the pattern
 313 */
 314 static
 315 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
 316 {
 317           UPattern   *pattern     = &(strsrch->pattern);
 318     const UChar      *patterntext = pattern->text;
 319           int32_t     length      = pattern->textLength;
 320           int32_t index       = 0;
 321
 322     pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
 323                                                      SECOND_LAST_BYTE_SHIFT_;
 324     index = length;
 325     UTF_BACK_1(patterntext, 0, index);
 326     pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
 327                                                              LAST_BYTE_MASK_;
 328     // since intializePattern is an internal method status is a success.
 329     return initializePatternCETable(strsrch, status);
 330 }
 331
 332 /**
 333 * Initializing shift tables, with the default values.
 334 * If a corresponding default value is 0, the shift table is not set.
 335 * @param shift table for forwards shift
 336 * @param backshift table for backwards shift
 337 * @param cetable table containing pattern ce
 338 * @param cesize size of the pattern ces
 339 * @param expansionsize total size of the expansions
 340 * @param defaultforward the default forward value
 341 * @param defaultbackward the default backward value
 342 */
 343 static
 344 inline void setShiftTable(int16_t   shift[], int16_t backshift[],
 345                           int32_t  *cetable, int32_t cesize,
 346                           int16_t   expansionsize,
 347                           int16_t   defaultforward,
 348                           int16_t   defaultbackward)
 349 {
 350     // estimate the value to shift. to do that we estimate the smallest
 351     // number of characters to give the relevant ces, ie approximately
 352     // the number of ces minus their expansion, since expansions can come
 353     // from a character.
 354     int32_t count;
 355     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
 356         shift[count] = defaultforward;
 357     }
 358     cesize --; // down to the last index
 359     for (count = 0; count < cesize; count ++) {
 360         // number of ces from right of array to the count
 361         int temp = defaultforward - count - 1;
 362         shift[hash(cetable[count])] = temp > 1 ? temp : 1;
 363     }
 364     shift[hash(cetable[cesize])] = 1;
 365     // for ignorables we just shift by one. see test examples.
 366     shift[hash(0)] = 1;
 367
 368     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
 369         backshift[count] = defaultbackward;
 370     }
 371     for (count = cesize; count > 0; count --) {
 372         // the original value count does not seem to work
 373         backshift[hash(cetable[count])] = count > expansionsize ?
 374                                           (int16_t)(count - expansionsize) : 1;
 375     }
 376     backshift[hash(cetable[0])] = 1;
 377     backshift[hash(0)] = 1;
 378 }
 379
 380 /**
 381 * Building of the pattern collation element list and the boyer moore strsrch
 382 * table.
 383 * The canonical match will only be performed after the default match fails.
 384 * For both cases we need to remember the size of the composed and decomposed
 385 * versions of the string. Since the Boyer-Moore shift calculations shifts by
 386 * a number of characters in the text and tries to match the pattern from that
 387 * offset, the shift value can not be too large in case we miss some
 388 * characters. To choose a right shift size, we estimate the NFC form of the
 389 * and use its size as a shift guide. The NFC form should be the small
 390 * possible representation of the pattern. Anyways, we'll err on the smaller
 391 * shift size. Hence the calculation for minlength.
 392 * Canonical match will be performed slightly differently. We'll split the
 393 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
 394 * the first and last base character (MS), the ending accents (EA). Matches
 395 * will be done on MS first, and only when we match MS then some processing
 396 * will be required for the prefix and end accents in order to determine if
 397 * they match PA and EA. Hence the default shift values
 398 * for the canonical match will take the size of either end's accent into
 399 * consideration. Forwards search will take the end accents into consideration
 400 * for the default shift values and the backwards search will take the prefix
 401 * accents into consideration.
 402 * If pattern has no non-ignorable ce, we return a illegal argument error.
 403 * Internal method, status assumed to be success.
 404 * @param strsrch UStringSearch data storage
 405 * @param status  for output errors if it occurs, status is assumed to be a
 406 *                success when it is passed in.
 407 */
 408 static
 409 inline void initialize(UStringSearch *strsrch, UErrorCode *status)
 410 {
 411     int16_t expandlength  = initializePattern(strsrch, status);
 412     if (U_SUCCESS(*status) && strsrch->pattern.CELength > 0) {
 413         UPattern *pattern = &strsrch->pattern;
 414         int32_t   cesize  = pattern->CELength;
 415
 416         int16_t minlength = cesize > expandlength
 417                             ? (int16_t)cesize - expandlength : 1;
 418         pattern->defaultShiftSize    = minlength;
 419         setShiftTable(pattern->shift, pattern->backShift, pattern->CE,
 420                       cesize, expandlength, minlength, minlength);
 421         return;
 422     }
 423     strsrch->pattern.defaultShiftSize = 0;
 424 }
 425
 426 /**
 427 * Determine whether the target text in UStringSearch bounded by the offset
 428 * start and end is one or more whole units of text as
 429 * determined by the breakiterator in UStringSearch.
 430 * @param strsrch string search data
 431 * @param start target text start offset
 432 * @param end target text end offset
 433 */
 434 static
 435 inline UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
 436                                int32_t    end)
 437 {
 438 #if !UCONFIG_NO_BREAK_ITERATION
 439     UBreakIterator *breakiterator = strsrch->search->breakIter;
 440     if (breakiterator) {
 441         int32_t startindex = ubrk_first(breakiterator);
 442         int32_t endindex   = ubrk_last(breakiterator);
 443
 444         // out-of-range indexes are never boundary positions
 445         if (start < startindex || start > endindex ||
 446             end < startindex || end > endindex) {
 447             return FALSE;
 448         }
 449         // otherwise, we can use following() on the position before the
 450         // specified one and return true of the position we get back is the
 451         // one the user specified
 452         UBool result = (start == startindex ||
 453                 ubrk_following(breakiterator, start - 1) == start) &&
 454                (end == endindex ||
 455                 ubrk_following(breakiterator, end - 1) == end);
 456         if (result) {
 457             // iterates the individual ces
 458                   UCollationElements *coleiter  = strsrch->utilIter;
 459             const UChar              *text      = strsrch->search->text +
 460                                                                       start;
 461                   UErrorCode          status    = U_ZERO_ERROR;
 462             ucol_setText(coleiter, text, end - start, &status);
 463             for (int32_t count = 0; count < strsrch->pattern.CELength;
 464                  count ++) {
 465                 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
 466                 if (ce == UCOL_IGNORABLE) {
 467                     count --;
 468                     continue;
 469                 }
 470                 if (U_FAILURE(status) || ce != strsrch->pattern.CE[count]) {
 471                     return FALSE;
 472                 }
 473             }
 474             int32_t nextce = ucol_next(coleiter, &status);
 475             while (ucol_getOffset(coleiter) == (end - start)
 476                    && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
 477                 nextce = ucol_next(coleiter, &status);
 478             }
 479             if (ucol_getOffset(coleiter) == (end - start)
 480                 && nextce != UCOL_NULLORDER) {
 481                 // extra collation elements at the end of the match
 482                 return FALSE;
 483             }
 484         }
 485         return result;
 486     }
 487 #endif
 488     return TRUE;
 489 }
 490
 491 /**
 492 * Getting the next base character offset if current offset is an accent,
 493 * or the current offset if the current character contains a base character.
 494 * accents the following base character will be returned
 495 * @param text string
 496 * @param textoffset current offset
 497 * @param textlength length of text string
 498 * @return the next base character or the current offset
 499 *         if the current character is contains a base character.
 500 */
 501 static
 502 inline int32_t getNextBaseOffset(const UChar       *text,
 503                                            int32_t  textoffset,
 504                                            int32_t      textlength)
 505 {
 506     if (textoffset < textlength) {
 507         int32_t temp = textoffset;
 508         if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
 509             while (temp < textlength) {
 510                 int32_t result = temp;
 511                 if ((getFCD(text, &temp, textlength) >>
 512                      SECOND_LAST_BYTE_SHIFT_) == 0) {
 513                     return result;
 514                 }
 515             }
 516             return textlength;
 517         }
 518     }
 519     return textoffset;
 520 }
 521
 522 /**
 523 * Gets the next base character offset depending on the string search pattern
 524 * data
 525 * @param strsrch string search data
 526 * @param textoffset current offset, one offset away from the last character
 527 *                   to search for.
 528 * @return start index of the next base character or the current offset
 529 *         if the current character is contains a base character.
 530 */
 531 static
 532 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
 533                                                   int32_t    textoffset)
 534 {
 535     int32_t textlength = strsrch->search->textLength;
 536     if (strsrch->pattern.hasSuffixAccents &&
 537         textoffset < textlength) {
 538               int32_t  temp       = textoffset;
 539         const UChar       *text       = strsrch->search->text;
 540         UTF_BACK_1(text, 0, temp);
 541         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
 542             return getNextBaseOffset(text, textoffset, textlength);
 543         }
 544     }
 545     return textoffset;
 546 }
 547
 548 /**
 549 * Shifting the collation element iterator position forward to prepare for
 550 * a following match. If the last character is a unsafe character, we'll only
 551 * shift by 1 to capture contractions, normalization etc.
 552 * Internal method, status assumed to be success.
 553 * @param text strsrch string search data
 554 * @param textoffset start text position to do search
 555 * @param ce the text ce which failed the match.
 556 * @param patternceindex index of the ce within the pattern ce buffer which
 557 *        failed the match
 558 * @return final offset
 559 */
 560 static
 561 inline int32_t shiftForward(UStringSearch *strsrch,
 562                                 int32_t    textoffset,
 563                                 int32_t       ce,
 564                                 int32_t        patternceindex)
 565 {
 566     UPattern *pattern = &(strsrch->pattern);
 567     if (ce != UCOL_NULLORDER) {
 568         int32_t shift = pattern->shift[hash(ce)];
 569         // this is to adjust for characters in the middle of the
 570         // substring for matching that failed.
 571         int32_t adjust = pattern->CELength - patternceindex;
 572         if (adjust > 1 && shift >= adjust) {
 573             shift -= adjust - 1;
 574         }
 575         textoffset += shift;
 576     }
 577     else {
 578         textoffset += pattern->defaultShiftSize;
 579     }
 580
 581     textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
 582     // check for unsafe characters
 583     // * if it is the start or middle of a contraction: to be done after
 584     //   a initial match is found
 585     // * thai or lao base consonant character: similar to contraction
 586     // * high surrogate character: similar to contraction
 587     // * next character is a accent: shift to the next base character
 588     return textoffset;
 589 }
 590
 591 /**
 592 * sets match not found
 593 * @param strsrch string search data
 594 */
 595 static
 596 inline void setMatchNotFound(UStringSearch *strsrch)
 597 {
 598     // this method resets the match result regardless of the error status.
 599     strsrch->search->matchedIndex = USEARCH_DONE;
 600     strsrch->search->matchedLength = 0;
 601     if (strsrch->search->isForwardSearching) {
 602         setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
 603     }
 604     else {
 605         setColEIterOffset(strsrch->textIter, 0);
 606     }
 607 }
 608
 609 /**
 610 * Gets the offset to the next safe point in text.
 611 * ie. not the middle of a contraction, swappable characters or supplementary
 612 * characters.
 613 * @param collator collation sata
 614 * @param text string to work with
 615 * @param textoffset offset in string
 616 * @param textlength length of text string
 617 * @return offset to the next safe character
 618 */
 619 static
 620 inline int32_t getNextSafeOffset(const UCollator   *collator,
 621                                      const UChar       *text,
 622                                            int32_t  textoffset,
 623                                            int32_t      textlength)
 624 {
 625     int32_t result = textoffset; // first contraction character
 626     while (result != textlength && ucol_unsafeCP(text[result], collator)) {
 627         result ++;
 628     }
 629     return result;
 630 }
 631
 632 /**
 633 * This checks for accents in the potential match started with a .
 634 * composite character.
 635 * This is really painful... we have to check that composite character do not
 636 * have any extra accents. We have to normalize the potential match and find
 637 * the immediate decomposed character before the match.
 638 * The first composite character would have been taken care of by the fcd
 639 * checks in checkForwardExactMatch.
 640 * This is the slow path after the fcd of the first character and
 641 * the last character has been checked by checkForwardExactMatch and we
 642 * determine that the potential match has extra non-ignorable preceding
 643 * ces.
 644 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
 645 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
 646 * Note here that accents checking are slow and cautioned in the API docs.
 647 * Internal method, status assumed to be a success, caller should check status
 648 * before calling this method
 649 * @param strsrch string search data
 650 * @param start index of the potential unfriendly composite character
 651 * @param end index of the potential unfriendly composite character
 652 * @param status output error status if any.
 653 * @return TRUE if there is non-ignorable accents before at the beginning
 654 *              of the match, FALSE otherwise.
 655 */
 656
 657 static
 658 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
 659                                    int32_t    end,
 660                                    UErrorCode    *status)
 661 {
 662     UBool result = FALSE;
 663     if (strsrch->pattern.hasPrefixAccents) {
 664               int32_t  length = end - start;
 665               int32_t  offset = 0;
 666         const UChar       *text   = strsrch->search->text + start;
 667
 668         UTF_FWD_1(text, offset, length);
 669         // we are only concerned with the first composite character
 670         if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
 671             int32_t safeoffset = getNextSafeOffset(strsrch->collator,
 672                                                        text, 0, length);
 673             if (safeoffset != length) {
 674                 safeoffset ++;
 675             }
 676             UChar   *norm = NULL;
 677             UChar    buffer[INITIAL_ARRAY_SIZE_];
 678             int32_t  size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
 679                                             buffer, INITIAL_ARRAY_SIZE_,
 680                                             status);
 681             if (U_FAILURE(*status)) {
 682                 return FALSE;
 683             }
 684             if (size >= INITIAL_ARRAY_SIZE_) {
 685                 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
 686                                                status);
 687                 // if allocation failed, status will be set to
 688                 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
 689                 // checks for it.
 690                 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
 691                                        size, status);
 692                 if (U_FAILURE(*status) && norm != NULL) {
 693                     uprv_free(norm);
 694                     return FALSE;
 695                 }
 696             }
 697             else {
 698                 norm = buffer;
 699             }
 700
 701             UCollationElements *coleiter  = strsrch->utilIter;
 702             ucol_setText(coleiter, norm, size, status);
 703             uint32_t            firstce   = strsrch->pattern.CE[0];
 704             UBool               ignorable = TRUE;
 705             uint32_t            ce        = UCOL_IGNORABLE;
 706             while (U_SUCCESS(*status) && ce != firstce) {
 707                 offset = ucol_getOffset(coleiter);
 708                 if (ce != firstce && ce != UCOL_IGNORABLE) {
 709                     ignorable = FALSE;
 710                 }
 711                 ce = ucol_next(coleiter, status);
 712             }
 713             UChar32 codepoint;
 714             UTF_PREV_CHAR(norm, 0, offset, codepoint);
 715             result = !ignorable && (u_getCombiningClass(codepoint) != 0);
 716
 717             if (norm != buffer) {
 718                 uprv_free(norm);
 719             }
 720         }
 721     }
 722
 723     return result;
 724 }
 725
 726 /**
 727 * Used by exact matches, checks if there are accents before the match.
 728 * This is really painful... we have to check that composite characters at
 729 * the start of the matches have to not have any extra accents.
 730 * We check the FCD of the character first, if it starts with an accent and
 731 * the first pattern ce does not match the first ce of the character, we bail.
 732 * Otherwise we try normalizing the first composite
 733 * character and find the immediate decomposed character before the match to
 734 * see if it is an non-ignorable accent.
 735 * Now normalizing the first composite character is enough because we ensure
 736 * that when the match is passed in here with extra beginning ces, the
 737 * first or last ce that match has to occur within the first character.
 738 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
 739 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
 740 * Note here that accents checking are slow and cautioned in the API docs.
 741 * @param strsrch string search data
 742 * @param start offset
 743 * @param end offset
 744 * @return TRUE if there are accents on either side of the match,
 745 *         FALSE otherwise
 746 */
 747 static
 748 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
 749                                   int32_t    end)
 750 {
 751     if (strsrch->pattern.hasPrefixAccents) {
 752         UCollationElements *coleiter  = strsrch->textIter;
 753         UErrorCode          status    = U_ZERO_ERROR;
 754         // we have been iterating forwards previously
 755         uint32_t            ignorable = TRUE;
 756         int32_t             firstce   = strsrch->pattern.CE[0];
 757
 758         setColEIterOffset(coleiter, start);
 759         int32_t ce  = getCE(strsrch, ucol_next(coleiter, &status));
 760         if (U_FAILURE(status)) {
 761             return TRUE;
 762         }
 763         while (ce != firstce) {
 764             if (ce != UCOL_IGNORABLE) {
 765                 ignorable = FALSE;
 766             }
 767             ce = getCE(strsrch, ucol_next(coleiter, &status));
 768             if (U_FAILURE(status)) {
 769                 return TRUE;
 770             }
 771         }
 772         if (!ignorable && inNormBuf(coleiter)) {
 773             // within normalization buffer, discontiguous handled here
 774             return TRUE;
 775         }
 776
 777         // within text
 778         int32_t temp = start;
 779         // original code
 780         // accent = (getFCD(strsrch->search->text, &temp,
 781         //                  strsrch->search->textLength)
 782         //            >> SECOND_LAST_BYTE_SHIFT_);
 783         // however this code does not work well with VC7 .net in release mode.
 784         // maybe the inlines for getFCD combined with shifting has bugs in
 785         // VC7. anyways this is a work around.
 786         UBool accent = getFCD(strsrch->search->text, &temp,
 787                               strsrch->search->textLength) > 0xFF;
 788         if (!accent) {
 789             return checkExtraMatchAccents(strsrch, start, end, &status);
 790         }
 791         if (!ignorable) {
 792             return TRUE;
 793         }
 794         if (start > 0) {
 795             temp = start;
 796             UTF_BACK_1(strsrch->search->text, 0, temp);
 797             if (getFCD(strsrch->search->text, &temp,
 798                        strsrch->search->textLength) & LAST_BYTE_MASK_) {
 799                 setColEIterOffset(coleiter, start);
 800                 ce = ucol_previous(coleiter, &status);
 801                 if (U_FAILURE(status) ||
 802                     (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
 803                     return TRUE;
 804                 }
 805             }
 806         }
 807     }
 808
 809     return FALSE;
 810 }
 811
 812 /**
 813 * Used by exact matches, checks if there are accents bounding the match.
 814 * Note this is the initial boundary check. If the potential match
 815 * starts or ends with composite characters, the accents in those
 816 * characters will be determined later.
 817 * Not doing backwards iteration here, since discontiguos contraction for
 818 * backwards collation element iterator, use up too many characters.
 819 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
 820 * should fail since there is a acute at the end of \u01FA
 821 * Note here that accents checking are slow and cautioned in the API docs.
 822 * @param strsrch string search data
 823 * @param start offset of match
 824 * @param end end offset of the match
 825 * @return TRUE if there are accents on either side of the match,
 826 *         FALSE otherwise
 827 */
 828 static
 829 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
 830                                  int32_t    end)
 831 {
 832     if (strsrch->pattern.hasSuffixAccents) {
 833         const UChar       *text       = strsrch->search->text;
 834               int32_t  temp       = end;
 835               int32_t      textlength = strsrch->search->textLength;
 836         UTF_BACK_1(text, 0, temp);
 837         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
 838             int32_t             firstce  = strsrch->pattern.CE[0];
 839             UCollationElements *coleiter = strsrch->textIter;
 840             UErrorCode          status   = U_ZERO_ERROR;
 841             setColEIterOffset(coleiter, start);
 842             while (getCE(strsrch, ucol_next(coleiter, &status)) != firstce) {
 843                 if (U_FAILURE(status)) {
 844                     return TRUE;
 845                 }
 846             }
 847             int32_t count = 1;
 848             while (count < strsrch->pattern.CELength) {
 849                 if (getCE(strsrch, ucol_next(coleiter, &status))
 850                     == UCOL_IGNORABLE) {
 851                     // Thai can give an ignorable here.
 852                     count --;
 853                 }
 854                 if (U_FAILURE(status)) {
 855                     return TRUE;
 856                 }
 857                 count ++;
 858             }
 859             int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
 860             if (U_FAILURE(status)) {
 861                 return TRUE;
 862             }
 863             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
 864                 if (ucol_getOffset(coleiter) <= end) {
 865                     return TRUE;
 866                 }
 867                 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
 868                     return TRUE;
 869                 }
 870             }
 871         }
 872     }
 873     return FALSE;
 874 }
 875
 876 /**
 877 * Checks if the offset runs out of the text string
 878 * @param offset
 879 * @param textlength of the text string
 880 * @return TRUE if offset is out of bounds, FALSE otherwise
 881 */
 882 static
 883 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
 884 {
 885     return offset < 0 || offset > textlength;
 886 }
 887
 888 /**
 889 * Checks for identical match
 890 * @param strsrch string search data
 891 * @param start offset of possible match
 892 * @param end offset of possible match
 893 * @return TRUE if identical match is found
 894 */
 895 static
 896 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
 897                                   int32_t    end)
 898 {
 899     int32_t length = end - start;
 900     if (strsrch->strength != UCOL_IDENTICAL) {
 901         return TRUE;
 902     }
 903
 904     UErrorCode status = U_ZERO_ERROR;
 905     int decomplength = unorm_decompose(NULL, -1,
 906                                        strsrch->search->text + start, length,
 907                                        FALSE, 0, &status);
 908     if (decomplength != unorm_decompose(NULL, -1, strsrch->pattern.text,
 909                                         strsrch->pattern.textLength,
 910                                         FALSE, 0, &status)) {
 911         return FALSE;
 912     }
 913     decomplength ++;
 914     UChar *text    = (UChar *)uprv_malloc(decomplength * sizeof(UChar));
 915     UChar *pattern = (UChar *)uprv_malloc(decomplength * sizeof(UChar));
 916     unorm_decompose(text, decomplength, strsrch->search->text + start,
 917                     length, FALSE, 0, &status);
 918     unorm_decompose(pattern, decomplength, strsrch->pattern.text,
 919                     strsrch->pattern.textLength, FALSE, 0, &status);
 920     UBool result = (uprv_memcmp(pattern, text, decomplength * sizeof(UChar))
 921                     == 0);
 922     uprv_free(text);
 923     uprv_free(pattern);
 924     return result;
 925 }
 926
 927 /**
 928 * Checks to see if the match is repeated
 929 * @param strsrch string search data
 930 * @param start new match start index
 931 * @param end new match end index
 932 * @return TRUE if the the match is repeated, FALSE otherwise
 933 */
 934 static
 935 inline UBool checkRepeatedMatch(UStringSearch *strsrch,
 936                                 int32_t    start,
 937                                 int32_t    end)
 938 {
 939     int32_t lastmatchindex = strsrch->search->matchedIndex;
 940     UBool       result;
 941     if (lastmatchindex == USEARCH_DONE) {
 942         return FALSE;
 943     }
 944     if (strsrch->search->isForwardSearching) {
 945         result = start <= lastmatchindex;
 946     }
 947     else {
 948         result = start >= lastmatchindex;
 949     }
 950     if (!result && !strsrch->search->isOverlap) {
 951         if (strsrch->search->isForwardSearching) {
 952             result = start < lastmatchindex + strsrch->search->matchedLength;
 953         }
 954         else {
 955             result = end > lastmatchindex;
 956         }
 957     }
 958     return result;
 959 }
 960
 961 /**
 962 * Gets the collation element iterator's current offset.
 963 * @param coleiter collation element iterator
 964 * @param forwards flag TRUE if we are moving in th forwards direction
 965 * @return current offset
 966 */
 967 static
 968 inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
 969                                               UBool               forwards)
 970 {
 971     int32_t result = ucol_getOffset(coleiter);
 972     // intricacies of the the backwards collation element iterator
 973     if (!forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
 974         result ++;
 975     }
 976     return result;
 977 }
 978
 979 /**
 980 * Checks match for contraction.
 981 * If the match ends with a partial contraction we fail.
 982 * If the match starts too far off (because of backwards iteration) we try to
 983 * chip off the extra characters depending on whether a breakiterator has
 984 * been used.
 985 * Internal method, error assumed to be success, caller has to check status
 986 * before calling this method.
 987 * @param strsrch string search data
 988 * @param start offset of potential match, to be modified if necessary
 989 * @param end offset of potential match, to be modified if necessary
 990 * @param status output error status if any
 991 * @return TRUE if match passes the contraction test, FALSE otherwise
 992 */
 993
 994 static
 995 UBool checkNextExactContractionMatch(UStringSearch *strsrch,
 996                                      int32_t   *start,
 997                                      int32_t   *end, UErrorCode  *status)
 998 {
 999           UCollationElements *coleiter   = strsrch->textIter;
1000           int32_t             textlength = strsrch->search->textLength;
1001           int32_t         temp       = *start;
1002     const UCollator          *collator   = strsrch->collator;
1003     const UChar              *text       = strsrch->search->text;
1004     // This part checks if either ends of the match contains potential
1005     // contraction. If so we'll have to iterate through them
1006     // The start contraction needs to be checked since ucol_previous dumps
1007     // all characters till the first safe character into the buffer.
1008     // *start + 1 is used to test for the unsafe characters instead of *start
1009     // because ucol_prev takes all unsafe characters till the first safe
1010     // character ie *start. so by testing *start + 1, we can estimate if
1011     // excess prefix characters has been included in the potential search
1012     // results.
1013     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1014         (*start + 1 < textlength
1015          && ucol_unsafeCP(text[*start + 1], collator))) {
1016         int32_t expansion  = getExpansionPrefix(coleiter);
1017         UBool   expandflag = expansion > 0;
1018         setColEIterOffset(coleiter, *start);
1019         while (expansion > 0) {
1020             // getting rid of the redundant ce, caused by setOffset.
1021             // since backward contraction/expansion may have extra ces if we
1022             // are in the normalization buffer, hasAccentsBeforeMatch would
1023             // have taken care of it.
1024             // E.g. the character \u01FA will have an expansion of 3, but if
1025             // we are only looking for acute and ring \u030A and \u0301, we'll
1026             // have to skip the first ce in the expansion buffer.
1027             ucol_next(coleiter, status);
1028             if (U_FAILURE(*status)) {
1029                 return FALSE;
1030             }
1031             if (ucol_getOffset(coleiter) != temp) {
1032                 *start = temp;
1033                 temp  = ucol_getOffset(coleiter);
1034             }
1035             expansion --;
1036         }
1037
1038         int32_t  *patternce       = strsrch->pattern.CE;
1039         int32_t   patterncelength = strsrch->pattern.CELength;
1040         int32_t   count           = 0;
1041         while (count < patterncelength) {
1042             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1043             if (ce == UCOL_IGNORABLE) {
1044                 continue;
1045             }
1046             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1047                 *start = temp;
1048                 temp   = ucol_getOffset(coleiter);
1049             }
1050             if (U_FAILURE(*status) || ce != patternce[count]) {
1051                 (*end) ++;
1052                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1053                 return FALSE;
1054             }
1055             count ++;
1056         }
1057     }
1058     return TRUE;
1059 }
1060
1061 /**
1062 * Checks and sets the match information if found.
1063 * Checks
1064 * <ul>
1065 * <li> the potential match does not repeat the previous match
1066 * <li> boundaries are correct
1067 * <li> exact matches has no extra accents
1068 * <li> identical matchesb
1069 * <li> potential match does not end in the middle of a contraction
1070 * <\ul>
1071 * Otherwise the offset will be shifted to the next character.
1072 * Internal method, status assumed to be success, caller has to check status
1073 * before calling this method.
1074 * @param strsrch string search data
1075 * @param textoffset offset in the collation element text. the returned value
1076 *        will be the truncated end offset of the match or the new start
1077 *        search offset.
1078 * @param status output error status if any
1079 * @return TRUE if the match is valid, FALSE otherwise
1080 */
1081 static
1082 inline UBool checkNextExactMatch(UStringSearch *strsrch,
1083                                  int32_t   *textoffset, UErrorCode *status)
1084 {
1085     UCollationElements *coleiter = strsrch->textIter;
1086     int32_t         start    = getColElemIterOffset(coleiter, FALSE);
1087
1088     if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
1089         return FALSE;
1090     }
1091
1092     // this totally matches, however we need to check if it is repeating
1093     if (!isBreakUnit(strsrch, start, *textoffset) ||
1094         checkRepeatedMatch(strsrch, start, *textoffset) ||
1095         hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
1096         !checkIdentical(strsrch, start, *textoffset) ||
1097         hasAccentsAfterMatch(strsrch, start, *textoffset)) {
1098
1099         (*textoffset) ++;
1100         *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
1101         return FALSE;
1102     }
1103
1104     // totally match, we will get rid of the ending ignorables.
1105     strsrch->search->matchedIndex  = start;
1106     strsrch->search->matchedLength = *textoffset - start;
1107     return TRUE;
1108 }
1109
1110 /**
1111 * Getting the previous base character offset, or the current offset if the
1112 * current character is a base character
1113 * @param text string
1114 * @param textoffset one offset after the current character
1115 * @return the offset of the next character after the base character or the first
1116 *         composed character with accents
1117 */
1118 static
1119 inline int32_t getPreviousBaseOffset(const UChar       *text,
1120                                                int32_t  textoffset)
1121 {
1122     if (textoffset > 0) {
1123         while (TRUE) {
1124             int32_t result = textoffset;
1125             UTF_BACK_1(text, 0, textoffset);
1126             int32_t temp = textoffset;
1127             uint16_t fcd = getFCD(text, &temp, result);
1128             if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1129                 if (fcd & LAST_BYTE_MASK_) {
1130                     return textoffset;
1131                 }
1132                 return result;
1133             }
1134             if (textoffset == 0) {
1135                 return 0;
1136             }
1137         }
1138     }
1139     return textoffset;
1140 }
1141
1142 /**
1143 * Getting the indexes of the accents that are not blocked in the argument
1144 * accent array
1145 * @param accents array of accents in nfd terminated by a 0.
1146 * @param accentsindex array of indexes of the accents that are not blocked
1147 */
1148 static
1149 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
1150 {
1151     int32_t index     = 0;
1152     int32_t     length    = u_strlen(accents);
1153     UChar32     codepoint = 0;
1154     int         cclass    = 0;
1155     int         result    = 0;
1156     int32_t temp;
1157     while (index < length) {
1158         temp = index;
1159         UTF_NEXT_CHAR(accents, index, length, codepoint);
1160         if (u_getCombiningClass(codepoint) != cclass) {
1161             cclass        = u_getCombiningClass(codepoint);
1162             accentsindex[result] = temp;
1163             result ++;
1164         }
1165     }
1166     accentsindex[result] = length;
1167     return result;
1168 }
1169
1170 /**
1171 * Appends 3 UChar arrays to a destination array.
1172 * Creates a new array if we run out of space. The caller will have to
1173 * manually deallocate the newly allocated array.
1174 * Internal method, status assumed to be success, caller has to check status
1175 * before calling this method. destination not to be NULL and has at least
1176 * size destinationlength.
1177 * @param destination target array
1178 * @param destinationlength target array size, returning the appended length
1179 * @param source1 null-terminated first array
1180 * @param source2 second array
1181 * @param source2length length of seond array
1182 * @param source3 null-terminated third array
1183 * @param status error status if any
1184 * @return new destination array, destination if there was no new allocation
1185 */
1186 static
1187 inline UChar * addToUCharArray(      UChar      *destination,
1188                                      int32_t    *destinationlength,
1189                                const UChar      *source1,
1190                                const UChar      *source2,
1191                                      int32_t     source2length,
1192                                const UChar      *source3,
1193                                      UErrorCode *status)
1194 {
1195     int32_t source1length = source1 ? u_strlen(source1) : 0;
1196     int32_t source3length = source3 ? u_strlen(source3) : 0;
1197     if (*destinationlength < source1length + source2length + source3length +
1198                                                                            1)
1199     {
1200         destination = (UChar *)allocateMemory(
1201           (source1length + source2length + source3length + 1) * sizeof(UChar),
1202           status);
1203         // if error allocating memory, status will be
1204         // U_MEMORY_ALLOCATION_ERROR
1205         if (U_FAILURE(*status)) {
1206             *destinationlength = 0;
1207             return NULL;
1208         }
1209     }
1210     if (source1length != 0) {
1211         uprv_memcpy(destination, source1, sizeof(UChar) * source1length);
1212     }
1213     if (source2length != 0) {
1214         uprv_memcpy(destination + source1length, source2,
1215                     sizeof(UChar) * source2length);
1216     }
1217     if (source3length != 0) {
1218         uprv_memcpy(destination + source1length + source2length, source3,
1219                     sizeof(UChar) * source3length);
1220     }
1221     *destinationlength = source1length + source2length + source3length;
1222     return destination;
1223 }
1224
1225 /**
1226 * Running through a collation element iterator to see if the contents matches
1227 * pattern in string search data
1228 * @param strsrch string search data
1229 * @param coleiter collation element iterator
1230 * @return TRUE if a match if found, FALSE otherwise
1231 */
1232 static
1233 inline UBool checkCollationMatch(const UStringSearch      *strsrch,
1234                                        UCollationElements *coleiter)
1235 {
1236     int         patternceindex = strsrch->pattern.CELength;
1237     int32_t    *patternce      = strsrch->pattern.CE;
1238     UErrorCode  status = U_ZERO_ERROR;
1239     while (patternceindex > 0) {
1240         int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
1241         if (ce == UCOL_IGNORABLE) {
1242             continue;
1243         }
1244         if (U_FAILURE(status) || ce != *patternce) {
1245             return FALSE;
1246         }
1247         patternce ++;
1248         patternceindex --;
1249     }
1250     return TRUE;
1251 }
1252
1253 /**
1254 * Rearranges the front accents to try matching.
1255 * Prefix accents in the text will be grouped according to their combining
1256 * class and the groups will be mixed and matched to try find the perfect
1257 * match with the pattern.
1258 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1259 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1260 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1261 *         "\u0301\u0325".
1262 * step 2: check if any of the generated substrings matches the pattern.
1263 * Internal method, status is assumed to be success, caller has to check status
1264 * before calling this method.
1265 * @param strsrch string search match
1266 * @param start first offset of the accents to start searching
1267 * @param end start of the last accent set
1268 * @param status output error status if any
1269 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1270 *         offset of the match. Note this start includes all preceding accents.
1271 */
1272 static
1273 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
1274                                        int32_t    start,
1275                                        int32_t    end,
1276                                        UErrorCode    *status)
1277 {
1278     const UChar       *text       = strsrch->search->text;
1279           int32_t      textlength = strsrch->search->textLength;
1280           int32_t  tempstart  = start;
1281
1282     if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
1283         // die... failed at a base character
1284         return USEARCH_DONE;
1285     }
1286
1287     int32_t offset = getNextBaseOffset(text, tempstart, textlength);
1288     start = getPreviousBaseOffset(text, tempstart);
1289
1290     UChar       accents[INITIAL_ARRAY_SIZE_];
1291     // normalizing the offensive string
1292     unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
1293                     INITIAL_ARRAY_SIZE_, status);
1294     if (U_FAILURE(*status)) {
1295         return USEARCH_DONE;
1296     }
1297
1298     int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
1299     int32_t         accentsize = getUnblockedAccentIndex(accents,
1300                                                                  accentsindex);
1301     int32_t         count      = (2 << (accentsize - 1)) - 1;
1302     UChar               buffer[INITIAL_ARRAY_SIZE_];
1303     UCollationElements *coleiter   = strsrch->utilIter;
1304     while (U_SUCCESS(*status) && count > 0) {
1305         UChar *rearrange = strsrch->canonicalPrefixAccents;
1306         // copy the base characters
1307         for (int k = 0; k < accentsindex[0]; k ++) {
1308             *rearrange ++ = accents[k];
1309         }
1310         // forming all possible canonical rearrangement by dropping
1311         // sets of accents
1312         for (int i = 0; i <= accentsize - 1; i ++) {
1313             int32_t mask = 1 << (accentsize - i - 1);
1314             if (count & mask) {
1315                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1316                     *rearrange ++ = accents[j];
1317                 }
1318             }
1319         }
1320         *rearrange = 0;
1321         int32_t  matchsize = INITIAL_ARRAY_SIZE_;
1322         UChar   *match     = addToUCharArray(buffer, &matchsize,
1323                                            strsrch->canonicalPrefixAccents,
1324                                            strsrch->search->text + offset,
1325                                            end - offset,
1326                                            strsrch->canonicalSuffixAccents,
1327                                            status);
1328
1329         // if status is a failure, ucol_setText does nothing.
1330         // run the collator iterator through this match
1331         ucol_setText(coleiter, match, matchsize, status);
1332         if (U_SUCCESS(*status)) {
1333             if (checkCollationMatch(strsrch, coleiter)) {
1334                 if (match != buffer) {
1335                     uprv_free(match);
1336                 }
1337                 return start;
1338             }
1339         }
1340         count --;
1341     }
1342     return USEARCH_DONE;
1343 }
1344
1345 /**
1346 * Gets the offset to the safe point in text before textoffset.
1347 * ie. not the middle of a contraction, swappable characters or supplementary
1348 * characters.
1349 * @param collator collation sata
1350 * @param text string to work with
1351 * @param textoffset offset in string
1352 * @param textlength length of text string
1353 * @return offset to the previous safe character
1354 */
1355 static
1356 inline uint32_t getPreviousSafeOffset(const UCollator   *collator,
1357                                       const UChar       *text,
1358                                             int32_t  textoffset)
1359 {
1360     int32_t result = textoffset; // first contraction character
1361     while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
1362         result --;
1363     }
1364     if (result != 0) {
1365         // the first contraction character is consider unsafe here
1366         result --;
1367     }
1368     return result;
1369 }
1370
1371 /**
1372 * Cleaning up after we passed the safe zone
1373 * @param strsrch string search data
1374 * @param safetext safe text array
1375 * @param safebuffer safe text buffer
1376 * @param coleiter collation element iterator for safe text
1377 */
1378 static
1379 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
1380                                   UChar         *safebuffer)
1381 {
1382     if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
1383     {
1384        uprv_free(safetext);
1385     }
1386 }
1387
1388 /**
1389 * Take the rearranged end accents and tries matching. If match failed at
1390 * a seperate preceding set of accents (seperated from the rearranged on by
1391 * at least a base character) then we rearrange the preceding accents and
1392 * tries matching again.
1393 * We allow skipping of the ends of the accent set if the ces do not match.
1394 * However if the failure is found before the accent set, it fails.
1395 * Internal method, status assumed to be success, caller has to check status
1396 * before calling this method.
1397 * @param strsrch string search data
1398 * @param textoffset of the start of the rearranged accent
1399 * @param status output error status if any
1400 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1401 *         offset of the match. Note this start includes all preceding accents.
1402 */
1403 static
1404 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
1405                                        int32_t    textoffset,
1406                                        UErrorCode    *status)
1407 {
1408     const UChar              *text           = strsrch->search->text;
1409     const UCollator          *collator       = strsrch->collator;
1410           int32_t             safelength     = 0;
1411           UChar              *safetext;
1412           int32_t             safetextlength;
1413           UChar               safebuffer[INITIAL_ARRAY_SIZE_];
1414           UCollationElements *coleiter       = strsrch->utilIter;
1415           int32_t         safeoffset     = textoffset;
1416
1417     if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
1418                                          collator)) {
1419         safeoffset     = getPreviousSafeOffset(collator, text, textoffset);
1420         safelength     = textoffset - safeoffset;
1421         safetextlength = INITIAL_ARRAY_SIZE_;
1422         safetext       = addToUCharArray(safebuffer, &safetextlength, NULL,
1423                                          text + safeoffset, safelength,
1424                                          strsrch->canonicalSuffixAccents,
1425                                          status);
1426     }
1427     else {
1428         safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
1429         safetext       = strsrch->canonicalSuffixAccents;
1430     }
1431
1432     // if status is a failure, ucol_setText does nothing
1433     ucol_setText(coleiter, safetext, safetextlength, status);
1434     // status checked in loop below
1435
1436     int32_t  *ce        = strsrch->pattern.CE;
1437     int32_t   celength  = strsrch->pattern.CELength;
1438     int       ceindex   = celength - 1;
1439     UBool     isSafe    = TRUE; // indication flag for position in safe zone
1440
1441     while (ceindex >= 0) {
1442         int32_t textce = ucol_previous(coleiter, status);
1443         if (U_FAILURE(*status)) {
1444             if (isSafe) {
1445                 cleanUpSafeText(strsrch, safetext, safebuffer);
1446             }
1447             return USEARCH_DONE;
1448         }
1449         if (textce == UCOL_NULLORDER) {
1450             // check if we have passed the safe buffer
1451             if (coleiter == strsrch->textIter) {
1452                 cleanUpSafeText(strsrch, safetext, safebuffer);
1453                 return USEARCH_DONE;
1454             }
1455             cleanUpSafeText(strsrch, safetext, safebuffer);
1456             safetext = safebuffer;
1457             coleiter = strsrch->textIter;
1458             setColEIterOffset(coleiter, safeoffset);
1459             // status checked at the start of the loop
1460             isSafe = FALSE;
1461             continue;
1462         }
1463         textce = getCE(strsrch, textce);
1464         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
1465             // do the beginning stuff
1466             int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
1467             if (isSafe && failedoffset >= safelength) {
1468                 // alas... no hope. failed at rearranged accent set
1469                 cleanUpSafeText(strsrch, safetext, safebuffer);
1470                 return USEARCH_DONE;
1471             }
1472             else {
1473                 if (isSafe) {
1474                     failedoffset += safeoffset;
1475                     cleanUpSafeText(strsrch, safetext, safebuffer);
1476                 }
1477
1478                 // try rearranging the front accents
1479                 int32_t result = doNextCanonicalPrefixMatch(strsrch,
1480                                         failedoffset, textoffset, status);
1481                 if (result != USEARCH_DONE) {
1482                     // if status is a failure, ucol_setOffset does nothing
1483                     setColEIterOffset(strsrch->textIter, result);
1484                 }
1485                 if (U_FAILURE(*status)) {
1486                     return USEARCH_DONE;
1487                 }
1488                 return result;
1489             }
1490         }
1491         if (textce == ce[ceindex]) {
1492             ceindex --;
1493         }
1494     }
1495     // set offset here
1496     if (isSafe) {
1497         int32_t result     = getColElemIterOffset(coleiter, FALSE);
1498         // sets the text iterator here with the correct expansion and offset
1499         int32_t    leftoverces = getExpansionPrefix(coleiter);
1500         cleanUpSafeText(strsrch, safetext, safebuffer);
1501         if (result >= safelength) {
1502             result = textoffset;
1503         }
1504         else {
1505             result += safeoffset;
1506         }
1507         setColEIterOffset(strsrch->textIter, result);
1508         strsrch->textIter->iteratordata_.toReturn =
1509                        setExpansionPrefix(strsrch->textIter, leftoverces);
1510         return result;
1511     }
1512
1513     return ucol_getOffset(coleiter);
1514 }
1515
1516 /**
1517 * Trying out the substring and sees if it can be a canonical match.
1518 * This will try normalizing the end accents and arranging them into canonical
1519 * equivalents and check their corresponding ces with the pattern ce.
1520 * Suffix accents in the text will be grouped according to their combining
1521 * class and the groups will be mixed and matched to try find the perfect
1522 * match with the pattern.
1523 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1524 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1525 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1526 *         "\u0301\u0325".
1527 * step 2: check if any of the generated substrings matches the pattern.
1528 * Internal method, status assumed to be success, caller has to check status
1529 * before calling this method.
1530 * @param strsrch string search data
1531 * @param textoffset end offset in the collation element text that ends with
1532 *                   the accents to be rearranged
1533 * @param status error status if any
1534 * @return TRUE if the match is valid, FALSE otherwise
1535 */
1536 static
1537 UBool doNextCanonicalMatch(UStringSearch *strsrch,
1538                            int32_t    textoffset,
1539                            UErrorCode    *status)
1540 {
1541     const UChar       *text = strsrch->search->text;
1542           int32_t  temp = textoffset;
1543     UTF_BACK_1(text, 0, temp);
1544     if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
1545         UCollationElements *coleiter = strsrch->textIter;
1546         int32_t         offset   = getColElemIterOffset(coleiter, FALSE);
1547         if (strsrch->pattern.hasPrefixAccents) {
1548             offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
1549                                                 status);
1550             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
1551                 setColEIterOffset(coleiter, offset);
1552                 return TRUE;
1553             }
1554         }
1555         return FALSE;
1556     }
1557
1558     if (!strsrch->pattern.hasSuffixAccents) {
1559         return FALSE;
1560     }
1561
1562     UChar       accents[INITIAL_ARRAY_SIZE_];
1563     // offset to the last base character in substring to search
1564     int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
1565     // normalizing the offensive string
1566     unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
1567                                0, accents, INITIAL_ARRAY_SIZE_, status);
1568     // status checked in loop below
1569
1570     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1571     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
1572
1573     // 2 power n - 1 plus the full set of accents
1574     int32_t  count = (2 << (size - 1)) - 1;
1575     while (U_SUCCESS(*status) && count > 0) {
1576         UChar *rearrange = strsrch->canonicalSuffixAccents;
1577         // copy the base characters
1578         for (int k = 0; k < accentsindex[0]; k ++) {
1579             *rearrange ++ = accents[k];
1580         }
1581         // forming all possible canonical rearrangement by dropping
1582         // sets of accents
1583         for (int i = 0; i <= size - 1; i ++) {
1584             int32_t mask = 1 << (size - i - 1);
1585             if (count & mask) {
1586                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1587                     *rearrange ++ = accents[j];
1588                 }
1589             }
1590         }
1591         *rearrange = 0;
1592         int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
1593                                                         status);
1594         if (offset != USEARCH_DONE) {
1595             return TRUE; // match found
1596         }
1597         count --;
1598     }
1599     return FALSE;
1600 }
1601
1602 /**
1603 * Gets the previous base character offset depending on the string search
1604 * pattern data
1605 * @param strsrch string search data
1606 * @param textoffset current offset, current character
1607 * @return the offset of the next character after this base character or itself
1608 *         if it is a composed character with accents
1609 */
1610 static
1611 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
1612                                                       int32_t textoffset)
1613 {
1614     if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
1615         const UChar       *text = strsrch->search->text;
1616               int32_t  offset = textoffset;
1617         if (getFCD(text, &offset, strsrch->search->textLength) >>
1618                                                    SECOND_LAST_BYTE_SHIFT_) {
1619             return getPreviousBaseOffset(text, textoffset);
1620         }
1621     }
1622     return textoffset;
1623 }
1624
1625 /**
1626 * Checks match for contraction.
1627 * If the match ends with a partial contraction we fail.
1628 * If the match starts too far off (because of backwards iteration) we try to
1629 * chip off the extra characters
1630 * Internal method, status assumed to be success, caller has to check status
1631 * before calling this method.
1632 * @param strsrch string search data
1633 * @param start offset of potential match, to be modified if necessary
1634 * @param end offset of potential match, to be modified if necessary
1635 * @param status output error status if any
1636 * @return TRUE if match passes the contraction test, FALSE otherwise
1637 */
1638 static
1639 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
1640                                          int32_t   *start,
1641                                          int32_t   *end,
1642                                          UErrorCode    *status)
1643 {
1644           UCollationElements *coleiter   = strsrch->textIter;
1645           int32_t             textlength = strsrch->search->textLength;
1646           int32_t         temp       = *start;
1647     const UCollator          *collator   = strsrch->collator;
1648     const UChar              *text       = strsrch->search->text;
1649     // This part checks if either ends of the match contains potential
1650     // contraction. If so we'll have to iterate through them
1651     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1652         (*start + 1 < textlength
1653          && ucol_unsafeCP(text[*start + 1], collator))) {
1654         int32_t expansion  = getExpansionPrefix(coleiter);
1655         UBool   expandflag = expansion > 0;
1656         setColEIterOffset(coleiter, *start);
1657         while (expansion > 0) {
1658             // getting rid of the redundant ce, caused by setOffset.
1659             // since backward contraction/expansion may have extra ces if we
1660             // are in the normalization buffer, hasAccentsBeforeMatch would
1661             // have taken care of it.
1662             // E.g. the character \u01FA will have an expansion of 3, but if
1663             // we are only looking for acute and ring \u030A and \u0301, we'll
1664             // have to skip the first ce in the expansion buffer.
1665             ucol_next(coleiter, status);
1666             if (U_FAILURE(*status)) {
1667                 return FALSE;
1668             }
1669             if (ucol_getOffset(coleiter) != temp) {
1670                 *start = temp;
1671                 temp  = ucol_getOffset(coleiter);
1672             }
1673             expansion --;
1674         }
1675
1676         int32_t  *patternce       = strsrch->pattern.CE;
1677         int32_t   patterncelength = strsrch->pattern.CELength;
1678         int32_t   count           = 0;
1679         int32_t   textlength      = strsrch->search->textLength;
1680         while (count < patterncelength) {
1681             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1682             // status checked below, note that if status is a failure
1683             // ucol_next returns UCOL_NULLORDER
1684             if (ce == UCOL_IGNORABLE) {
1685                 continue;
1686             }
1687             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1688                 *start = temp;
1689                 temp   = ucol_getOffset(coleiter);
1690             }
1691
1692             if (count == 0 && ce != patternce[0]) {
1693                 // accents may have extra starting ces, this occurs when a
1694                 // pure accent pattern is matched without rearrangement
1695                 // text \u0325\u0300 and looking for \u0300
1696                 int32_t expected = patternce[0];
1697                 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
1698                     ce = getCE(strsrch, ucol_next(coleiter, status));
1699                     while (U_SUCCESS(*status) && ce != expected &&
1700                            ce != UCOL_NULLORDER &&
1701                            ucol_getOffset(coleiter) <= *end) {
1702                         ce = getCE(strsrch, ucol_next(coleiter, status));
1703                     }
1704                 }
1705             }
1706             if (U_FAILURE(*status) || ce != patternce[count]) {
1707                 (*end) ++;
1708                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1709                 return FALSE;
1710             }
1711             count ++;
1712         }
1713     }
1714     return TRUE;
1715 }
1716
1717 /**
1718 * Checks and sets the match information if found.
1719 * Checks
1720 * <ul>
1721 * <li> the potential match does not repeat the previous match
1722 * <li> boundaries are correct
1723 * <li> potential match does not end in the middle of a contraction
1724 * <li> identical matches
1725 * <\ul>
1726 * Otherwise the offset will be shifted to the next character.
1727 * Internal method, status assumed to be success, caller has to check the
1728 * status before calling this method.
1729 * @param strsrch string search data
1730 * @param textoffset offset in the collation element text. the returned value
1731 *        will be the truncated end offset of the match or the new start
1732 *        search offset.
1733 * @param status output error status if any
1734 * @return TRUE if the match is valid, FALSE otherwise
1735 */
1736 static
1737 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
1738                                      int32_t   *textoffset,
1739                                      UErrorCode    *status)
1740 {
1741     // to ensure that the start and ends are not composite characters
1742     UCollationElements *coleiter = strsrch->textIter;
1743     // if we have a canonical accent match
1744     if ((strsrch->pattern.hasSuffixAccents &&
1745         strsrch->canonicalSuffixAccents[0]) ||
1746         (strsrch->pattern.hasPrefixAccents &&
1747         strsrch->canonicalPrefixAccents[0])) {
1748         strsrch->search->matchedIndex  = getPreviousUStringSearchBaseOffset(
1749                                                     strsrch,
1750                                                     ucol_getOffset(coleiter));
1751         strsrch->search->matchedLength = *textoffset -
1752                                                 strsrch->search->matchedIndex;
1753         return TRUE;
1754     }
1755
1756     int32_t start = getColElemIterOffset(coleiter, FALSE);
1757     if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
1758                                             status) || U_FAILURE(*status)) {
1759         return FALSE;
1760     }
1761
1762     start = getPreviousUStringSearchBaseOffset(strsrch, start);
1763     // this totally matches, however we need to check if it is repeating
1764     if (checkRepeatedMatch(strsrch, start, *textoffset) ||
1765         !isBreakUnit(strsrch, start, *textoffset) ||
1766         !checkIdentical(strsrch, start, *textoffset)) {
1767         (*textoffset) ++;
1768         *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
1769                                         strsrch->search->textLength);
1770         return FALSE;
1771     }
1772
1773     strsrch->search->matchedIndex  = start;
1774     strsrch->search->matchedLength = *textoffset - start;
1775     return TRUE;
1776 }
1777
1778 /**
1779 * Shifting the collation element iterator position forward to prepare for
1780 * a preceding match. If the first character is a unsafe character, we'll only
1781 * shift by 1 to capture contractions, normalization etc.
1782 * Internal method, status assumed to be success, caller has to check status
1783 * before calling this method.
1784 * @param text strsrch string search data
1785 * @param textoffset start text position to do search
1786 * @param ce the text ce which failed the match.
1787 * @param patternceindex index of the ce within the pattern ce buffer which
1788 *        failed the match
1789 * @return final offset
1790 */
1791 static
1792 inline int32_t reverseShift(UStringSearch *strsrch,
1793                                 int32_t    textoffset,
1794                                 int32_t       ce,
1795                                 int32_t        patternceindex)
1796 {
1797     if (strsrch->search->isOverlap) {
1798         if (textoffset != strsrch->search->textLength) {
1799             textoffset --;
1800         }
1801         else {
1802             textoffset -= strsrch->pattern.defaultShiftSize;
1803         }
1804     }
1805     else {
1806         if (ce != UCOL_NULLORDER) {
1807             int32_t shift = strsrch->pattern.backShift[hash(ce)];
1808
1809             // this is to adjust for characters in the middle of the substring
1810             // for matching that failed.
1811             int32_t adjust = patternceindex;
1812             if (adjust > 1 && shift > adjust) {
1813                 shift -= adjust - 1;
1814             }
1815             textoffset -= shift;
1816         }
1817         else {
1818             textoffset -= strsrch->pattern.defaultShiftSize;
1819         }
1820     }
1821     textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
1822     return textoffset;
1823 }
1824
1825 /**
1826 * Checks match for contraction.
1827 * If the match starts with a partial contraction we fail.
1828 * Internal method, status assumed to be success, caller has to check status
1829 * before calling this method.
1830 * @param strsrch string search data
1831 * @param start offset of potential match, to be modified if necessary
1832 * @param end offset of potential match, to be modified if necessary
1833 * @param status output error status if any
1834 * @return TRUE if match passes the contraction test, FALSE otherwise
1835 */
1836 static
1837 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
1838                                      int32_t   *start,
1839                                      int32_t   *end, UErrorCode  *status)
1840 {
1841           UCollationElements *coleiter   = strsrch->textIter;
1842           int32_t             textlength = strsrch->search->textLength;
1843           int32_t             temp       = *end;
1844     const UCollator          *collator   = strsrch->collator;
1845     const UChar              *text       = strsrch->search->text;
1846     // This part checks if either if the start of the match contains potential
1847     // contraction. If so we'll have to iterate through them
1848     // Since we used ucol_next while previously looking for the potential
1849     // match, this guarantees that our end will not be a partial contraction,
1850     // or a partial supplementary character.
1851     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
1852         int32_t expansion  = getExpansionSuffix(coleiter);
1853         UBool   expandflag = expansion > 0;
1854         setColEIterOffset(coleiter, *end);
1855         while (U_SUCCESS(*status) && expansion > 0) {
1856             // getting rid of the redundant ce
1857             // since forward contraction/expansion may have extra ces
1858             // if we are in the normalization buffer, hasAccentsBeforeMatch
1859             // would have taken care of it.
1860             // E.g. the character \u01FA will have an expansion of 3, but if
1861             // we are only looking for A ring A\u030A, we'll have to skip the
1862             // last ce in the expansion buffer
1863             ucol_previous(coleiter, status);
1864             if (U_FAILURE(*status)) {
1865                 return FALSE;
1866             }
1867             if (ucol_getOffset(coleiter) != temp) {
1868                 *end = temp;
1869                 temp  = ucol_getOffset(coleiter);
1870             }
1871             expansion --;
1872         }
1873
1874         int32_t  *patternce       = strsrch->pattern.CE;
1875         int32_t   patterncelength = strsrch->pattern.CELength;
1876         int32_t   count           = patterncelength;
1877         while (count > 0) {
1878             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
1879             // status checked below, note that if status is a failure
1880             // ucol_previous returns UCOL_NULLORDER
1881             if (ce == UCOL_IGNORABLE) {
1882                 continue;
1883             }
1884             if (expandflag && count == 0 &&
1885                 getColElemIterOffset(coleiter, FALSE) != temp) {
1886                 *end = temp;
1887                 temp  = ucol_getOffset(coleiter);
1888             }
1889             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
1890                 (*start) --;
1891                 *start = getPreviousBaseOffset(text, *start);
1892                 return FALSE;
1893             }
1894             count --;
1895         }
1896     }
1897     return TRUE;
1898 }
1899
1900 /**
1901 * Checks and sets the match information if found.
1902 * Checks
1903 * <ul>
1904 * <li> the current match does not repeat the last match
1905 * <li> boundaries are correct
1906 * <li> exact matches has no extra accents
1907 * <li> identical matches
1908 * <\ul>
1909 * Otherwise the offset will be shifted to the preceding character.
1910 * Internal method, status assumed to be success, caller has to check status
1911 * before calling this method.
1912 * @param strsrch string search data
1913 * @param collator
1914 * @param coleiter collation element iterator
1915 * @param text string
1916 * @param textoffset offset in the collation element text. the returned value
1917 *        will be the truncated start offset of the match or the new start
1918 *        search offset.
1919 * @param status output error status if any
1920 * @return TRUE if the match is valid, FALSE otherwise
1921 */
1922 static
1923 inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
1924                                      int32_t   *textoffset,
1925                                      UErrorCode    *status)
1926 {
1927     // to ensure that the start and ends are not composite characters
1928     int32_t end = ucol_getOffset(strsrch->textIter);
1929     if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
1930         || U_FAILURE(*status)) {
1931             return FALSE;
1932     }
1933
1934     // this totally matches, however we need to check if it is repeating
1935     // the old match
1936     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
1937         !isBreakUnit(strsrch, *textoffset, end) ||
1938         hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
1939         !checkIdentical(strsrch, *textoffset, end) ||
1940         hasAccentsAfterMatch(strsrch, *textoffset, end)) {
1941         (*textoffset) --;
1942         *textoffset = getPreviousBaseOffset(strsrch->search->text,
1943                                             *textoffset);
1944         return FALSE;
1945     }
1946     strsrch->search->matchedIndex = *textoffset;
1947     strsrch->search->matchedLength = end - *textoffset;
1948     return TRUE;
1949 }
1950
1951 /**
1952 * Rearranges the end accents to try matching.
1953 * Suffix accents in the text will be grouped according to their combining
1954 * class and the groups will be mixed and matched to try find the perfect
1955 * match with the pattern.
1956 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1957 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1958 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1959 *         "\u0301\u0325".
1960 * step 2: check if any of the generated substrings matches the pattern.
1961 * Internal method, status assumed to be success, user has to check status
1962 * before calling this method.
1963 * @param strsrch string search match
1964 * @param start offset of the first base character
1965 * @param end start of the last accent set
1966 * @param status only error status if any
1967 * @return USEARCH_DONE if a match is not found, otherwise return the ending
1968 *         offset of the match. Note this start includes all following accents.
1969 */
1970 static
1971 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
1972                                            int32_t    start,
1973                                            int32_t    end,
1974                                            UErrorCode    *status)
1975 {
1976     const UChar       *text       = strsrch->search->text;
1977           int32_t  tempend    = end;
1978
1979     UTF_BACK_1(text, 0, tempend);
1980     if (!(getFCD(text, &tempend, strsrch->search->textLength) &
1981                                                            LAST_BYTE_MASK_)) {
1982         // die... failed at a base character
1983         return USEARCH_DONE;
1984     }
1985     end = getNextBaseOffset(text, end, strsrch->search->textLength);
1986
1987     if (U_SUCCESS(*status)) {
1988         UChar       accents[INITIAL_ARRAY_SIZE_];
1989         int32_t offset = getPreviousBaseOffset(text, end);
1990         // normalizing the offensive string
1991         unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
1992                         INITIAL_ARRAY_SIZE_, status);
1993
1994         int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
1995         int32_t         accentsize = getUnblockedAccentIndex(accents,
1996                                                          accentsindex);
1997         int32_t         count      = (2 << (accentsize - 1)) - 1;
1998         UChar               buffer[INITIAL_ARRAY_SIZE_];
1999         UCollationElements *coleiter = strsrch->utilIter;
2000         while (U_SUCCESS(*status) && count > 0) {
2001             UChar *rearrange = strsrch->canonicalSuffixAccents;
2002             // copy the base characters
2003             for (int k = 0; k < accentsindex[0]; k ++) {
2004                 *rearrange ++ = accents[k];
2005             }
2006             // forming all possible canonical rearrangement by dropping
2007             // sets of accents
2008             for (int i = 0; i <= accentsize - 1; i ++) {
2009                 int32_t mask = 1 << (accentsize - i - 1);
2010                 if (count & mask) {
2011                     for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2012                         *rearrange ++ = accents[j];
2013                     }
2014                 }
2015             }
2016             *rearrange = 0;
2017             int32_t  matchsize = INITIAL_ARRAY_SIZE_;
2018             UChar   *match     = addToUCharArray(buffer, &matchsize,
2019                                            strsrch->canonicalPrefixAccents,
2020                                            strsrch->search->text + start,
2021                                            offset - start,
2022                                            strsrch->canonicalSuffixAccents,
2023                                            status);
2024
2025             // run the collator iterator through this match
2026             // if status is a failure ucol_setText does nothing
2027             ucol_setText(coleiter, match, matchsize, status);
2028             if (U_SUCCESS(*status)) {
2029                 if (checkCollationMatch(strsrch, coleiter)) {
2030                     if (match != buffer) {
2031                         uprv_free(match);
2032                     }
2033                     return end;
2034                 }
2035             }
2036             count --;
2037         }
2038     }
2039     return USEARCH_DONE;
2040 }
2041
2042 /**
2043 * Take the rearranged start accents and tries matching. If match failed at
2044 * a seperate following set of accents (seperated from the rearranged on by
2045 * at least a base character) then we rearrange the preceding accents and
2046 * tries matching again.
2047 * We allow skipping of the ends of the accent set if the ces do not match.
2048 * However if the failure is found before the accent set, it fails.
2049 * Internal method, status assumed to be success, caller has to check status
2050 * before calling this method.
2051 * @param strsrch string search data
2052 * @param textoffset of the ends of the rearranged accent
2053 * @param status output error status if any
2054 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2055 *         offset of the match. Note this start includes all following accents.
2056 */
2057 static
2058 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
2059                                            int32_t    textoffset,
2060                                            UErrorCode    *status)
2061 {
2062     const UChar       *text       = strsrch->search->text;
2063     const UCollator   *collator   = strsrch->collator;
2064           int32_t      safelength = 0;
2065           UChar       *safetext;
2066           int32_t      safetextlength;
2067           UChar        safebuffer[INITIAL_ARRAY_SIZE_];
2068           int32_t  safeoffset = textoffset;
2069
2070     if (textoffset &&
2071         ucol_unsafeCP(strsrch->canonicalPrefixAccents[
2072                                  u_strlen(strsrch->canonicalPrefixAccents) - 1
2073                                          ], collator)) {
2074         safeoffset     = getNextSafeOffset(collator, text, textoffset,
2075                                            strsrch->search->textLength);
2076         safelength     = safeoffset - textoffset;
2077         safetextlength = INITIAL_ARRAY_SIZE_;
2078         safetext       = addToUCharArray(safebuffer, &safetextlength,
2079                                          strsrch->canonicalPrefixAccents,
2080                                          text + textoffset, safelength,
2081                                          NULL, status);
2082     }
2083     else {
2084         safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
2085         safetext       = strsrch->canonicalPrefixAccents;
2086     }
2087
2088     UCollationElements *coleiter = strsrch->utilIter;
2089      // if status is a failure, ucol_setText does nothing
2090     ucol_setText(coleiter, safetext, safetextlength, status);
2091     // status checked in loop below
2092
2093     int32_t  *ce           = strsrch->pattern.CE;
2094     int32_t   celength     = strsrch->pattern.CELength;
2095     int       ceindex      = 0;
2096     UBool     isSafe       = TRUE; // safe zone indication flag for position
2097     int32_t   prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
2098
2099     while (ceindex < celength) {
2100         int32_t textce = ucol_next(coleiter, status);
2101         if (U_FAILURE(*status)) {
2102             if (isSafe) {
2103                 cleanUpSafeText(strsrch, safetext, safebuffer);
2104             }
2105             return USEARCH_DONE;
2106         }
2107         if (textce == UCOL_NULLORDER) {
2108             // check if we have passed the safe buffer
2109             if (coleiter == strsrch->textIter) {
2110                 cleanUpSafeText(strsrch, safetext, safebuffer);
2111                 return USEARCH_DONE;
2112             }
2113             cleanUpSafeText(strsrch, safetext, safebuffer);
2114             safetext = safebuffer;
2115             coleiter = strsrch->textIter;
2116             setColEIterOffset(coleiter, safeoffset);
2117             // status checked at the start of the loop
2118             isSafe = FALSE;
2119             continue;
2120         }
2121         textce = getCE(strsrch, textce);
2122         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
2123             // do the beginning stuff
2124             int32_t failedoffset = ucol_getOffset(coleiter);
2125             if (isSafe && failedoffset <= prefixlength) {
2126                 // alas... no hope. failed at rearranged accent set
2127                 cleanUpSafeText(strsrch, safetext, safebuffer);
2128                 return USEARCH_DONE;
2129             }
2130             else {
2131                 if (isSafe) {
2132                     failedoffset = safeoffset - failedoffset;
2133                     cleanUpSafeText(strsrch, safetext, safebuffer);
2134                 }
2135
2136                 // try rearranging the end accents
2137                 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
2138                                         textoffset, failedoffset, status);
2139                 if (result != USEARCH_DONE) {
2140                     // if status is a failure, ucol_setOffset does nothing
2141                     setColEIterOffset(strsrch->textIter, result);
2142                 }
2143                 if (U_FAILURE(*status)) {
2144                     return USEARCH_DONE;
2145                 }
2146                 return result;
2147             }
2148         }
2149         if (textce == ce[ceindex]) {
2150             ceindex ++;
2151         }
2152     }
2153     // set offset here
2154     if (isSafe) {
2155         int32_t result      = ucol_getOffset(coleiter);
2156         // sets the text iterator here with the correct expansion and offset
2157         int32_t     leftoverces = getExpansionSuffix(coleiter);
2158         cleanUpSafeText(strsrch, safetext, safebuffer);
2159         if (result <= prefixlength) {
2160             result = textoffset;
2161         }
2162         else {
2163             result = textoffset + (safeoffset - result);
2164         }
2165         setColEIterOffset(strsrch->textIter, result);
2166         setExpansionSuffix(strsrch->textIter, leftoverces);
2167         return result;
2168     }
2169
2170     return ucol_getOffset(coleiter);
2171 }
2172
2173 /**
2174 * Trying out the substring and sees if it can be a canonical match.
2175 * This will try normalizing the starting accents and arranging them into
2176 * canonical equivalents and check their corresponding ces with the pattern ce.
2177 * Prefix accents in the text will be grouped according to their combining
2178 * class and the groups will be mixed and matched to try find the perfect
2179 * match with the pattern.
2180 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2181 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2182 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2183 *         "\u0301\u0325".
2184 * step 2: check if any of the generated substrings matches the pattern.
2185 * Internal method, status assumed to be success, caller has to check status
2186 * before calling this method.
2187 * @param strsrch string search data
2188 * @param textoffset start offset in the collation element text that starts
2189 *                   with the accents to be rearranged
2190 * @param status output error status if any
2191 * @return TRUE if the match is valid, FALSE otherwise
2192 */
2193 static
2194 UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
2195                                int32_t    textoffset,
2196                                UErrorCode    *status)
2197 {
2198     const UChar       *text       = strsrch->search->text;
2199           int32_t  temp       = textoffset;
2200           int32_t      textlength = strsrch->search->textLength;
2201     if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
2202         UCollationElements *coleiter = strsrch->textIter;
2203         int32_t         offset   = ucol_getOffset(coleiter);
2204         if (strsrch->pattern.hasSuffixAccents) {
2205             offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
2206                                                     offset, status);
2207             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
2208                 setColEIterOffset(coleiter, offset);
2209                 return TRUE;
2210             }
2211         }
2212         return FALSE;
2213     }
2214
2215     if (!strsrch->pattern.hasPrefixAccents) {
2216         return FALSE;
2217     }
2218
2219     UChar       accents[INITIAL_ARRAY_SIZE_];
2220     // offset to the last base character in substring to search
2221     int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
2222     // normalizing the offensive string
2223     unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
2224                                0, accents, INITIAL_ARRAY_SIZE_, status);
2225     // status checked in loop
2226
2227     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2228     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
2229
2230     // 2 power n - 1 plus the full set of accents
2231     int32_t  count = (2 << (size - 1)) - 1;
2232     while (U_SUCCESS(*status) && count > 0) {
2233         UChar *rearrange = strsrch->canonicalPrefixAccents;
2234         // copy the base characters
2235         for (int k = 0; k < accentsindex[0]; k ++) {
2236             *rearrange ++ = accents[k];
2237         }
2238         // forming all possible canonical rearrangement by dropping
2239         // sets of accents
2240         for (int i = 0; i <= size - 1; i ++) {
2241             int32_t mask = 1 << (size - i - 1);
2242             if (count & mask) {
2243                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2244                     *rearrange ++ = accents[j];
2245                 }
2246             }
2247         }
2248         *rearrange = 0;
2249         int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
2250                                                           baseoffset, status);
2251         if (offset != USEARCH_DONE) {
2252             return TRUE; // match found
2253         }
2254         count --;
2255     }
2256     return FALSE;
2257 }
2258
/**
* Checks match for contraction.
* If the match starts with a partial contraction we fail.
* The check only runs when ucol_unsafeCP reports the code unit at *start as
* unsafe for the collator. In that case the text iterator is repositioned at
* *end and walked backwards with ucol_previous, comparing every
* non-ignorable ce against the pattern ces from the last one to the first.
* On a mismatch (or an error) *start is moved back to the previous base
* offset and FALSE is returned.
* Internal method, status assumed to be success, caller has to check status
* before calling this method.
* @param strsrch string search data
* @param start offset of potential match, to be modified if necessary
* @param end offset of potential match, to be modified if necessary
* @param status output error status if any
* @return TRUE if match passes the contraction test, FALSE otherwise
*/
static
UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
                                     int32_t   *start,
                                     int32_t   *end, UErrorCode  *status)
{
          UCollationElements *coleiter   = strsrch->textIter;
          int32_t             textlength = strsrch->search->textLength;
          // temp tracks the last iterator offset seen while walking backwards
          int32_t         temp       = *end;
    const UCollator          *collator   = strsrch->collator;
    const UChar              *text       = strsrch->search->text;
    // This part checks whether the start of the match contains a potential
    // contraction. If so we'll have to iterate through them
    // Since we used ucol_next while previously looking for the potential
    // match, this guarantees that our end will not be a partial contraction,
    // or a partial supplementary character.
    if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
        // number of expansion ces left over in the iterator, read before the
        // iterator is repositioned
        int32_t expansion  = getExpansionSuffix(coleiter);
        UBool   expandflag = expansion > 0;
        setColEIterOffset(coleiter, *end);
        while (expansion > 0) {
            // getting rid of the redundant ce
            // since forward contraction/expansion may have extra ces
            // if we are in the normalization buffer, hasAccentsBeforeMatch
            // would have taken care of it.
            // E.g. the character \u01FA will have an expansion of 3, but if
            // we are only looking for A ring A\u030A, we'll have to skip the
            // last ce in the expansion buffer
            ucol_previous(coleiter, status);
            if (U_FAILURE(*status)) {
                return FALSE;
            }
            // the iterator moved to a new text offset: the previous offset
            // becomes the end of the match
            if (ucol_getOffset(coleiter) != temp) {
                *end = temp;
                temp  = ucol_getOffset(coleiter);
            }
            expansion --;
        }

        int32_t  *patternce       = strsrch->pattern.CE;
        int32_t   patterncelength = strsrch->pattern.CELength;
        // pattern ces are compared from the last (index count - 1) to the
        // first
        int32_t   count           = patterncelength;
        while (count > 0) {
            int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
            // status checked below, note that if status is a failure
            // ucol_previous returns UCOL_NULLORDER
            if (ce == UCOL_IGNORABLE) {
                continue;
            }
            // NOTE(review): count is always > 0 inside this loop, so the
            // condition below can never be true and this branch looks
            // unreachable - confirm the intended condition.
            if (expandflag && count == 0 &&
                getColElemIterOffset(coleiter, FALSE) != temp) {
                *end = temp;
                temp  = ucol_getOffset(coleiter);
            }
            // only on the first comparison: the last text ce does not match
            // the last pattern ce
            if (count == patterncelength &&
                ce != patternce[patterncelength - 1]) {
                // accents may have extra starting ces, this occurs when a
                // pure accent pattern is matched without rearrangement
                int32_t    expected = patternce[patterncelength - 1];
                // NOTE(review): UTF_BACK_1 and getFCD both appear to adjust
                // *end in place - confirm against their definitions
                UTF_BACK_1(text, 0, *end);
                if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
                    // skip ces until the expected one is found, the iterator
                    // is exhausted or it is positioned after *start
                    ce = getCE(strsrch, ucol_previous(coleiter, status));
                    while (U_SUCCESS(*status) && ce != expected &&
                           ce != UCOL_NULLORDER &&
                           ucol_getOffset(coleiter) <= *start) {
                        ce = getCE(strsrch, ucol_previous(coleiter, status));
                    }
                }
            }
            if (U_FAILURE(*status) || ce != patternce[count - 1]) {
                // mismatch: shift the start back to the previous base
                // offset so the caller continues searching from there
                (*start) --;
                *start = getPreviousBaseOffset(text, *start);
                return FALSE;
            }
            count --;
        }
    }
    return TRUE;
}
2348
2349 /**
2350 * Checks and sets the match information if found.
2351 * Checks
2352 * <ul>
2353 * <li> the potential match does not repeat the previous match
2354 * <li> boundaries are correct
2355 * <li> potential match does not end in the middle of a contraction
2356 * <li> identical matches
2357 * <\ul>
2358 * Otherwise the offset will be shifted to the next character.
2359 * Internal method, status assumed to be success, caller has to check status
2360 * before calling this method.
2361 * @param strsrch string search data
2362 * @param textoffset offset in the collation element text. the returned value
2363 *        will be the truncated start offset of the match or the new start
2364 *        search offset.
2365 * @param status only error status if any
2366 * @return TRUE if the match is valid, FALSE otherwise
2367 */
2368 static
2369 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
2370                                          int32_t   *textoffset,
2371                                          UErrorCode    *status)
2372 {
2373     // to ensure that the start and ends are not composite characters
2374     UCollationElements *coleiter = strsrch->textIter;
2375     // if we have a canonical accent match
2376     if ((strsrch->pattern.hasSuffixAccents &&
2377         strsrch->canonicalSuffixAccents[0]) ||
2378         (strsrch->pattern.hasPrefixAccents &&
2379         strsrch->canonicalPrefixAccents[0])) {
2380         strsrch->search->matchedIndex  = *textoffset;
2381         strsrch->search->matchedLength =
2382             getNextUStringSearchBaseOffset(strsrch,
2383                                       getColElemIterOffset(coleiter, FALSE))
2384             - *textoffset;
2385         return TRUE;
2386     }
2387
2388     int32_t end = ucol_getOffset(coleiter);
2389     if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
2390                                                 status) ||
2391          U_FAILURE(*status)) {
2392         return FALSE;
2393     }
2394
2395     end = getNextUStringSearchBaseOffset(strsrch, end);
2396     // this totally matches, however we need to check if it is repeating
2397     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2398         !isBreakUnit(strsrch, *textoffset, end) ||
2399         !checkIdentical(strsrch, *textoffset, end)) {
2400         (*textoffset) --;
2401         *textoffset = getPreviousBaseOffset(strsrch->search->text,
2402                                             *textoffset);
2403         return FALSE;
2404     }
2405
2406     strsrch->search->matchedIndex  = *textoffset;
2407     strsrch->search->matchedLength = end - *textoffset;
2408     return TRUE;
2409 }
2410
2411 // constructors and destructor -------------------------------------------
2412
2413 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
2414                                           int32_t         patternlength,
2415                                     const UChar          *text,
2416                                           int32_t         textlength,
2417                                     const char           *locale,
2418                                           UBreakIterator *breakiter,
2419                                           UErrorCode     *status)
2420 {
2421     if (U_FAILURE(*status)) {
2422         return NULL;
2423     }
2424 #if UCONFIG_NO_BREAK_ITERATION
2425     if (breakiter != NULL) {
2426         *status = U_UNSUPPORTED_ERROR;
2427         return NULL;
2428     }
2429 #endif
2430     if (locale) {
2431         // ucol_open internally checks for status
2432         UCollator     *collator = ucol_open(locale, status);
2433         // pattern, text checks are done in usearch_openFromCollator
2434         UStringSearch *result   = usearch_openFromCollator(pattern,
2435                                               patternlength, text, textlength,
2436                                               collator, breakiter, status);
2437
2438         if (result == NULL || U_FAILURE(*status)) {
2439             if (collator) {
2440                 ucol_close(collator);
2441             }
2442             return NULL;
2443         }
2444         else {
2445             result->ownCollator = TRUE;
2446         }
2447         return result;
2448     }
2449     *status = U_ILLEGAL_ARGUMENT_ERROR;
2450     return NULL;
2451 }
2452
2453 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
2454                                   const UChar          *pattern,
2455                                         int32_t         patternlength,
2456                                   const UChar          *text,
2457                                         int32_t         textlength,
2458                                   const UCollator      *collator,
2459                                         UBreakIterator *breakiter,
2460                                         UErrorCode     *status)
2461 {
2462     if (U_FAILURE(*status)) {
2463         return NULL;
2464     }
2465 #if UCONFIG_NO_BREAK_ITERATION
2466     if (breakiter != NULL) {
2467         *status = U_UNSUPPORTED_ERROR;
2468         return NULL;
2469     }
2470 #endif
2471     if (pattern == NULL || text == NULL || collator == NULL) {
2472         *status = U_ILLEGAL_ARGUMENT_ERROR;
2473     }
2474
2475     // string search does not really work when numeric collation is turned on
2476     if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
2477         *status = U_UNSUPPORTED_ERROR;
2478     }
2479
2480     if (U_SUCCESS(*status)) {
2481         initializeFCD(status);
2482         if (U_FAILURE(*status)) {
2483             return NULL;
2484         }
2485
2486         UStringSearch *result;
2487         if (textlength == -1) {
2488             textlength = u_strlen(text);
2489         }
2490         if (patternlength == -1) {
2491             patternlength = u_strlen(pattern);
2492         }
2493         if (textlength <= 0 || patternlength <= 0) {
2494             *status = U_ILLEGAL_ARGUMENT_ERROR;
2495             return NULL;
2496         }
2497
2498         result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
2499         if (result == NULL) {
2500             *status = U_MEMORY_ALLOCATION_ERROR;
2501             return NULL;
2502         }
2503
2504         result->collator    = collator;
2505         result->strength    = ucol_getStrength(collator);
2506         result->ceMask      = getMask(result->strength);
2507         result->toShift     =
2508              ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2509                                                             UCOL_SHIFTED;
2510         result->variableTop = ucol_getVariableTop(collator, status);
2511
2512         if (U_FAILURE(*status)) {
2513             uprv_free(result);
2514             return NULL;
2515         }
2516
2517         result->search             = (USearch *)uprv_malloc(sizeof(USearch));
2518         if (result->search == NULL) {
2519             *status = U_MEMORY_ALLOCATION_ERROR;
2520             uprv_free(result);
2521             return NULL;
2522         }
2523
2524         result->search->text       = text;
2525         result->search->textLength = textlength;
2526
2527         result->pattern.text       = pattern;
2528         result->pattern.textLength = patternlength;
2529         result->pattern.CE         = NULL;
2530
2531         result->search->breakIter  = breakiter;
2532 #if !UCONFIG_NO_BREAK_ITERATION
2533         if (breakiter) {
2534             ubrk_setText(breakiter, text, textlength, status);
2535         }
2536 #endif
2537
2538         result->ownCollator           = FALSE;
2539         result->search->matchedLength = 0;
2540         result->search->matchedIndex  = USEARCH_DONE;
2541         result->textIter              = ucol_openElements(collator, text,
2542                                                           textlength, status);
2543         if (U_FAILURE(*status)) {
2544             usearch_close(result);
2545             return NULL;
2546         }
2547
2548         result->utilIter              = NULL;
2549
2550         result->search->isOverlap          = FALSE;
2551         result->search->isCanonicalMatch   = FALSE;
2552         result->search->isForwardSearching = TRUE;
2553         result->search->reset              = TRUE;
2554
2555         initialize(result, status);
2556
2557         if (U_FAILURE(*status)) {
2558             usearch_close(result);
2559             return NULL;
2560         }
2561
2562         return result;
2563     }
2564     return NULL;
2565 }
2566
2567 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
2568 {
2569     if (strsrch) {
2570         if (strsrch->pattern.CE != strsrch->pattern.CEBuffer &&
2571             strsrch->pattern.CE) {
2572             uprv_free(strsrch->pattern.CE);
2573         }
2574         ucol_closeElements(strsrch->textIter);
2575         ucol_closeElements(strsrch->utilIter);
2576         if (strsrch->ownCollator && strsrch->collator) {
2577             ucol_close((UCollator *)strsrch->collator);
2578         }
2579         uprv_free(strsrch->search);
2580         uprv_free(strsrch);
2581     }
2582 }
2583
2584 // set and get methods --------------------------------------------------
2585
2586 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
2587                                         int32_t    position,
2588                                         UErrorCode    *status)
2589 {
2590     if (U_SUCCESS(*status) && strsrch) {
2591         if (isOutOfBounds(strsrch->search->textLength, position)) {
2592             *status = U_INDEX_OUTOFBOUNDS_ERROR;
2593         }
2594         else {
2595             setColEIterOffset(strsrch->textIter, position);
2596         }
2597         strsrch->search->matchedIndex  = USEARCH_DONE;
2598         strsrch->search->matchedLength = 0;
2599         strsrch->search->reset         = FALSE;
2600     }
2601 }
2602
2603 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
2604 {
2605     if (strsrch) {
2606         int32_t result = ucol_getOffset(strsrch->textIter);
2607         if (isOutOfBounds(strsrch->search->textLength, result)) {
2608             return USEARCH_DONE;
2609         }
2610         return result;
2611     }
2612     return USEARCH_DONE;
2613 }
2614
2615 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
2616                                  USearchAttribute attribute,
2617                                  USearchAttributeValue value,
2618                                  UErrorCode *status)
2619 {
2620     if (U_SUCCESS(*status) && strsrch) {
2621         switch (attribute)
2622         {
2623         case USEARCH_OVERLAP :
2624             strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
2625             break;
2626         case USEARCH_CANONICAL_MATCH :
2627             strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
2628                                                                       FALSE);
2629             break;
2630         case USEARCH_ATTRIBUTE_COUNT :
2631         default:
2632             *status = U_ILLEGAL_ARGUMENT_ERROR;
2633         }
2634     }
2635     if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
2636         *status = U_ILLEGAL_ARGUMENT_ERROR;
2637     }
2638 }
2639
2640 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
2641                                                 const UStringSearch *strsrch,
2642                                                 USearchAttribute attribute)
2643 {
2644     if (strsrch) {
2645         switch (attribute) {
2646         case USEARCH_OVERLAP :
2647             return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
2648                                                         USEARCH_OFF);
2649         case USEARCH_CANONICAL_MATCH :
2650             return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
2651                                                                USEARCH_OFF);
2652         case USEARCH_ATTRIBUTE_COUNT :
2653             return USEARCH_DEFAULT;
2654         }
2655     }
2656     return USEARCH_DEFAULT;
2657 }
2658
2659 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
2660                                                 const UStringSearch *strsrch)
2661 {
2662     if (strsrch == NULL) {
2663         return USEARCH_DONE;
2664     }
2665     return strsrch->search->matchedIndex;
2666 }
2667
2668
2669 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
2670                                             UChar         *result,
2671                                             int32_t        resultCapacity,
2672                                             UErrorCode    *status)
2673 {
2674     if (U_FAILURE(*status)) {
2675         return USEARCH_DONE;
2676     }
2677     if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
2678         result == NULL)) {
2679         *status = U_ILLEGAL_ARGUMENT_ERROR;
2680         return USEARCH_DONE;
2681     }
2682
2683     int32_t     copylength = strsrch->search->matchedLength;
2684     int32_t copyindex  = strsrch->search->matchedIndex;
2685     if (copyindex == USEARCH_DONE) {
2686         u_terminateUChars(result, resultCapacity, 0, status);
2687         return USEARCH_DONE;
2688     }
2689
2690     if (resultCapacity < copylength) {
2691         copylength = resultCapacity;
2692     }
2693     if (copylength > 0) {
2694         uprv_memcpy(result, strsrch->search->text + copyindex,
2695                     copylength * sizeof(UChar));
2696     }
2697     return u_terminateUChars(result, resultCapacity,
2698                              strsrch->search->matchedLength, status);
2699 }
2700
2701 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
2702                                               const UStringSearch *strsrch)
2703 {
2704     if (strsrch) {
2705         return strsrch->search->matchedLength;
2706     }
2707     return USEARCH_DONE;
2708 }
2709
2710 #if !UCONFIG_NO_BREAK_ITERATION
2711
2712 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch  *strsrch,
2713                                                UBreakIterator *breakiter,
2714                                                UErrorCode     *status)
2715 {
2716     if (U_SUCCESS(*status) && strsrch) {
2717         strsrch->search->breakIter = breakiter;
2718         if (breakiter) {
2719             ubrk_setText(breakiter, strsrch->search->text,
2720                          strsrch->search->textLength, status);
2721         }
2722     }
2723 }
2724
2725 U_CAPI const UBreakIterator* U_EXPORT2
2726 usearch_getBreakIterator(const UStringSearch *strsrch)
2727 {
2728     if (strsrch) {
2729         return strsrch->search->breakIter;
2730     }
2731     return NULL;
2732 }
2733
2734 #endif
2735
2736 U_CAPI void U_EXPORT2 usearch_setText(      UStringSearch *strsrch,
2737                                       const UChar         *text,
2738                                             int32_t        textlength,
2739                                             UErrorCode    *status)
2740 {
2741     if (U_SUCCESS(*status)) {
2742         if (strsrch == NULL || text == NULL || textlength < -1 ||
2743             textlength == 0) {
2744             *status = U_ILLEGAL_ARGUMENT_ERROR;
2745         }
2746         else {
2747             if (textlength == -1) {
2748                 textlength = u_strlen(text);
2749             }
2750             strsrch->search->text       = text;
2751             strsrch->search->textLength = textlength;
2752             ucol_setText(strsrch->textIter, text, textlength, status);
2753             strsrch->search->matchedIndex  = USEARCH_DONE;
2754             strsrch->search->matchedLength = 0;
2755             strsrch->search->reset         = TRUE;
2756 #if !UCONFIG_NO_BREAK_ITERATION
2757             if (strsrch->search->breakIter != NULL) {
2758                 ubrk_setText(strsrch->search->breakIter, text,
2759                              textlength, status);
2760             }
2761 #endif
2762         }
2763     }
2764 }
2765
2766 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
2767                                                      int32_t       *length)
2768 {
2769     if (strsrch) {
2770         *length = strsrch->search->textLength;
2771         return strsrch->search->text;
2772     }
2773     return NULL;
2774 }
2775
2776 U_CAPI void U_EXPORT2 usearch_setCollator(      UStringSearch *strsrch,
2777                                           const UCollator     *collator,
2778                                                 UErrorCode    *status)
2779 {
2780     if (U_SUCCESS(*status)) {
2781         if (collator == NULL) {
2782             *status = U_ILLEGAL_ARGUMENT_ERROR;
2783             return;
2784         }
2785         if (strsrch) {
2786             if (strsrch->ownCollator && (strsrch->collator != collator)) {
2787                 ucol_close((UCollator *)strsrch->collator);
2788                 strsrch->ownCollator = FALSE;
2789             }
2790             strsrch->collator    = collator;
2791             strsrch->strength    = ucol_getStrength(collator);
2792             strsrch->ceMask      = getMask(strsrch->strength);
2793             // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
2794             strsrch->toShift     =
2795                ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2796                                                                 UCOL_SHIFTED;
2797             // if status is a failure, ucol_getVariableTop returns 0
2798             strsrch->variableTop = ucol_getVariableTop(collator, status);
2799             if (U_SUCCESS(*status)) {
2800                 initialize(strsrch, status);
2801                 if (U_SUCCESS(*status)) {
2802                     uprv_init_collIterate(collator, strsrch->search->text,
2803                                           strsrch->search->textLength,
2804                                           &(strsrch->textIter->iteratordata_));
2805                     strsrch->utilIter->iteratordata_.coll = collator;
2806                 }
2807             }
2808         }
2809     }
2810 }
2811
2812 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
2813 {
2814     if (strsrch) {
2815         return (UCollator *)strsrch->collator;
2816     }
2817     return NULL;
2818 }
2819
2820 U_CAPI void U_EXPORT2 usearch_setPattern(      UStringSearch *strsrch,
2821                                          const UChar         *pattern,
2822                                                int32_t        patternlength,
2823                                                UErrorCode    *status)
2824 {
2825     if (U_SUCCESS(*status)) {
2826         if (strsrch == NULL || pattern == NULL) {
2827             *status = U_ILLEGAL_ARGUMENT_ERROR;
2828         }
2829         else {
2830             if (patternlength == -1) {
2831                 patternlength = u_strlen(pattern);
2832             }
2833             if (patternlength == 0) {
2834                 *status = U_ILLEGAL_ARGUMENT_ERROR;
2835                 return;
2836             }
2837             strsrch->pattern.text       = pattern;
2838             strsrch->pattern.textLength = patternlength;
2839             initialize(strsrch, status);
2840         }
2841     }
2842 }
2843
2844 U_CAPI const UChar* U_EXPORT2
2845 usearch_getPattern(const UStringSearch *strsrch,
2846                    int32_t       *length)
2847 {
2848     if (strsrch) {
2849         *length = strsrch->pattern.textLength;
2850         return strsrch->pattern.text;
2851     }
2852     return NULL;
2853 }
2854
// miscellaneous methods -------------------------------------------------
2856
2857 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
2858                                            UErrorCode    *status)
2859 {
2860     if (strsrch && U_SUCCESS(*status)) {
2861         strsrch->search->isForwardSearching = TRUE;
2862         usearch_setOffset(strsrch, 0, status);
2863         if (U_SUCCESS(*status)) {
2864             return usearch_next(strsrch, status);
2865         }
2866     }
2867     return USEARCH_DONE;
2868 }
2869
2870 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
2871                                                int32_t    position,
2872                                                UErrorCode    *status)
2873 {
2874     if (strsrch && U_SUCCESS(*status)) {
2875         strsrch->search->isForwardSearching = TRUE;
2876         // position checked in usearch_setOffset
2877         usearch_setOffset(strsrch, position, status);
2878         if (U_SUCCESS(*status)) {
2879             return usearch_next(strsrch, status);
2880         }
2881     }
2882     return USEARCH_DONE;
2883 }
2884
2885 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
2886                                           UErrorCode    *status)
2887 {
2888     if (strsrch && U_SUCCESS(*status)) {
2889         strsrch->search->isForwardSearching = FALSE;
2890         usearch_setOffset(strsrch, strsrch->search->textLength, status);
2891         if (U_SUCCESS(*status)) {
2892             return usearch_previous(strsrch, status);
2893         }
2894     }
2895     return USEARCH_DONE;
2896 }
2897
2898 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
2899                                                int32_t    position,
2900                                                UErrorCode    *status)
2901 {
2902     if (strsrch && U_SUCCESS(*status)) {
2903         strsrch->search->isForwardSearching = FALSE;
2904         // position checked in usearch_setOffset
2905         usearch_setOffset(strsrch, position, status);
2906         if (U_SUCCESS(*status)) {
2907             return usearch_previous(strsrch, status);
2908         }
2909     }
2910     return USEARCH_DONE;
2911 }
2912
2913 /**
2914 * If a direction switch is required, we'll count the number of ces till the
2915 * beginning of the collation element iterator and iterate forwards that
2916 * number of times. This is so that we get to the correct point within the
2917 * string to continue the search in. Imagine when we are in the middle of the
* normalization buffer when the change in direction is requested. arrrgghh....
2919 * After searching the offset within the collation element iterator will be
2920 * shifted to the start of the match. If a match is not found, the offset would
2921 * have been set to the end of the text string in the collation element
2922 * iterator.
2923 * Okay, here's my take on normalization buffer. The only time when there can
* be 2 matches within the same normalization buffer is when the pattern consists
2925 * of all accents. But since the offset returned is from the text string, we
2926 * should not confuse the caller by returning the second match within the
2927 * same normalization buffer. If we do, the 2 results will have the same match
2928 * offsets, and that'll be confusing. I'll return the next match that doesn't
2929 * fall within the same normalization buffer. Note this does not affect the
2930 * results of matches spanning the text and the normalization buffer.
2931 * The position to start searching is taken from the collation element
2932 * iterator. Callers of this API would have to set the offset in the collation
2933 * element iterator before using this method.
2934 */
2935 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
2936                                           UErrorCode    *status)
2937 {
2938     if (U_SUCCESS(*status) && strsrch) {
2939         // note offset is either equivalent to the start of the previous match
2940         // or is set by the user
2941         int32_t      offset       = usearch_getOffset(strsrch);
2942         USearch     *search       = strsrch->search;
2943         search->reset             = FALSE;
2944         int32_t      textlength   = search->textLength;
2945         if (search->isForwardSearching) {
2946             if (offset == textlength
2947                 || (!search->isOverlap &&
2948                     (offset + strsrch->pattern.defaultShiftSize > textlength ||
2949                     (search->matchedIndex != USEARCH_DONE &&
2950                      offset + search->matchedLength >= textlength)))) {
2951                 // not enough characters to match
2952                 setMatchNotFound(strsrch);
2953                 return USEARCH_DONE;
2954             }
2955         }
2956         else {
2957             // switching direction.
2958             // if matchedIndex == USEARCH_DONE, it means that either a
2959             // setOffset has been called or that previous ran off the text
2960             // string. the iterator would have been set to offset 0 if a
2961             // match is not found.
2962             search->isForwardSearching = TRUE;
2963             if (search->matchedIndex != USEARCH_DONE) {
2964                 // there's no need to set the collation element iterator
2965                 // the next call to next will set the offset.
2966                 return search->matchedIndex;
2967             }
2968         }
2969
2970         if (U_SUCCESS(*status)) {
2971             if (strsrch->pattern.CELength == 0) {
2972                 if (search->matchedIndex == USEARCH_DONE) {
2973                     search->matchedIndex = offset;
2974                 }
2975                 else { // moves by codepoints
2976                     UTF_FWD_1(search->text, search->matchedIndex, textlength);
2977                 }
2978
2979                 search->matchedLength = 0;
2980                 setColEIterOffset(strsrch->textIter, search->matchedIndex);
2981                 // status checked below
2982                 if (search->matchedIndex == textlength) {
2983                     search->matchedIndex = USEARCH_DONE;
2984                 }
2985             }
2986             else {
2987                 if (search->matchedLength > 0) {
2988                     // if matchlength is 0 we are at the start of the iteration
2989                     if (search->isOverlap) {
2990                         ucol_setOffset(strsrch->textIter, offset + 1, status);
2991                     }
2992                     else {
2993                         ucol_setOffset(strsrch->textIter,
2994                                        offset + search->matchedLength, status);
2995                     }
2996                 }
2997                 else {
2998                     // for boundary check purposes. this will ensure that the
2999                     // next match will not preceed the current offset
3000                     // note search->matchedIndex will always be set to something
3001                     // in the code
3002                     search->matchedIndex = offset - 1;
3003                 }
3004
3005                 if (search->isCanonicalMatch) {
3006                     // can't use exact here since extra accents are allowed.
3007                     usearch_handleNextCanonical(strsrch, status);
3008                 }
3009                 else {
3010                     usearch_handleNextExact(strsrch, status);
3011                 }
3012             }
3013
3014             if (U_FAILURE(*status)) {
3015                 return USEARCH_DONE;
3016             }
3017
3018             return search->matchedIndex;
3019         }
3020     }
3021     return USEARCH_DONE;
3022 }
3023
3024 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
3025                                               UErrorCode *status)
3026 {
3027     if (U_SUCCESS(*status) && strsrch) {
3028         int32_t offset;
3029         USearch *search = strsrch->search;
3030         if (search->reset) {
3031             offset                     = search->textLength;
3032             search->isForwardSearching = FALSE;
3033             search->reset              = FALSE;
3034             setColEIterOffset(strsrch->textIter, offset);
3035         }
3036         else {
3037             offset = usearch_getOffset(strsrch);
3038         }
3039
3040         int32_t matchedindex = search->matchedIndex;
3041         if (search->isForwardSearching == TRUE) {
3042             // switching direction.
3043             // if matchedIndex == USEARCH_DONE, it means that either a
3044             // setOffset has been called or that next ran off the text
3045             // string. the iterator would have been set to offset textLength if
3046             // a match is not found.
3047             search->isForwardSearching = FALSE;
3048             if (matchedindex != USEARCH_DONE) {
3049                 return matchedindex;
3050             }
3051         }
3052         else {
3053             if (offset == 0 || matchedindex == 0 ||
3054                 (!search->isOverlap &&
3055                     (offset < strsrch->pattern.defaultShiftSize ||
3056                     (matchedindex != USEARCH_DONE &&
3057                     matchedindex < strsrch->pattern.defaultShiftSize)))) {
3058                 // not enough characters to match
3059                 setMatchNotFound(strsrch);
3060                 return USEARCH_DONE;
3061             }
3062         }
3063
3064         if (U_SUCCESS(*status)) {
3065             if (strsrch->pattern.CELength == 0) {
3066                 search->matchedIndex =
3067                       (matchedindex == USEARCH_DONE ? offset : matchedindex);
3068                 if (search->matchedIndex == 0) {
3069                     setMatchNotFound(strsrch);
3070                     // status checked below
3071                 }
3072                 else { // move by codepoints
3073                     UTF_BACK_1(search->text, 0, search->matchedIndex);
3074                     setColEIterOffset(strsrch->textIter, search->matchedIndex);
3075                     // status checked below
3076                     search->matchedLength = 0;
3077                 }
3078             }
3079             else {
3080                 if (strsrch->search->isCanonicalMatch) {
3081                     // can't use exact here since extra accents are allowed.
3082                     usearch_handlePreviousCanonical(strsrch, status);
3083                     // status checked below
3084                 }
3085                 else {
3086                     usearch_handlePreviousExact(strsrch, status);
3087                     // status checked below
3088                 }
3089             }
3090
3091             if (U_FAILURE(*status)) {
3092                 return USEARCH_DONE;
3093             }
3094
3095             return search->matchedIndex;
3096         }
3097     }
3098     return USEARCH_DONE;
3099 }
3100
3101
3102
3103 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
3104 {
3105     /*
3106     reset is setting the attributes that are already in
3107     string search, hence all attributes in the collator should
3108     be retrieved without any problems
3109     */
3110     if (strsrch) {
3111         UErrorCode status            = U_ZERO_ERROR;
3112         UBool      sameCollAttribute = TRUE;
3113         uint32_t   ceMask;
3114         UBool      shift;
3115         uint32_t   varTop;
3116
3117         strsrch->strength    = ucol_getStrength(strsrch->collator);
3118         ceMask = getMask(strsrch->strength);
3119         if (strsrch->ceMask != ceMask) {
3120             strsrch->ceMask = ceMask;
3121             sameCollAttribute = FALSE;
3122         }
3123         // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3124         shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
3125                                   &status) == UCOL_SHIFTED;
3126         if (strsrch->toShift != shift) {
3127             strsrch->toShift  = shift;
3128             sameCollAttribute = FALSE;
3129         }
3130
3131         // if status is a failure, ucol_getVariableTop returns 0
3132         varTop = ucol_getVariableTop(strsrch->collator, &status);
3133         if (strsrch->variableTop != varTop) {
3134             strsrch->variableTop = varTop;
3135             sameCollAttribute    = FALSE;
3136         }
3137         if (!sameCollAttribute) {
3138             initialize(strsrch, &status);
3139         }
3140         uprv_init_collIterate(strsrch->collator, strsrch->search->text,
3141                               strsrch->search->textLength,
3142                               &(strsrch->textIter->iteratordata_));
3143         strsrch->search->matchedLength      = 0;
3144         strsrch->search->matchedIndex       = USEARCH_DONE;
3145         strsrch->search->isOverlap          = FALSE;
3146         strsrch->search->isCanonicalMatch   = FALSE;
3147         strsrch->search->isForwardSearching = TRUE;
3148         strsrch->search->reset              = TRUE;
3149     }
3150 }
3151
3152 // internal use methods declared in usrchimp.h -----------------------------
3153
3154 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
3155 {
3156     if (U_FAILURE(*status)) {
3157         setMatchNotFound(strsrch);
3158         return FALSE;
3159     }
3160
3161     UCollationElements *coleiter        = strsrch->textIter;
3162     int32_t             textlength      = strsrch->search->textLength;
3163     int32_t            *patternce       = strsrch->pattern.CE;
3164     int32_t             patterncelength = strsrch->pattern.CELength;
3165     int32_t             textoffset      = ucol_getOffset(coleiter);
3166
3167     // status used in setting coleiter offset, since offset is checked in
3168     // shiftForward before setting the coleiter offset, status never
3169     // a failure
3170     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
3171                               patterncelength);
3172     while (textoffset <= textlength)
3173     {
3174         uint32_t    patternceindex = patterncelength - 1;
3175         int32_t     targetce;
3176         UBool       found          = FALSE;
3177         int32_t    lastce          = UCOL_NULLORDER;
3178
3179         setColEIterOffset(coleiter, textoffset);
3180
3181         while (TRUE) {
3182             // finding the last pattern ce match, imagine composite characters
3183             // for example: search for pattern A in text \u00C0
3184             // we'll have to skip \u0300 the grave first before we get to A
3185             targetce = ucol_previous(coleiter, status);
3186             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3187                 found = FALSE;
3188                 break;
3189             }
3190             targetce = getCE(strsrch, targetce);
3191             if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
3192                 // this is for the text \u0315\u0300 that requires
3193                 // normalization and pattern \u0300, where \u0315 is ignorable
3194                 continue;
3195             }
3196             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
3197                 lastce = targetce;
3198             }
3199             if (targetce == patternce[patternceindex]) {
3200                 // the first ce can be a contraction
3201                 found = TRUE;
3202                 break;
3203             }
3204             if (!hasExpansion(coleiter)) {
3205                 found = FALSE;
3206                 break;
3207             }
3208         }
3209
3210         targetce = lastce;
3211
3212         while (found && patternceindex > 0) {
3213             targetce    = ucol_previous(coleiter, status);
3214             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3215                 found = FALSE;
3216                 break;
3217             }
3218             targetce    = getCE(strsrch, targetce);
3219             if (targetce == UCOL_IGNORABLE) {
3220                 continue;
3221             }
3222
3223             patternceindex --;
3224             found = found && targetce == patternce[patternceindex];
3225         }
3226
3227         if (!found) {
3228             if (U_FAILURE(*status)) {
3229                 break;
3230             }
3231             textoffset = shiftForward(strsrch, textoffset, lastce,
3232                                       patternceindex);
3233             // status checked at loop.
3234             patternceindex = patterncelength;
3235             continue;
3236         }
3237
3238         if (checkNextExactMatch(strsrch, &textoffset, status)) {
3239             // status checked in ucol_setOffset
3240             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
3241             return TRUE;
3242         }
3243     }
3244     setMatchNotFound(strsrch);
3245     return FALSE;
3246 }
3247
3248 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
3249 {
3250     if (U_FAILURE(*status)) {
3251         setMatchNotFound(strsrch);
3252         return FALSE;
3253     }
3254
3255     UCollationElements *coleiter        = strsrch->textIter;
3256     int32_t             textlength      = strsrch->search->textLength;
3257     int32_t            *patternce       = strsrch->pattern.CE;
3258     int32_t             patterncelength = strsrch->pattern.CELength;
3259     int32_t             textoffset      = ucol_getOffset(coleiter);
3260     UBool               hasPatternAccents =
3261        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
3262
3263     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
3264                               patterncelength);
3265     strsrch->canonicalPrefixAccents[0] = 0;
3266     strsrch->canonicalSuffixAccents[0] = 0;
3267
3268     while (textoffset <= textlength)
3269     {
3270         int32_t     patternceindex = patterncelength - 1;
3271         int32_t     targetce;
3272         UBool       found          = FALSE;
3273         int32_t     lastce         = UCOL_NULLORDER;
3274
3275         setColEIterOffset(coleiter, textoffset);
3276
3277         for (;;) {
3278             // finding the last pattern ce match, imagine composite characters
3279             // for example: search for pattern A in text \u00C0
3280             // we'll have to skip \u0300 the grave first before we get to A
3281             targetce = ucol_previous(coleiter, status);
3282             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3283                 found = FALSE;
3284                 break;
3285             }
3286             targetce = getCE(strsrch, targetce);
3287             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
3288                 lastce = targetce;
3289             }
3290             if (targetce == patternce[patternceindex]) {
3291                 // the first ce can be a contraction
3292                 found = TRUE;
3293                 break;
3294             }
3295             if (!hasExpansion(coleiter)) {
3296                 found = FALSE;
3297                 break;
3298             }
3299         }
3300
3301         while (found && patternceindex > 0) {
3302             targetce    = ucol_previous(coleiter, status);
3303             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3304                 found = FALSE;
3305                 break;
3306             }
3307             targetce    = getCE(strsrch, targetce);
3308             if (targetce == UCOL_IGNORABLE) {
3309                 continue;
3310             }
3311
3312             patternceindex --;
3313             found = found && targetce == patternce[patternceindex];
3314         }
3315
3316         // initializing the rearranged accent array
3317         if (hasPatternAccents && !found) {
3318             strsrch->canonicalPrefixAccents[0] = 0;
3319             strsrch->canonicalSuffixAccents[0] = 0;
3320             if (U_FAILURE(*status)) {
3321                 break;
3322             }
3323             found = doNextCanonicalMatch(strsrch, textoffset, status);
3324         }
3325
3326         if (!found) {
3327             if (U_FAILURE(*status)) {
3328                 break;
3329             }
3330             textoffset = shiftForward(strsrch, textoffset, lastce,
3331                                       patternceindex);
3332             // status checked at loop
3333             patternceindex = patterncelength;
3334             continue;
3335         }
3336
3337         if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
3338             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
3339             return TRUE;
3340         }
3341     }
3342     setMatchNotFound(strsrch);
3343     return FALSE;
3344 }
3345
3346 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
3347 {
3348     if (U_FAILURE(*status)) {
3349         setMatchNotFound(strsrch);
3350         return FALSE;
3351     }
3352
3353     UCollationElements *coleiter        = strsrch->textIter;
3354     int32_t            *patternce       = strsrch->pattern.CE;
3355     int32_t             patterncelength = strsrch->pattern.CELength;
3356     int32_t             textoffset      = ucol_getOffset(coleiter);
3357
3358     // shifting it check for setting offset
3359     // if setOffset is called previously or there was no previous match, we
3360     // leave the offset as it is.
3361     if (strsrch->search->matchedIndex != USEARCH_DONE) {
3362         textoffset = strsrch->search->matchedIndex;
3363     }
3364
3365     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
3366                               patterncelength);
3367
3368     while (textoffset >= 0)
3369     {
3370         int32_t     patternceindex = 1;
3371         int32_t     targetce;
3372         UBool       found          = FALSE;
3373         int32_t     firstce        = UCOL_NULLORDER;
3374
3375         // if status is a failure, ucol_setOffset does nothing
3376         setColEIterOffset(coleiter, textoffset);
3377
3378         for (;;) {
3379             // finding the first pattern ce match, imagine composite
3380             // characters. for example: search for pattern \u0300 in text
3381             // \u00C0, we'll have to skip A first before we get to
3382             // \u0300 the grave accent
3383             targetce = ucol_next(coleiter, status);
3384             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3385                 found = FALSE;
3386                 break;
3387             }
3388             targetce = getCE(strsrch, targetce);
3389             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
3390                 firstce = targetce;
3391             }
3392             if (targetce == UCOL_IGNORABLE) {
3393                 continue;
3394             }
3395             if (targetce == patternce[0]) {
3396                 found = TRUE;
3397                 break;
3398             }
3399             if (!hasExpansion(coleiter)) {
3400                 // checking for accents in composite character
3401                 found = FALSE;
3402                 break;
3403             }
3404         }
3405
3406         targetce = firstce;
3407
3408         while (found && (patternceindex < patterncelength)) {
3409             targetce    = ucol_next(coleiter, status);
3410             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3411                 found = FALSE;
3412                 break;
3413             }
3414             targetce    = getCE(strsrch, targetce);
3415             if (targetce == UCOL_IGNORABLE) {
3416                 continue;
3417             }
3418
3419             found = found && targetce == patternce[patternceindex];
3420             patternceindex ++;
3421         }
3422
3423         if (!found) {
3424             if (U_FAILURE(*status)) {
3425                 break;
3426             }
3427             textoffset = reverseShift(strsrch, textoffset, targetce,
3428                                       patternceindex);
3429             patternceindex = 0;
3430             continue;
3431         }
3432
3433         if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
3434             setColEIterOffset(coleiter, textoffset);
3435             return TRUE;
3436         }
3437     }
3438     setMatchNotFound(strsrch);
3439     return FALSE;
3440 }
3441
3442 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
3443                                       UErrorCode    *status)
3444 {
3445     if (U_FAILURE(*status)) {
3446         setMatchNotFound(strsrch);
3447         return FALSE;
3448     }
3449
3450     UCollationElements *coleiter        = strsrch->textIter;
3451     int32_t            *patternce       = strsrch->pattern.CE;
3452     int32_t             patterncelength = strsrch->pattern.CELength;
3453     int32_t             textoffset      = ucol_getOffset(coleiter);
3454     UBool               hasPatternAccents =
3455        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
3456
3457     // shifting it check for setting offset
3458     // if setOffset is called previously or there was no previous match, we
3459     // leave the offset as it is.
3460     if (strsrch->search->matchedIndex != USEARCH_DONE) {
3461         textoffset = strsrch->search->matchedIndex;
3462     }
3463
3464     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
3465                               patterncelength);
3466     strsrch->canonicalPrefixAccents[0] = 0;
3467     strsrch->canonicalSuffixAccents[0] = 0;
3468
3469     while (textoffset >= 0)
3470     {
3471         int32_t     patternceindex = 1;
3472         int32_t     targetce;
3473         UBool       found          = FALSE;
3474         int32_t     firstce        = UCOL_NULLORDER;
3475
3476         setColEIterOffset(coleiter, textoffset);
3477         while (TRUE) {
3478             // finding the first pattern ce match, imagine composite
3479             // characters. for example: search for pattern \u0300 in text
3480             // \u00C0, we'll have to skip A first before we get to
3481             // \u0300 the grave accent
3482             targetce = ucol_next(coleiter, status);
3483             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3484                 found = FALSE;
3485                 break;
3486             }
3487             targetce = getCE(strsrch, targetce);
3488             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
3489                 firstce = targetce;
3490             }
3491
3492             if (targetce == patternce[0]) {
3493                 // the first ce can be a contraction
3494                 found = TRUE;
3495                 break;
3496             }
3497             if (!hasExpansion(coleiter)) {
3498                 // checking for accents in composite character
3499                 found = FALSE;
3500                 break;
3501             }
3502         }
3503
3504         targetce = firstce;
3505
3506         while (found && patternceindex < patterncelength) {
3507             targetce    = ucol_next(coleiter, status);
3508             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3509                 found = FALSE;
3510                 break;
3511             }
3512             targetce = getCE(strsrch, targetce);
3513             if (targetce == UCOL_IGNORABLE) {
3514                 continue;
3515             }
3516
3517             found = found && targetce == patternce[patternceindex];
3518             patternceindex ++;
3519         }
3520
3521         // initializing the rearranged accent array
3522         if (hasPatternAccents && !found) {
3523             strsrch->canonicalPrefixAccents[0] = 0;
3524             strsrch->canonicalSuffixAccents[0] = 0;
3525             if (U_FAILURE(*status)) {
3526                 break;
3527             }
3528             found = doPreviousCanonicalMatch(strsrch, textoffset, status);
3529         }
3530
3531         if (!found) {
3532             if (U_FAILURE(*status)) {
3533                 break;
3534             }
3535             textoffset = reverseShift(strsrch, textoffset, targetce,
3536                                       patternceindex);
3537             patternceindex = 0;
3538             continue;
3539         }
3540
3541         if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
3542             setColEIterOffset(coleiter, textoffset);
3543             return TRUE;
3544         }
3545     }
3546     setMatchNotFound(strsrch);
3547     return FALSE;
3548 }
3549
3550 #endif /* #if !UCONFIG_NO_COLLATION */