icuSources/i18n/usearch.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2001-2003 IBM and others. All rights reserved.
   4 **********************************************************************
   5 *   Date        Name        Description
   6 *  07/02/2001   synwee      Creation.
   7 **********************************************************************
   8 */
   9
  10 #include "unicode/utypes.h"
  11
  12 #if !UCONFIG_NO_COLLATION
  13
  14 #include "unicode/usearch.h"
  15 #include "unicode/ustring.h"
  16 #include "unicode/uchar.h"
  17 #include "unormimp.h"
  18 #include "ucol_imp.h"
  19 #include "usrchimp.h"
  20 #include "cmemory.h"
  21
  22 // internal definition ---------------------------------------------------
  23
  24 #define LAST_BYTE_MASK_          0xFF
  25 #define SECOND_LAST_BYTE_SHIFT_  8
  26 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
  27
  28 static const uint16_t *FCD_ = NULL;
  29
  30 // internal methods -------------------------------------------------
  31
  32 /**
  33 * Fast collation element iterator setOffset.
  34 * This function does not check for bounds.
  35 * @param coleiter collation element iterator
  36 * @param offset to set
  37 */
  38 static
  39 inline void setColEIterOffset(UCollationElements *elems,
  40                       int32_t             offset)
  41 {
  42         collIterate *ci = &(elems->iteratordata_);
  43         ci->pos         = ci->string + offset;
  44         ci->CEpos       = ci->toReturn = ci->CEs;
  45         if (ci->flags & UCOL_ITER_INNORMBUF) {
  46                 ci->flags = ci->origFlags;
  47         }
  48         ci->fcdPosition = NULL;
  49 }
  50
  51 /**
  52 * Getting the mask for collation strength
  53 * @param strength collation strength
  54 * @return collation element mask
  55 */
  56 static
  57 inline uint32_t getMask(UCollationStrength strength)
  58 {
  59     switch (strength)
  60     {
  61     case UCOL_PRIMARY:
  62         return UCOL_PRIMARYORDERMASK;
  63     case UCOL_SECONDARY:
  64         return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
  65     default:
  66         return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
  67                UCOL_PRIMARYORDERMASK;
  68     }
  69 }
  70
  71 /**
  72 * This is to squeeze the 21bit ces into a 256 table
  73 * @param ce collation element
  74 * @return collapsed version of the collation element
  75 */
  76 static
  77 inline int hash(uint32_t ce)
  78 {
  79     // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
  80     // well with the new collation where most of the latin 1 characters
  81     // are of the value xx000xxx. their hashes will most of the time be 0
  82     // to be discussed on the hash algo.
  83     return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_;
  84 }
  85
  86 /**
  87 * Initializing the fcd tables.
  88 * Internal method, status assumed to be a success.
  89 * @param status output error if any, caller to check status before calling
  90 *               method, status assumed to be success when passed in.
  91 */
  92 static
  93 inline void initializeFCD(UErrorCode *status)
  94 {
  95     if (FCD_ == NULL) {
  96         FCD_ = unorm_getFCDTrie(status);
  97     }
  98 }
  99
 100 /**
 101 * Gets the fcd value for a character at the argument index.
 102 * This method takes into accounts of the supplementary characters.
 103 * @param str UTF16 string where character for fcd retrieval resides
 104 * @param offset position of the character whose fcd is to be retrieved, to be
 105 *               overwritten with the next character position, taking
 106 *               surrogate characters into consideration.
 107 * @param strlength length of the argument string
 108 * @return fcd value
 109 */
 110 static
 111 inline uint16_t getFCD(const UChar   *str, int32_t *offset,
 112                              int32_t  strlength)
 113 {
 114     int32_t temp = *offset;
 115     uint16_t    result;
 116     UChar       ch   = str[temp];
 117     result = unorm_getFCD16(FCD_, ch);
 118     temp ++;
 119
 120     if (result && temp != strlength && UTF_IS_FIRST_SURROGATE(ch)) {
 121         ch = str[temp];
 122         if (UTF_IS_SECOND_SURROGATE(ch)) {
 123             result = unorm_getFCD16FromSurrogatePair(FCD_, result, ch);
 124             temp ++;
 125         } else {
 126             result = 0;
 127         }
 128     }
 129     *offset = temp;
 130     return result;
 131 }
 132
 133 /**
 134 * Getting the modified collation elements taking into account the collation
 135 * attributes
 136 * @param strsrch string search data
 137 * @param sourcece
 138 * @return the modified collation element
 139 */
 140 static
 141 inline uint32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
 142 {
 143     // note for tertiary we can't use the collator->tertiaryMask, that
 144     // is a preprocessed mask that takes into account case options. since
 145     // we are only concerned with exact matches, we don't need that.
 146     sourcece &= strsrch->ceMask;
 147
 148     if (strsrch->toShift) {
 149         // alternate handling here, since only the 16 most significant digits
 150         // is only used, we can safely do a compare without masking
 151         // if the ce is a variable, we mask and get only the primary values
 152         // no shifting to quartenary is required since all primary values
 153         // less than variabletop will need to be masked off anyway.
 154         if (strsrch->variableTop > sourcece) {
 155             if (strsrch->strength == UCOL_QUATERNARY) {
 156                 sourcece &= UCOL_PRIMARYORDERMASK;
 157             }
 158             else {
 159                 sourcece = UCOL_IGNORABLE;
 160             }
 161         }
 162     }
 163
 164     return sourcece;
 165 }
 166
 167 /**
 168 * Allocate a memory and returns NULL if it failed.
 169 * Internal method, status assumed to be a success.
 170 * @param size to allocate
 171 * @param status output error if any, caller to check status before calling
 172 *               method, status assumed to be success when passed in.
 173 * @return newly allocated array, NULL otherwise
 174 */
 175 static
 176 inline void * allocateMemory(uint32_t size, UErrorCode *status)
 177 {
 178     uint32_t *result = (uint32_t *)uprv_malloc(size);
 179     if (result == NULL) {
 180         *status = U_MEMORY_ALLOCATION_ERROR;
 181     }
 182     return result;
 183 }
 184
 185 /**
 186 * Adds a uint32_t value to a destination array.
 187 * Creates a new array if we run out of space. The caller will have to
 188 * manually deallocate the newly allocated array.
 189 * Internal method, status assumed to be success, caller has to check status
 190 * before calling this method. destination not to be NULL and has at least
 191 * size destinationlength.
 192 * @param destination target array
 193 * @param offset destination offset to add value
 194 * @param destinationlength target array size, return value for the new size
 195 * @param value to be added
 196 * @param increments incremental size expected
 197 * @param status output error if any, caller to check status before calling
 198 *               method, status assumed to be success when passed in.
 199 * @return new destination array, destination if there was no new allocation
 200 */
 201 static
 202 inline uint32_t * addTouint32_tArray(uint32_t   *destination,
 203                                      uint32_t    offset,
 204                                      uint32_t   *destinationlength,
 205                                      uint32_t    value,
 206                                      uint32_t    increments,
 207                                      UErrorCode *status)
 208 {
 209     uint32_t newlength = *destinationlength;
 210     if (offset + 1 == newlength) {
 211         newlength += increments;
 212         uint32_t *temp = (uint32_t *)allocateMemory(
 213                                          sizeof(uint32_t) * newlength, status);
 214         if (U_FAILURE(*status)) {
 215             return NULL;
 216         }
 217         uprv_memcpy(temp, destination, sizeof(uint32_t) * offset);
 218         *destinationlength = newlength;
 219         destination        = temp;
 220     }
 221     destination[offset] = value;
 222     return destination;
 223 }
 224
 225 /**
 226 * Initializing the ce table for a pattern.
 227 * Stores non-ignorable collation keys.
 228 * Table size will be estimated by the size of the pattern text. Table
 229 * expansion will be perform as we go along. Adding 1 to ensure that the table
 230 * size definitely increases.
 231 * Internal method, status assumed to be a success.
 232 * @param strsrch string search data
 233 * @param status output error if any, caller to check status before calling
 234 *               method, status assumed to be success when passed in.
 235 * @return total number of expansions
 236 */
 237 static
 238 inline uint16_t initializePatternCETable(UStringSearch *strsrch,
 239                                          UErrorCode    *status)
 240 {
 241     UPattern *pattern            = &(strsrch->pattern);
 242     uint32_t  cetablesize        = INITIAL_ARRAY_SIZE_;
 243     uint32_t *cetable            = pattern->CEBuffer;
 244     uint32_t  patternlength      = pattern->textLength;
 245     UCollationElements *coleiter = strsrch->utilIter;
 246
 247     if (coleiter == NULL) {
 248         coleiter = ucol_openElements(strsrch->collator, pattern->text,
 249                                      patternlength, status);
 250         // status will be checked in ucol_next(..) later and if it is an
 251         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
 252         // returned.
 253         strsrch->utilIter = coleiter;
 254     }
 255     else {
 256         uprv_init_collIterate(strsrch->collator, pattern->text,
 257                          pattern->textLength,
 258                          &coleiter->iteratordata_);
 259     }
 260
 261     if (pattern->CE != cetable && pattern->CE) {
 262         uprv_free(pattern->CE);
 263     }
 264
 265     uint16_t  offset      = 0;
 266     uint16_t  result      = 0;
 267     uint32_t  ce;
 268
 269     while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
 270            U_SUCCESS(*status)) {
 271         uint32_t newce = getCE(strsrch, ce);
 272         if (newce) {
 273             uint32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
 274                                   newce,
 275                                   patternlength - ucol_getOffset(coleiter) + 1,
 276                                   status);
 277             if (U_FAILURE(*status)) {
 278                 return 0;
 279             }
 280             offset ++;
 281             if (cetable != temp && cetable != pattern->CEBuffer) {
 282                 uprv_free(cetable);
 283             }
 284             cetable = temp;
 285         }
 286         result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
 287     }
 288
 289     cetable[offset]   = 0;
 290     pattern->CE       = cetable;
 291     pattern->CELength = offset;
 292
 293     return result;
 294 }
 295
 296 /**
 297 * Initializes the pattern struct.
 298 * Internal method, status assumed to be success.
 299 * @param strsrch UStringSearch data storage
 300 * @param status output error if any, caller to check status before calling
 301 *               method, status assumed to be success when passed in.
 302 * @return expansionsize the total expansion size of the pattern
 303 */
 304 static
 305 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
 306 {
 307           UPattern   *pattern     = &(strsrch->pattern);
 308     const UChar      *patterntext = pattern->text;
 309           int32_t     length      = pattern->textLength;
 310           int32_t index       = 0;
 311
 312     pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
 313                                                      SECOND_LAST_BYTE_SHIFT_;
 314     index = length;
 315     UTF_BACK_1(patterntext, 0, index);
 316     pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
 317                                                              LAST_BYTE_MASK_;
 318     // since intializePattern is an internal method status is a success.
 319     return initializePatternCETable(strsrch, status);
 320 }
 321
 322 /**
 323 * Initializing shift tables, with the default values.
 324 * If a corresponding default value is 0, the shift table is not set.
 325 * @param shift table for forwards shift
 326 * @param backshift table for backwards shift
 327 * @param cetable table containing pattern ce
 328 * @param cesize size of the pattern ces
 329 * @param expansionsize total size of the expansions
 330 * @param defaultforward the default forward value
 331 * @param defaultbackward the default backward value
 332 */
 333 static
 334 inline void setShiftTable(int16_t   shift[], int16_t backshift[],
 335                           uint32_t *cetable, int32_t cesize,
 336                           int16_t   expansionsize,
 337                           int16_t   defaultforward,
 338                           int16_t   defaultbackward)
 339 {
 340     // estimate the value to shift. to do that we estimate the smallest
 341     // number of characters to give the relevant ces, ie approximately
 342     // the number of ces minus their expansion, since expansions can come
 343     // from a character.
 344     int32_t count;
 345     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
 346         shift[count] = defaultforward;
 347     }
 348     cesize --; // down to the last index
 349     for (count = 0; count < cesize; count ++) {
 350         // number of ces from right of array to the count
 351         int temp = defaultforward - count - 1;
 352         shift[hash(cetable[count])] = temp > 1 ? temp : 1;
 353     }
 354     shift[hash(cetable[cesize])] = 1;
 355     // for ignorables we just shift by one. see test examples.
 356     shift[hash(0)] = 1;
 357
 358     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
 359         backshift[count] = defaultbackward;
 360     }
 361     for (count = cesize; count > 0; count --) {
 362         // the original value count does not seem to work
 363         backshift[hash(cetable[count])] = count > expansionsize ?
 364                                           (int16_t)(count - expansionsize) : 1;
 365     }
 366     backshift[hash(cetable[0])] = 1;
 367     backshift[hash(0)] = 1;
 368 }
 369
 370 /**
 371 * Building of the pattern collation element list and the boyer moore strsrch
 372 * table.
 373 * The canonical match will only be performed after the default match fails.
 374 * For both cases we need to remember the size of the composed and decomposed
 375 * versions of the string. Since the Boyer-Moore shift calculations shifts by
 376 * a number of characters in the text and tries to match the pattern from that
 377 * offset, the shift value can not be too large in case we miss some
 378 * characters. To choose a right shift size, we estimate the NFC form of the
 379 * and use its size as a shift guide. The NFC form should be the small
 380 * possible representation of the pattern. Anyways, we'll err on the smaller
 381 * shift size. Hence the calculation for minlength.
 382 * Canonical match will be performed slightly differently. We'll split the
 383 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
 384 * the first and last base character (MS), the ending accents (EA). Matches
 385 * will be done on MS first, and only when we match MS then some processing
 386 * will be required for the prefix and end accents in order to determine if
 387 * they match PA and EA. Hence the default shift values
 388 * for the canonical match will take the size of either end's accent into
 389 * consideration. Forwards search will take the end accents into consideration
 390 * for the default shift values and the backwards search will take the prefix
 391 * accents into consideration.
 392 * If pattern has no non-ignorable ce, we return a illegal argument error.
 393 * Internal method, status assumed to be success.
 394 * @param strsrch UStringSearch data storage
 395 * @param status  for output errors if it occurs, status is assumed to be a
 396 *                success when it is passed in.
 397 */
 398 static
 399 inline void initialize(UStringSearch *strsrch, UErrorCode *status)
 400 {
 401     int16_t expandlength  = initializePattern(strsrch, status);
 402     if (U_SUCCESS(*status) && strsrch->pattern.CELength > 0) {
 403         UPattern *pattern = &strsrch->pattern;
 404         int32_t   cesize  = pattern->CELength;
 405
 406         int16_t minlength = cesize > expandlength
 407                                         ? (int16_t)cesize - expandlength : 1;
 408         pattern->defaultShiftSize    = minlength;
 409         setShiftTable(pattern->shift, pattern->backShift, pattern->CE,
 410                       cesize, expandlength, minlength, minlength);
 411         return;
 412     }
 413     strsrch->pattern.defaultShiftSize = 0;
 414 }
 415
 416 /**
 417 * Determine whether the target text in UStringSearch bounded by the offset
 418 * start and end is one or more whole units of text as
 419 * determined by the breakiterator in UStringSearch.
 420 * @param strsrch string search data
 421 * @param start target text start offset
 422 * @param end target text end offset
 423 */
 424 static
 425 inline UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
 426                                int32_t    end)
 427 {
 428 #if !UCONFIG_NO_BREAK_ITERATION
 429     UBreakIterator *breakiterator = strsrch->search->breakIter;
 430     if (breakiterator) {
 431         int32_t startindex = ubrk_first(breakiterator);
 432         int32_t endindex   = ubrk_last(breakiterator);
 433
 434         // out-of-range indexes are never boundary positions
 435         if (start < startindex || start > endindex ||
 436             end < startindex || end > endindex) {
 437             return FALSE;
 438         }
 439         // otherwise, we can use following() on the position before the
 440         // specified one and return true of the position we get back is the
 441         // one the user specified
 442         UBool result = (start == startindex ||
 443                 ubrk_following(breakiterator, start - 1) == start) &&
 444                (end == endindex ||
 445                 ubrk_following(breakiterator, end - 1) == end);
 446         if (result) {
 447             // iterates the individual ces
 448                   UCollationElements *coleiter  = strsrch->utilIter;
 449             const UChar              *text      = strsrch->search->text +
 450                                                                       start;
 451                   UErrorCode          status    = U_ZERO_ERROR;
 452             ucol_setText(coleiter, text, end - start, &status);
 453             for (int32_t count = 0; count < strsrch->pattern.CELength;
 454                  count ++) {
 455                 uint32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
 456                 if (ce == UCOL_IGNORABLE) {
 457                     count --;
 458                     continue;
 459                 }
 460                 if (U_FAILURE(status) || ce != strsrch->pattern.CE[count]) {
 461                     return FALSE;
 462                 }
 463             }
 464             uint32_t nextce = ucol_next(coleiter, &status);
 465             while (ucol_getOffset(coleiter) == (end - start)
 466                    && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
 467                 nextce = ucol_next(coleiter, &status);
 468             }
 469             if (ucol_getOffset(coleiter) == (end - start)
 470                 && nextce != UCOL_NULLORDER) {
 471                 // extra collation elements at the end of the match
 472                 return FALSE;
 473             }
 474         }
 475         return result;
 476     }
 477 #endif
 478     return TRUE;
 479 }
 480
 481 /**
 482 * Getting the next base character offset if current offset is an accent,
 483 * or the current offset if the current character contains a base character.
 484 * accents the following base character will be returned
 485 * @param text string
 486 * @param textoffset current offset
 487 * @param textlength length of text string
 488 * @return the next base character or the current offset
 489 *         if the current character is contains a base character.
 490 */
 491 static
 492 inline int32_t getNextBaseOffset(const UChar       *text,
 493                                            int32_t  textoffset,
 494                                            int32_t      textlength)
 495 {
 496     if (textoffset < textlength) {
 497         int32_t temp = textoffset;
 498         if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
 499             while (temp < textlength) {
 500                 int32_t result = temp;
 501                 if ((getFCD(text, &temp, textlength) >>
 502                      SECOND_LAST_BYTE_SHIFT_) == 0) {
 503                     return result;
 504                 }
 505             }
 506             return textlength;
 507         }
 508     }
 509     return textoffset;
 510 }
 511
 512 /**
 513 * Gets the next base character offset depending on the string search pattern
 514 * data
 515 * @param strsrch string search data
 516 * @param textoffset current offset, one offset away from the last character
 517 *                   to search for.
 518 * @return start index of the next base character or the current offset
 519 *         if the current character is contains a base character.
 520 */
 521 static
 522 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
 523                                                   int32_t    textoffset)
 524 {
 525         int32_t textlength = strsrch->search->textLength;
 526     if (strsrch->pattern.hasSuffixAccents &&
 527         textoffset < textlength) {
 528               int32_t  temp       = textoffset;
 529         const UChar       *text       = strsrch->search->text;
 530         UTF_BACK_1(text, 0, temp);
 531         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
 532             return getNextBaseOffset(text, textoffset, textlength);
 533         }
 534     }
 535     return textoffset;
 536 }
 537
 538 /**
 539 * Shifting the collation element iterator position forward to prepare for
 540 * a following match. If the last character is a unsafe character, we'll only
 541 * shift by 1 to capture contractions, normalization etc.
 542 * Internal method, status assumed to be success.
 543 * @param text strsrch string search data
 544 * @param textoffset start text position to do search
 545 * @param ce the text ce which failed the match.
 546 * @param patternceindex index of the ce within the pattern ce buffer which
 547 *        failed the match
 548 * @return final offset
 549 */
 550 static
 551 inline int32_t shiftForward(UStringSearch *strsrch,
 552                                 int32_t    textoffset,
 553                                 uint32_t       ce,
 554                                 int32_t        patternceindex)
 555 {
 556         UPattern *pattern = &(strsrch->pattern);
 557     if (ce != UCOL_NULLORDER) {
 558         int32_t shift = pattern->shift[hash(ce)];
 559         // this is to adjust for characters in the middle of the
 560         // substring for matching that failed.
 561         int32_t adjust = pattern->CELength - patternceindex;
 562         if (adjust > 1 && shift >= adjust) {
 563             shift -= adjust - 1;
 564         }
 565         textoffset += shift;
 566     }
 567     else {
 568         textoffset += pattern->defaultShiftSize;
 569     }
 570
 571     textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
 572     // check for unsafe characters
 573     // * if it is the start or middle of a contraction: to be done after
 574     //   a initial match is found
 575     // * thai or lao base consonant character: similar to contraction
 576     // * high surrogate character: similar to contraction
 577     // * next character is a accent: shift to the next base character
 578     return textoffset;
 579 }
 580
 581 /**
 582 * sets match not found
 583 * @param strsrch string search data
 584 */
 585 static
 586 inline void setMatchNotFound(UStringSearch *strsrch)
 587 {
 588     // this method resets the match result regardless of the error status.
 589     strsrch->search->matchedIndex = USEARCH_DONE;
 590     strsrch->search->matchedLength = 0;
 591     if (strsrch->search->isForwardSearching) {
 592         setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
 593     }
 594     else {
 595         setColEIterOffset(strsrch->textIter, 0);
 596     }
 597 }
 598
 599 /**
 600 * Gets the offset to the next safe point in text.
 601 * ie. not the middle of a contraction, swappable characters or supplementary
 602 * characters.
 603 * @param collator collation sata
 604 * @param text string to work with
 605 * @param textoffset offset in string
 606 * @param textlength length of text string
 607 * @return offset to the next safe character
 608 */
 609 static
 610 inline int32_t getNextSafeOffset(const UCollator   *collator,
 611                                      const UChar       *text,
 612                                            int32_t  textoffset,
 613                                            int32_t      textlength)
 614 {
 615     int32_t result = textoffset; // first contraction character
 616     while (result != textlength && ucol_unsafeCP(text[result], collator)) {
 617         result ++;
 618     }
 619     return result;
 620 }
 621
 622 /**
 623 * This checks for accents in the potential match started with a .
 624 * composite character.
 625 * This is really painful... we have to check that composite character do not
 626 * have any extra accents. We have to normalize the potential match and find
 627 * the immediate decomposed character before the match.
 628 * The first composite character would have been taken care of by the fcd
 629 * checks in checkForwardExactMatch.
 630 * This is the slow path after the fcd of the first character and
 631 * the last character has been checked by checkForwardExactMatch and we
 632 * determine that the potential match has extra non-ignorable preceding
 633 * ces.
 634 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
 635 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
 636 * Note here that accents checking are slow and cautioned in the API docs.
 637 * Internal method, status assumed to be a success, caller should check status
 638 * before calling this method
 639 * @param strsrch string search data
 640 * @param start index of the potential unfriendly composite character
 641 * @param end index of the potential unfriendly composite character
 642 * @param status output error status if any.
 643 * @return TRUE if there is non-ignorable accents before at the beginning
 644 *              of the match, FALSE otherwise.
 645 */
 646
 647 static
 648 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
 649                                    int32_t    end,
 650                                    UErrorCode    *status)
 651 {
 652     UBool result = FALSE;
 653     if (strsrch->pattern.hasPrefixAccents) {
 654               int32_t  length = end - start;
 655               int32_t  offset = 0;
 656         const UChar       *text   = strsrch->search->text + start;
 657
 658         UTF_FWD_1(text, offset, length);
 659         // we are only concerned with the first composite character
 660         if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
 661             int32_t safeoffset = getNextSafeOffset(strsrch->collator,
 662                                                        text, 0, length);
 663             if (safeoffset != length) {
 664                 safeoffset ++;
 665             }
 666             UChar   *norm = NULL;
 667             UChar    buffer[INITIAL_ARRAY_SIZE_];
 668             int32_t  size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
 669                                             buffer, INITIAL_ARRAY_SIZE_,
 670                                             status);
 671             if (U_FAILURE(*status)) {
 672                 return FALSE;
 673             }
 674             if (size >= INITIAL_ARRAY_SIZE_) {
 675                 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
 676                                                status);
 677                 // if allocation failed, status will be set to
 678                 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
 679                 // checks for it.
 680                 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
 681                                        size, status);
 682                 if (U_FAILURE(*status) && norm != NULL) {
 683                     uprv_free(norm);
 684                     return FALSE;
 685                 }
 686             }
 687             else {
 688                 norm = buffer;
 689             }
 690
 691             UCollationElements *coleiter  = strsrch->utilIter;
 692             ucol_setText(coleiter, norm, size, status);
 693             uint32_t            firstce   = strsrch->pattern.CE[0];
 694             UBool               ignorable = TRUE;
 695             uint32_t            ce        = UCOL_IGNORABLE;
 696             while (U_SUCCESS(*status) && ce != firstce) {
 697                 offset = ucol_getOffset(coleiter);
 698                 if (ce != firstce && ce != UCOL_IGNORABLE) {
 699                     ignorable = FALSE;
 700                 }
 701                 ce = ucol_next(coleiter, status);
 702             }
 703             UChar32 codepoint;
 704             UTF_PREV_CHAR(norm, 0, offset, codepoint);
 705             result = !ignorable && (u_getCombiningClass(codepoint) != 0);
 706
 707             if (norm != buffer) {
 708                 uprv_free(norm);
 709             }
 710         }
 711     }
 712
 713     return result;
 714 }
 715
 716 /**
 717 * Used by exact matches, checks if there are accents before the match.
 718 * This is really painful... we have to check that composite characters at
 719 * the start of the matches have to not have any extra accents.
 720 * We check the FCD of the character first, if it starts with an accent and
 721 * the first pattern ce does not match the first ce of the character, we bail.
 722 * Otherwise we try normalizing the first composite
 723 * character and find the immediate decomposed character before the match to
 724 * see if it is an non-ignorable accent.
 725 * Now normalizing the first composite character is enough because we ensure
 726 * that when the match is passed in here with extra beginning ces, the
 727 * first or last ce that match has to occur within the first character.
 728 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
 729 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
 730 * Note here that accents checking are slow and cautioned in the API docs.
 731 * @param strsrch string search data
 732 * @param start offset
 733 * @param end offset
 734 * @return TRUE if there are accents on either side of the match,
 735 *         FALSE otherwise
 736 */
 737 static
 738 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
 739                                   int32_t    end)
 740 {
 741     if (strsrch->pattern.hasPrefixAccents) {
 742         UCollationElements *coleiter  = strsrch->textIter;
 743         UErrorCode          status    = U_ZERO_ERROR;
 744         // we have been iterating forwards previously
 745         uint32_t            ignorable = TRUE;
 746         uint32_t            firstce   = strsrch->pattern.CE[0];
 747
 748                 setColEIterOffset(coleiter, start);
 749         uint32_t ce  = getCE(strsrch, ucol_next(coleiter, &status));
 750                 if (U_FAILURE(status)) {
 751             return TRUE;
 752         }
 753         while (ce != firstce) {
 754             if (ce != UCOL_IGNORABLE) {
 755                 ignorable = FALSE;
 756             }
 757             ce = getCE(strsrch, ucol_next(coleiter, &status));
 758             if (U_FAILURE(status)) {
 759                 return TRUE;
 760             }
 761         }
 762                 if (!ignorable && inNormBuf(coleiter)) {
 763             // within normalization buffer, discontiguous handled here
 764                     return TRUE;
 765         }
 766
 767                 // within text
 768         int32_t temp = start;
 769                 // original code
 770                 // accent = (getFCD(strsrch->search->text, &temp,
 771         //                  strsrch->search->textLength)
 772                 //           >> SECOND_LAST_BYTE_SHIFT_);
 773                 // however this code does not work well with VC7 .net in release mode.
 774                 // maybe the inlines for getFCD combined with shifting has bugs in
 775                 // VC7. anyways this is a work around.
 776                 UBool accent = getFCD(strsrch->search->text, &temp,
 777                               strsrch->search->textLength) > 0xFF;
 778         if (!accent) {
 779                         return checkExtraMatchAccents(strsrch, start, end, &status);
 780         }
 781                 if (!ignorable) {
 782             return TRUE;
 783         }
 784         if (start > 0) {
 785             temp = start;
 786             UTF_BACK_1(strsrch->search->text, 0, temp);
 787             if (getFCD(strsrch->search->text, &temp,
 788                        strsrch->search->textLength) & LAST_BYTE_MASK_) {
 789                 setColEIterOffset(coleiter, start);
 790                 ce = ucol_previous(coleiter, &status);
 791                 if (U_FAILURE(status) ||
 792                     (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
 793                     return TRUE;
 794                 }
 795             }
 796         }
 797     }
 798
 799     return FALSE;
 800 }
 801
 802 /**
 803 * Used by exact matches, checks if there are accents bounding the match.
 804 * Note this is the initial boundary check. If the potential match
 805 * starts or ends with composite characters, the accents in those
 806 * characters will be determined later.
 807 * Not doing backwards iteration here, since discontiguos contraction for
 808 * backwards collation element iterator, use up too many characters.
 809 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
 810 * should fail since there is a acute at the end of \u01FA
 811 * Note here that accents checking are slow and cautioned in the API docs.
 812 * @param strsrch string search data
 813 * @param start offset of match
 814 * @param end end offset of the match
 815 * @return TRUE if there are accents on either side of the match,
 816 *         FALSE otherwise
 817 */
 818 static
 819 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
 820                                  int32_t    end)
 821 {
 822     if (strsrch->pattern.hasSuffixAccents) {
 823         const UChar       *text       = strsrch->search->text;
 824               int32_t  temp       = end;
 825               int32_t      textlength = strsrch->search->textLength;
 826         UTF_BACK_1(text, 0, temp);
 827         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
 828             uint32_t            firstce  = strsrch->pattern.CE[0];
 829             UCollationElements *coleiter = strsrch->textIter;
 830             UErrorCode          status   = U_ZERO_ERROR;
 831             setColEIterOffset(coleiter, start);
 832             while (getCE(strsrch, ucol_next(coleiter, &status)) != firstce) {
 833                 if (U_FAILURE(status)) {
 834                     return TRUE;
 835                 }
 836             }
 837             int32_t count = 1;
 838             while (count < strsrch->pattern.CELength) {
 839                 if (getCE(strsrch, ucol_next(coleiter, &status))
 840                     == UCOL_IGNORABLE) {
 841                     // Thai can give an ignorable here.
 842                     count --;
 843                 }
 844                 if (U_FAILURE(status)) {
 845                     return TRUE;
 846                 }
 847                 count ++;
 848             }
 849             uint32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
 850             if (U_FAILURE(status)) {
 851                 return TRUE;
 852             }
 853             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
 854                 if (ucol_getOffset(coleiter) <= end) {
 855                     return TRUE;
 856                 }
 857                 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
 858                     return TRUE;
 859                 }
 860             }
 861         }
 862     }
 863     return FALSE;
 864 }
 865
 866 /**
 867 * Checks if the offset runs out of the text string
 868 * @param offset
 869 * @param textlength of the text string
 870 * @return TRUE if offset is out of bounds, FALSE otherwise
 871 */
 872 static
 873 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
 874 {
 875     return offset < 0 || offset > textlength;
 876 }
 877
 878 /**
 879 * Checks for identical match
 880 * @param strsrch string search data
 881 * @param start offset of possible match
 882 * @param end offset of possible match
 883 * @return TRUE if identical match is found
 884 */
 885 static
 886 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
 887                                   int32_t    end)
 888 {
 889     int32_t length = end - start;
 890     if (strsrch->strength != UCOL_IDENTICAL) {
 891         return TRUE;
 892     }
 893
 894     UErrorCode status = U_ZERO_ERROR;
 895     int decomplength = unorm_decompose(NULL, -1,
 896                                        strsrch->search->text + start, length,
 897                                        FALSE, 0, &status);
 898     if (decomplength != unorm_decompose(NULL, -1, strsrch->pattern.text,
 899                                         strsrch->pattern.textLength,
 900                                         FALSE, 0, &status)) {
 901         return FALSE;
 902     }
 903     decomplength ++;
 904     UChar *text    = (UChar *)uprv_malloc(decomplength * sizeof(UChar));
 905     UChar *pattern = (UChar *)uprv_malloc(decomplength * sizeof(UChar));
 906     unorm_decompose(text, decomplength, strsrch->search->text + start,
 907                     length, FALSE, 0, &status);
 908     unorm_decompose(pattern, decomplength, strsrch->pattern.text,
 909                     strsrch->pattern.textLength, FALSE, 0, &status);
 910     UBool result = (uprv_memcmp(pattern, text, decomplength * sizeof(UChar))
 911                     == 0);
 912     uprv_free(text);
 913     uprv_free(pattern);
 914     return result;
 915 }
 916
 917 /**
 918 * Checks to see if the match is repeated
 919 * @param strsrch string search data
 920 * @param start new match start index
 921 * @param end new match end index
 922 * @return TRUE if the the match is repeated, FALSE otherwise
 923 */
 924 static
 925 inline UBool checkRepeatedMatch(UStringSearch *strsrch,
 926                                 int32_t    start,
 927                                 int32_t    end)
 928 {
 929     int32_t lastmatchindex = strsrch->search->matchedIndex;
 930     UBool       result;
 931     if (lastmatchindex == USEARCH_DONE) {
 932         return FALSE;
 933     }
 934     if (strsrch->search->isForwardSearching) {
 935         result = start <= lastmatchindex;
 936     }
 937     else {
 938         result = start >= lastmatchindex;
 939     }
 940     if (!strsrch->search->isOverlap) {
 941         if (strsrch->search->isForwardSearching) {
 942             result = start < lastmatchindex + strsrch->search->matchedLength;
 943         }
 944         else {
 945             result = end > lastmatchindex;
 946         }
 947     }
 948     return result;
 949 }
 950
 951 /**
 952 * Gets the collation element iterator's current offset.
 953 * @param coleiter collation element iterator
 954 * @param forwards flag TRUE if we are moving in th forwards direction
 955 * @return current offset
 956 */
 957 static
 958 inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
 959                                               UBool               forwards)
 960 {
 961     int32_t result = ucol_getOffset(coleiter);
 962     // intricacies of the the backwards collation element iterator
 963     if (!forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
 964         result ++;
 965     }
 966     return result;
 967 }
 968
/**
* Checks match for contraction.
* If the match ends with a partial contraction we fail.
* If the match starts too far off (because of backwards iteration) we try to
* chip off the extra characters by moving *start forward.
* The check is only performed when the code unit at *end or the one at
* *start + 1 is reported as unsafe by ucol_unsafeCP; otherwise the match is
* accepted unchanged.
* On a failed check *end is advanced by one and then moved on with
* getNextUStringSearchBaseOffset, so that the caller can resume searching
* from there.
* Internal method, error assumed to be success, caller has to check status
* before calling this method.
* @param strsrch string search data
* @param start offset of potential match, to be modified if necessary
* @param end offset of potential match, to be modified if necessary
* @param status output error status if any
* @return TRUE if match passes the contraction test, FALSE otherwise
*/

static
UBool checkNextExactContractionMatch(UStringSearch *strsrch,
                                     int32_t   *start,
                                     int32_t   *end, UErrorCode  *status)
{
          UCollationElements *coleiter   = strsrch->textIter;
          int32_t             textlength = strsrch->search->textLength;
          int32_t         temp       = *start;
    const UCollator          *collator   = strsrch->collator;
    const UChar              *text       = strsrch->search->text;
    // This part checks if either end of the match contains a potential
    // contraction. If so we'll have to iterate through them.
    // The start contraction needs to be checked since ucol_previous dumps
    // all characters till the first safe character into the buffer.
    // *start + 1 is used to test for the unsafe characters instead of *start
    // because ucol_previous takes all unsafe characters till the first safe
    // character ie *start. So by testing *start + 1, we can estimate if
    // excess prefix characters have been included in the potential search
    // results.
    if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
        (*start + 1 < textlength
         && ucol_unsafeCP(text[*start + 1], collator))) {
        // the expansion prefix has to be read before the iterator is reset
        // by setColEIterOffset below
        int32_t expansion  = getExpansionPrefix(coleiter);
        UBool   expandflag = expansion > 0;
        setColEIterOffset(coleiter, *start);
        while (expansion > 0) {
            // getting rid of the redundant ce, caused by setOffset.
            // since backward contraction/expansion may have extra ces if we
            // are in the normalization buffer, hasAccentsBeforeMatch would
            // have taken care of it.
            // E.g. the character \u01FA will have an expansion of 3, but if
            // we are only looking for acute and ring \u030A and \u0301, we'll
            // have to skip the first ce in the expansion buffer.
            ucol_next(coleiter, status);
                        if (U_FAILURE(*status)) {
                                return FALSE;
                        }
            // whenever the iterator offset moves, the previously remembered
            // offset becomes the new start of the match
            if (ucol_getOffset(coleiter) != temp) {
                *start = temp;
                temp  = ucol_getOffset(coleiter);
            }
            expansion --;
        }

        // iterate forwards and compare against every pattern ce in order,
        // skipping ignorable text ces
        uint32_t *patternce       = strsrch->pattern.CE;
        int32_t   patterncelength = strsrch->pattern.CELength;
        int32_t   count           = 0;
        while (count < patterncelength) {
            uint32_t ce = getCE(strsrch, ucol_next(coleiter, status));
            if (ce == UCOL_IGNORABLE) {
                continue;
            }
            // only for the first compared ce, and only when redundant ces
            // were skipped above: adjust the start if the offset has moved
            if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
                *start = temp;
                temp   = ucol_getOffset(coleiter);
            }
            if (U_FAILURE(*status) || ce != patternce[count]) {
                // mismatch, move the end offset on for the next attempt
                (*end) ++;
                *end = getNextUStringSearchBaseOffset(strsrch, *end);
                return FALSE;
            }
            count ++;
        }
    }
    return TRUE;
}
1050
1051 /**
1052 * Checks and sets the match information if found.
1053 * Checks
1054 * <ul>
1055 * <li> the potential match does not repeat the previous match
1056 * <li> boundaries are correct
1057 * <li> exact matches has no extra accents
1058 * <li> identical matchesb
1059 * <li> potential match does not end in the middle of a contraction
1060 * <\ul>
1061 * Otherwise the offset will be shifted to the next character.
1062 * Internal method, status assumed to be success, caller has to check status
1063 * before calling this method.
1064 * @param strsrch string search data
1065 * @param textoffset offset in the collation element text. the returned value
1066 *        will be the truncated end offset of the match or the new start
1067 *        search offset.
1068 * @param status output error status if any
1069 * @return TRUE if the match is valid, FALSE otherwise
1070 */
1071 static
1072 inline UBool checkNextExactMatch(UStringSearch *strsrch,
1073                                  int32_t   *textoffset, UErrorCode *status)
1074 {
1075     UCollationElements *coleiter = strsrch->textIter;
1076     int32_t         start    = getColElemIterOffset(coleiter, FALSE);
1077
1078         if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
1079             return FALSE;
1080     }
1081
1082     // this totally matches, however we need to check if it is repeating
1083     if (!isBreakUnit(strsrch, start, *textoffset) ||
1084         checkRepeatedMatch(strsrch, start, *textoffset) ||
1085         hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
1086         !checkIdentical(strsrch, start, *textoffset) ||
1087         hasAccentsAfterMatch(strsrch, start, *textoffset)) {
1088
1089                 (*textoffset) ++;
1090         *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
1091                 return FALSE;
1092     }
1093
1094     // totally match, we will get rid of the ending ignorables.
1095     strsrch->search->matchedIndex  = start;
1096     strsrch->search->matchedLength = *textoffset - start;
1097         return TRUE;
1098 }
1099
1100 /**
1101 * Getting the previous base character offset, or the current offset if the
1102 * current character is a base character
1103 * @param text string
1104 * @param textoffset one offset after the current character
1105 * @return the offset of the next character after the base character or the first
1106 *         composed character with accents
1107 */
1108 static
1109 inline int32_t getPreviousBaseOffset(const UChar       *text,
1110                                                int32_t  textoffset)
1111 {
1112     if (textoffset > 0) {
1113         while (TRUE) {
1114             int32_t result = textoffset;
1115             UTF_BACK_1(text, 0, textoffset);
1116             int32_t temp = textoffset;
1117             uint16_t fcd = getFCD(text, &temp, result);
1118             if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1119                 if (fcd & LAST_BYTE_MASK_) {
1120                     return textoffset;
1121                 }
1122                 return result;
1123             }
1124             if (textoffset == 0) {
1125                 return 0;
1126             }
1127         }
1128     }
1129     return textoffset;
1130 }
1131
1132 /**
1133 * Getting the indexes of the accents that are not blocked in the argument
1134 * accent array
1135 * @param accents array of accents in nfd terminated by a 0.
1136 * @param accentsindex array of indexes of the accents that are not blocked
1137 */
1138 static
1139 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
1140 {
1141     int32_t index     = 0;
1142     int32_t     length    = u_strlen(accents);
1143     UChar32     codepoint = 0;
1144     int         cclass    = 0;
1145     int         result    = 0;
1146     int32_t temp;
1147     while (index < length) {
1148         temp = index;
1149         UTF_NEXT_CHAR(accents, index, length, codepoint);
1150         if (u_getCombiningClass(codepoint) != cclass) {
1151             cclass        = u_getCombiningClass(codepoint);
1152             accentsindex[result] = temp;
1153             result ++;
1154         }
1155     }
1156     accentsindex[result] = length;
1157     return result;
1158 }
1159
1160 /**
1161 * Appends 3 UChar arrays to a destination array.
1162 * Creates a new array if we run out of space. The caller will have to
1163 * manually deallocate the newly allocated array.
1164 * Internal method, status assumed to be success, caller has to check status
1165 * before calling this method. destination not to be NULL and has at least
1166 * size destinationlength.
1167 * @param destination target array
1168 * @param destinationlength target array size, returning the appended length
1169 * @param source1 null-terminated first array
1170 * @param source2 second array
1171 * @param source2length length of seond array
1172 * @param source3 null-terminated third array
1173 * @param status error status if any
1174 * @return new destination array, destination if there was no new allocation
1175 */
1176 static
1177 inline UChar * addToUCharArray(      UChar      *destination,
1178                                      int32_t    *destinationlength,
1179                                const UChar      *source1,
1180                                const UChar      *source2,
1181                                      int32_t     source2length,
1182                                const UChar      *source3,
1183                                      UErrorCode *status)
1184 {
1185     int32_t source1length = source1 ? u_strlen(source1) : 0;
1186     int32_t source3length = source3 ? u_strlen(source3) : 0;
1187     if (*destinationlength < source1length + source2length + source3length +
1188                                                                            1)
1189     {
1190         destination = (UChar *)allocateMemory(
1191           (source1length + source2length + source3length + 1) * sizeof(UChar),
1192           status);
1193         // if error allocating memory, status will be
1194         // U_MEMORY_ALLOCATION_ERROR
1195         if (U_FAILURE(*status)) {
1196             *destinationlength = 0;
1197             return NULL;
1198         }
1199     }
1200     if (source1length != 0) {
1201         uprv_memcpy(destination, source1, sizeof(UChar) * source1length);
1202     }
1203     if (source2length != 0) {
1204         uprv_memcpy(destination + source1length, source2,
1205                     sizeof(UChar) * source2length);
1206     }
1207     if (source3length != 0) {
1208         uprv_memcpy(destination + source1length + source2length, source3,
1209                     sizeof(UChar) * source3length);
1210     }
1211     *destinationlength = source1length + source2length + source3length;
1212     return destination;
1213 }
1214
1215 /**
1216 * Running through a collation element iterator to see if the contents matches
1217 * pattern in string search data
1218 * @param strsrch string search data
1219 * @param coleiter collation element iterator
1220 * @return TRUE if a match if found, FALSE otherwise
1221 */
1222 static
1223 inline UBool checkCollationMatch(const UStringSearch      *strsrch,
1224                                        UCollationElements *coleiter)
1225 {
1226     int         patternceindex = strsrch->pattern.CELength;
1227     uint32_t   *patternce      = strsrch->pattern.CE;
1228     UErrorCode  status = U_ZERO_ERROR;
1229     while (patternceindex > 0) {
1230         uint32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
1231         if (ce == UCOL_IGNORABLE) {
1232             continue;
1233         }
1234         if (U_FAILURE(status) || ce != *patternce) {
1235             return FALSE;
1236         }
1237         patternce ++;
1238         patternceindex --;
1239     }
1240     return TRUE;
1241 }
1242
1243 /**
1244 * Rearranges the front accents to try matching.
1245 * Prefix accents in the text will be grouped according to their combining
1246 * class and the groups will be mixed and matched to try find the perfect
1247 * match with the pattern.
1248 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1249 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1250 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1251 *         "\u0301\u0325".
1252 * step 2: check if any of the generated substrings matches the pattern.
1253 * Internal method, status is assumed to be success, caller has to check status
1254 * before calling this method.
1255 * @param strsrch string search match
1256 * @param start first offset of the accents to start searching
1257 * @param end start of the last accent set
1258 * @param status output error status if any
1259 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1260 *         offset of the match. Note this start includes all preceding accents.
1261 */
1262 static
1263 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
1264                                        int32_t    start,
1265                                        int32_t    end,
1266                                        UErrorCode    *status)
1267 {
1268     const UChar       *text       = strsrch->search->text;
1269           int32_t      textlength = strsrch->search->textLength;
1270           int32_t  tempstart  = start;
1271
1272     if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
1273         // die... failed at a base character
1274         return USEARCH_DONE;
1275     }
1276
1277     int32_t offset = getNextBaseOffset(text, tempstart, textlength);
1278     start = getPreviousBaseOffset(text, tempstart);
1279
1280     UChar       accents[INITIAL_ARRAY_SIZE_];
1281     // normalizing the offensive string
1282     unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
1283                     INITIAL_ARRAY_SIZE_, status);
1284     if (U_FAILURE(*status)) {
1285         return USEARCH_DONE;
1286     }
1287
1288     int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
1289     int32_t         accentsize = getUnblockedAccentIndex(accents,
1290                                                                  accentsindex);
1291     int32_t         count      = (2 << (accentsize - 1)) - 2;
1292     UChar               buffer[INITIAL_ARRAY_SIZE_];
1293     UCollationElements *coleiter   = strsrch->utilIter;
1294     while (U_SUCCESS(*status) && count > 0) {
1295         UChar *rearrange = strsrch->canonicalPrefixAccents;
1296         // copy the base characters
1297         for (int k = 0; k < accentsindex[0]; k ++) {
1298             *rearrange ++ = accents[k];
1299         }
1300         // forming all possible canonical rearrangement by dropping
1301         // sets of accents
1302         for (int i = 0; i <= accentsize - 1; i ++) {
1303             int32_t mask = 1 << (accentsize - i - 1);
1304             if (count & mask) {
1305                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1306                     *rearrange ++ = accents[j];
1307                 }
1308             }
1309         }
1310         *rearrange = 0;
1311         int32_t  matchsize = INITIAL_ARRAY_SIZE_;
1312         UChar   *match     = addToUCharArray(buffer, &matchsize,
1313                                            strsrch->canonicalPrefixAccents,
1314                                            strsrch->search->text + offset,
1315                                            end - offset,
1316                                            strsrch->canonicalSuffixAccents,
1317                                            status);
1318
1319         // if status is a failure, ucol_setText does nothing.
1320         // run the collator iterator through this match
1321         ucol_setText(coleiter, match, matchsize, status);
1322         if (U_SUCCESS(*status)) {
1323             if (checkCollationMatch(strsrch, coleiter)) {
1324                 if (match != buffer) {
1325                     uprv_free(match);
1326                 }
1327                 return start;
1328             }
1329         }
1330         count --;
1331     }
1332     return USEARCH_DONE;
1333 }
1334
1335 /**
1336 * Gets the offset to the safe point in text before textoffset.
1337 * ie. not the middle of a contraction, swappable characters or supplementary
1338 * characters.
1339 * @param collator collation sata
1340 * @param text string to work with
1341 * @param textoffset offset in string
1342 * @param textlength length of text string
1343 * @return offset to the previous safe character
1344 */
1345 static
1346 inline uint32_t getPreviousSafeOffset(const UCollator   *collator,
1347                                       const UChar       *text,
1348                                             int32_t  textoffset)
1349 {
1350     int32_t result = textoffset; // first contraction character
1351     while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
1352         result --;
1353     }
1354     if (result != 0) {
1355         // the first contraction character is consider unsafe here
1356         result --;
1357     }
1358     return result;
1359 }
1360
1361 /**
1362 * Cleaning up after we passed the safe zone
1363 * @param strsrch string search data
1364 * @param safetext safe text array
1365 * @param safebuffer safe text buffer
1366 * @param coleiter collation element iterator for safe text
1367 */
1368 static
1369 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
1370                                   UChar         *safebuffer)
1371 {
1372     if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
1373     {
1374        uprv_free(safetext);
1375     }
1376 }
1377
/**
* Take the rearranged end accents and tries matching. If match failed at
* a separate preceding set of accents (separated from the rearranged one by
* at least a base character) then we rearrange the preceding accents and
* tries matching again.
* We allow skipping of the ends of the accent set if the ces do not match.
* However if the failure is found before the accent set, it fails.
* The pattern ces are compared back to front, first against a temporary
* "safe" text iterated with strsrch->utilIter and, once that text is used up,
* against the original text iterated with strsrch->textIter.
* Internal method, status assumed to be success, caller has to check status
* before calling this method.
* @param strsrch string search data
* @param textoffset of the start of the rearranged accent
* @param status output error status if any
* @return USEARCH_DONE if a match is not found, otherwise return the starting
*         offset of the match. Note this start includes all preceding accents.
*/
static
int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
                                       int32_t    textoffset,
                                       UErrorCode    *status)
{
    const UChar              *text           = strsrch->search->text;
    const UCollator          *collator       = strsrch->collator;
          int32_t             safelength     = 0;
          UChar              *safetext;
          int32_t             safetextlength;
          UChar               safebuffer[INITIAL_ARRAY_SIZE_];
          UCollationElements *coleiter       = strsrch->utilIter;
          int32_t         safeoffset     = textoffset;

    // build the safe text: if the first rearranged suffix accent is unsafe,
    // the text from the previous safe offset up to textoffset is put in
    // front of the suffix accents, otherwise the suffix accents are used
    // as they are
    if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
                                         collator)) {
        safeoffset     = getPreviousSafeOffset(collator, text, textoffset);
        safelength     = textoffset - safeoffset;
        safetextlength = INITIAL_ARRAY_SIZE_;
        // may hand back a newly allocated array, see cleanUpSafeText
        safetext       = addToUCharArray(safebuffer, &safetextlength, NULL,
                                         text + safeoffset, safelength,
                                         strsrch->canonicalSuffixAccents,
                                         status);
    }
    else {
        safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
        safetext       = strsrch->canonicalSuffixAccents;
    }

    // if status is a failure, ucol_setText does nothing
    ucol_setText(coleiter, safetext, safetextlength, status);
    // status checked in loop below

    uint32_t *ce        = strsrch->pattern.CE;
    uint32_t  celength  = strsrch->pattern.CELength;
    int       ceindex   = celength - 1;
    UBool     isSafe    = TRUE; // indication flag for position in safe zone

    // match the pattern ces from the last one to the first one
    while (ceindex >= 0) {
        uint32_t textce = ucol_previous(coleiter, status);
        if (U_FAILURE(*status)) {
            // once the safe zone has been left the safe text has already
            // been cleaned up
            if (isSafe) {
                cleanUpSafeText(strsrch, safetext, safebuffer);
            }
            return USEARCH_DONE;
        }
        if (textce == UCOL_NULLORDER) {
            // check if we have passed the safe buffer
            if (coleiter == strsrch->textIter) {
                // already on the original text, nothing left to look at
                cleanUpSafeText(strsrch, safetext, safebuffer);
                return USEARCH_DONE;
            }
            // safe text used up: release it and carry on in the original
            // text, just before the safe zone
            cleanUpSafeText(strsrch, safetext, safebuffer);
            // pointing safetext at safebuffer turns any later
            // cleanUpSafeText call into a no-op
            safetext = safebuffer;
            coleiter = strsrch->textIter;
            setColEIterOffset(coleiter, safeoffset);
            // status checked at the start of the loop
            isSafe = FALSE;
            continue;
        }
        textce = getCE(strsrch, textce);
        if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
            // do the beginning stuff
            int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
            if (isSafe && failedoffset >= safelength) {
                // alas... no hope. failed at rearranged accent set
                cleanUpSafeText(strsrch, safetext, safebuffer);
                return USEARCH_DONE;
            }
            else {
                if (isSafe) {
                    // translate the safe text offset into a text offset
                    failedoffset += safeoffset;
                    cleanUpSafeText(strsrch, safetext, safebuffer);
                }

                // try rearranging the front accents
                int32_t result = doNextCanonicalPrefixMatch(strsrch,
                                        failedoffset, textoffset, status);
                if (result != USEARCH_DONE) {
                    // reposition the text iterator at the start of the match
                    setColEIterOffset(strsrch->textIter, result);
                }
                if (U_FAILURE(*status)) {
                    return USEARCH_DONE;
                }
                return result;
            }
        }
        // ignorable text ces fall through here without consuming a
        // pattern ce
        if (textce == ce[ceindex]) {
            ceindex --;
        }
    }
    // set offset here
    if (isSafe) {
        // the whole pattern matched inside the safe text
        int32_t result     = getColElemIterOffset(coleiter, FALSE);
        // sets the text iterator here with the correct expansion and offset
        int32_t    leftoverces = getExpansionPrefix(coleiter);
        cleanUpSafeText(strsrch, safetext, safebuffer);
        if (result >= safelength) {
            // match starts within the rearranged accents
            result = textoffset;
        }
        else {
            // translate the safe text offset into a text offset
            result += safeoffset;
        }
        setColEIterOffset(strsrch->textIter, result);
        strsrch->textIter->iteratordata_.toReturn =
                       setExpansionPrefix(strsrch->textIter, leftoverces);
        return result;
    }

    // the match was completed in the original text
    return ucol_getOffset(coleiter);
}
1505
1506 /**
1507 * Trying out the substring and sees if it can be a canonical match.
1508 * This will try normalizing the end accents and arranging them into canonical
1509 * equivalents and check their corresponding ces with the pattern ce.
1510 * Suffix accents in the text will be grouped according to their combining
1511 * class and the groups will be mixed and matched to try find the perfect
1512 * match with the pattern.
1513 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1514 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1515 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1516 *         "\u0301\u0325".
1517 * step 2: check if any of the generated substrings matches the pattern.
1518 * Internal method, status assumed to be success, caller has to check status
1519 * before calling this method.
1520 * @param strsrch string search data
1521 * @param textoffset end offset in the collation element text that ends with
1522 *                   the accents to be rearranged
1523 * @param status error status if any
1524 * @return TRUE if the match is valid, FALSE otherwise
1525 */
1526 static
1527 UBool doNextCanonicalMatch(UStringSearch *strsrch,
1528                            int32_t    textoffset,
1529                            UErrorCode    *status)
1530 {
1531     const UChar       *text = strsrch->search->text;
1532           int32_t  temp = textoffset;
1533     UTF_BACK_1(text, 0, temp);
1534     if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
1535         UCollationElements *coleiter = strsrch->textIter;
1536         int32_t         offset   = getColElemIterOffset(coleiter, FALSE);
1537         if (strsrch->pattern.hasPrefixAccents) {
1538             offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
1539                                                 status);
1540             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
1541                 setColEIterOffset(coleiter, offset);
1542                 return TRUE;
1543             }
1544         }
1545         return FALSE;
1546     }
1547
1548     if (!strsrch->pattern.hasSuffixAccents) {
1549         return FALSE;
1550     }
1551
1552     UChar       accents[INITIAL_ARRAY_SIZE_];
1553     // offset to the last base character in substring to search
1554     int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
1555     // normalizing the offensive string
1556     unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
1557                                0, accents, INITIAL_ARRAY_SIZE_, status);
1558     // status checked in loop below
1559
1560     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1561     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
1562
1563     // 2 power n - 1 minus the full set of accents
1564     int32_t  count = (2 << (size - 1)) - 2;
1565     while (U_SUCCESS(*status) && count > 0) {
1566         UChar *rearrange = strsrch->canonicalSuffixAccents;
1567         // copy the base characters
1568         for (int k = 0; k < accentsindex[0]; k ++) {
1569             *rearrange ++ = accents[k];
1570         }
1571         // forming all possible canonical rearrangement by dropping
1572         // sets of accents
1573         for (int i = 0; i <= size - 1; i ++) {
1574             int32_t mask = 1 << (size - i - 1);
1575             if (count & mask) {
1576                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1577                     *rearrange ++ = accents[j];
1578                 }
1579             }
1580         }
1581         *rearrange = 0;
1582         int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
1583                                                         status);
1584         if (offset != USEARCH_DONE) {
1585             return TRUE; // match found
1586         }
1587         count --;
1588     }
1589     return FALSE;
1590 }
1591
1592 /**
1593 * Gets the previous base character offset depending on the string search
1594 * pattern data
1595 * @param strsrch string search data
1596 * @param textoffset current offset, current character
1597 * @return the offset of the next character after this base character or itself
1598 *         if it is a composed character with accents
1599 */
1600 static
1601 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
1602                                                       int32_t textoffset)
1603 {
1604     if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
1605         const UChar       *text = strsrch->search->text;
1606               int32_t  offset = textoffset;
1607         if (getFCD(text, &offset, strsrch->search->textLength) >>
1608                                                    SECOND_LAST_BYTE_SHIFT_) {
1609             return getPreviousBaseOffset(text, textoffset);
1610         }
1611     }
1612     return textoffset;
1613 }
1614
1615 /**
1616 * Checks match for contraction.
1617 * If the match ends with a partial contraction we fail.
1618 * If the match starts too far off (because of backwards iteration) we try to
1619 * chip off the extra characters
1620 * Internal method, status assumed to be success, caller has to check status
1621 * before calling this method.
1622 * @param strsrch string search data
1623 * @param start offset of potential match, to be modified if necessary
1624 * @param end offset of potential match, to be modified if necessary
1625 * @param status output error status if any
1626 * @return TRUE if match passes the contraction test, FALSE otherwise
1627 */
1628 static
1629 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
1630                                          int32_t   *start,
1631                                          int32_t   *end,
1632                                          UErrorCode    *status)
1633 {
1634           UCollationElements *coleiter   = strsrch->textIter;
1635           int32_t             textlength = strsrch->search->textLength;
1636           int32_t         temp       = *start;
1637     const UCollator          *collator   = strsrch->collator;
1638     const UChar              *text       = strsrch->search->text;
1639     // This part checks if either ends of the match contains potential
1640     // contraction. If so we'll have to iterate through them
1641         if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1642         (*start + 1 < textlength
1643          && ucol_unsafeCP(text[*start + 1], collator))) {
1644         int32_t expansion  = getExpansionPrefix(coleiter);
1645         UBool   expandflag = expansion > 0;
1646         setColEIterOffset(coleiter, *start);
1647         while (expansion > 0) {
1648             // getting rid of the redundant ce, caused by setOffset.
1649             // since backward contraction/expansion may have extra ces if we
1650             // are in the normalization buffer, hasAccentsBeforeMatch would
1651             // have taken care of it.
1652             // E.g. the character \u01FA will have an expansion of 3, but if
1653             // we are only looking for acute and ring \u030A and \u0301, we'll
1654             // have to skip the first ce in the expansion buffer.
1655             ucol_next(coleiter, status);
1656                         if (U_FAILURE(*status)) {
1657                                 return FALSE;
1658                         }
1659             if (ucol_getOffset(coleiter) != temp) {
1660                 *start = temp;
1661                 temp  = ucol_getOffset(coleiter);
1662             }
1663             expansion --;
1664         }
1665
1666         uint32_t *patternce       = strsrch->pattern.CE;
1667         int32_t   patterncelength = strsrch->pattern.CELength;
1668         int32_t   count           = 0;
1669         int32_t   textlength      = strsrch->search->textLength;
1670         while (count < patterncelength) {
1671             uint32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1672             // status checked below, note that if status is a failure
1673             // ucol_next returns UCOL_NULLORDER
1674             if (ce == UCOL_IGNORABLE) {
1675                 continue;
1676             }
1677             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1678                 *start = temp;
1679                 temp   = ucol_getOffset(coleiter);
1680             }
1681
1682             if (count == 0 && ce != patternce[0]) {
1683                 // accents may have extra starting ces, this occurs when a
1684                 // pure accent pattern is matched without rearrangement
1685                 // text \u0325\u0300 and looking for \u0300
1686                 uint32_t expected = patternce[0];
1687                 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
1688                     ce = getCE(strsrch, ucol_next(coleiter, status));
1689                     while (U_SUCCESS(*status) && ce != expected &&
1690                            ce != UCOL_NULLORDER &&
1691                            ucol_getOffset(coleiter) <= *end) {
1692                         ce = getCE(strsrch, ucol_next(coleiter, status));
1693                     }
1694                 }
1695             }
1696             if (U_FAILURE(*status) || ce != patternce[count]) {
1697                 (*end) ++;
1698                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1699                 return FALSE;
1700             }
1701             count ++;
1702         }
1703     }
1704     return TRUE;
1705 }
1706
1707 /**
1708 * Checks and sets the match information if found.
1709 * Checks
1710 * <ul>
1711 * <li> the potential match does not repeat the previous match
1712 * <li> boundaries are correct
1713 * <li> potential match does not end in the middle of a contraction
1714 * <li> identical matches
1715 * <\ul>
1716 * Otherwise the offset will be shifted to the next character.
1717 * Internal method, status assumed to be success, caller has to check the
1718 * status before calling this method.
1719 * @param strsrch string search data
1720 * @param textoffset offset in the collation element text. the returned value
1721 *        will be the truncated end offset of the match or the new start
1722 *        search offset.
1723 * @param status output error status if any
1724 * @return TRUE if the match is valid, FALSE otherwise
1725 */
1726 static
1727 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
1728                                      int32_t   *textoffset,
1729                                      UErrorCode    *status)
1730 {
1731     // to ensure that the start and ends are not composite characters
1732     UCollationElements *coleiter = strsrch->textIter;
1733     // if we have a canonical accent match
1734     if ((strsrch->pattern.hasSuffixAccents &&
1735         strsrch->canonicalSuffixAccents[0]) ||
1736         (strsrch->pattern.hasPrefixAccents &&
1737         strsrch->canonicalPrefixAccents[0])) {
1738         strsrch->search->matchedIndex  = getPreviousUStringSearchBaseOffset(
1739                                                     strsrch,
1740                                                     ucol_getOffset(coleiter));
1741         strsrch->search->matchedLength = *textoffset -
1742                                                 strsrch->search->matchedIndex;
1743         return TRUE;
1744     }
1745
1746     int32_t start = getColElemIterOffset(coleiter, FALSE);
1747     if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
1748                                             status) || U_FAILURE(*status)) {
1749         return FALSE;
1750     }
1751
1752     start = getPreviousUStringSearchBaseOffset(strsrch, start);
1753     // this totally matches, however we need to check if it is repeating
1754     if (checkRepeatedMatch(strsrch, start, *textoffset) ||
1755         !isBreakUnit(strsrch, start, *textoffset) ||
1756         !checkIdentical(strsrch, start, *textoffset)) {
1757         (*textoffset) ++;
1758         *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
1759                                         strsrch->search->textLength);
1760         return FALSE;
1761     }
1762
1763     strsrch->search->matchedIndex  = start;
1764     strsrch->search->matchedLength = *textoffset - start;
1765     return TRUE;
1766 }
1767
1768 /**
1769 * Shifting the collation element iterator position forward to prepare for
1770 * a preceding match. If the first character is a unsafe character, we'll only
1771 * shift by 1 to capture contractions, normalization etc.
1772 * Internal method, status assumed to be success, caller has to check status
1773 * before calling this method.
1774 * @param text strsrch string search data
1775 * @param textoffset start text position to do search
1776 * @param ce the text ce which failed the match.
1777 * @param patternceindex index of the ce within the pattern ce buffer which
1778 *        failed the match
1779 * @return final offset
1780 */
1781 static
1782 inline int32_t reverseShift(UStringSearch *strsrch,
1783                                 int32_t    textoffset,
1784                                 uint32_t       ce,
1785                                 int32_t        patternceindex)
1786 {
1787     if (strsrch->search->isOverlap) {
1788         if (textoffset != strsrch->search->textLength) {
1789             textoffset --;
1790         }
1791         else {
1792             textoffset -= strsrch->pattern.defaultShiftSize;
1793         }
1794     }
1795     else {
1796         if (ce != UCOL_NULLORDER) {
1797             int32_t shift = strsrch->pattern.backShift[hash(ce)];
1798
1799             // this is to adjust for characters in the middle of the substring
1800             // for matching that failed.
1801             int32_t adjust = patternceindex;
1802             if (adjust > 1 && shift > adjust) {
1803                 shift -= adjust - 1;
1804             }
1805             textoffset -= shift;
1806         }
1807         else {
1808             textoffset -= strsrch->pattern.defaultShiftSize;
1809         }
1810     }
1811     textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
1812     return textoffset;
1813 }
1814
1815 /**
1816 * Checks match for contraction.
1817 * If the match starts with a partial contraction we fail.
1818 * Internal method, status assumed to be success, caller has to check status
1819 * before calling this method.
1820 * @param strsrch string search data
1821 * @param start offset of potential match, to be modified if necessary
1822 * @param end offset of potential match, to be modified if necessary
1823 * @param status output error status if any
1824 * @return TRUE if match passes the contraction test, FALSE otherwise
1825 */
1826 static
1827 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
1828                                      int32_t   *start,
1829                                      int32_t   *end, UErrorCode  *status)
1830 {
1831           UCollationElements *coleiter   = strsrch->textIter;
1832           int32_t             textlength = strsrch->search->textLength;
1833           int32_t             temp       = *end;
1834     const UCollator          *collator   = strsrch->collator;
1835     const UChar              *text       = strsrch->search->text;
1836     // This part checks if either if the start of the match contains potential
1837     // contraction. If so we'll have to iterate through them
1838         // Since we used ucol_next while previously looking for the potential
1839         // match, this guarantees that our end will not be a partial contraction,
1840         // or a partial supplementary character.
1841     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
1842         int32_t expansion  = getExpansionSuffix(coleiter);
1843         UBool   expandflag = expansion > 0;
1844         setColEIterOffset(coleiter, *end);
1845         while (U_SUCCESS(*status) && expansion > 0) {
1846             // getting rid of the redundant ce
1847             // since forward contraction/expansion may have extra ces
1848             // if we are in the normalization buffer, hasAccentsBeforeMatch
1849             // would have taken care of it.
1850             // E.g. the character \u01FA will have an expansion of 3, but if
1851             // we are only looking for A ring A\u030A, we'll have to skip the
1852             // last ce in the expansion buffer
1853             ucol_previous(coleiter, status);
1854                         if (U_FAILURE(*status)) {
1855                                 return FALSE;
1856                         }
1857             if (ucol_getOffset(coleiter) != temp) {
1858                 *end = temp;
1859                 temp  = ucol_getOffset(coleiter);
1860             }
1861             expansion --;
1862         }
1863
1864         uint32_t *patternce       = strsrch->pattern.CE;
1865         int32_t   patterncelength = strsrch->pattern.CELength;
1866         int32_t   count           = patterncelength;
1867         while (count > 0) {
1868             uint32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
1869             // status checked below, note that if status is a failure
1870             // ucol_previous returns UCOL_NULLORDER
1871             if (ce == UCOL_IGNORABLE) {
1872                 continue;
1873             }
1874             if (expandflag && count == 0 &&
1875                 getColElemIterOffset(coleiter, FALSE) != temp) {
1876                 *end = temp;
1877                 temp  = ucol_getOffset(coleiter);
1878             }
1879             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
1880                 (*start) --;
1881                 *start = getPreviousBaseOffset(text, *start);
1882                 return FALSE;
1883             }
1884             count --;
1885         }
1886     }
1887     return TRUE;
1888 }
1889
1890 /**
1891 * Checks and sets the match information if found.
1892 * Checks
1893 * <ul>
1894 * <li> the current match does not repeat the last match
1895 * <li> boundaries are correct
1896 * <li> exact matches has no extra accents
1897 * <li> identical matches
1898 * <\ul>
1899 * Otherwise the offset will be shifted to the preceding character.
1900 * Internal method, status assumed to be success, caller has to check status
1901 * before calling this method.
1902 * @param strsrch string search data
1903 * @param collator
1904 * @param coleiter collation element iterator
1905 * @param text string
1906 * @param textoffset offset in the collation element text. the returned value
1907 *        will be the truncated start offset of the match or the new start
1908 *        search offset.
1909 * @param status output error status if any
1910 * @return TRUE if the match is valid, FALSE otherwise
1911 */
1912 static
1913 inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
1914                                      int32_t   *textoffset,
1915                                      UErrorCode    *status)
1916 {
1917     // to ensure that the start and ends are not composite characters
1918     int32_t end = ucol_getOffset(strsrch->textIter);
1919     if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
1920         || U_FAILURE(*status)) {
1921             return FALSE;
1922     }
1923
1924     // this totally matches, however we need to check if it is repeating
1925     // the old match
1926     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
1927         !isBreakUnit(strsrch, *textoffset, end) ||
1928         hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
1929         !checkIdentical(strsrch, *textoffset, end) ||
1930         hasAccentsAfterMatch(strsrch, *textoffset, end)) {
1931         (*textoffset) --;
1932         *textoffset = getPreviousBaseOffset(strsrch->search->text,
1933                                             *textoffset);
1934         return FALSE;
1935     }
1936     strsrch->search->matchedIndex = *textoffset;
1937     strsrch->search->matchedLength = end - *textoffset;
1938     return TRUE;
1939 }
1940
1941 /**
1942 * Rearranges the end accents to try matching.
1943 * Suffix accents in the text will be grouped according to their combining
1944 * class and the groups will be mixed and matched to try find the perfect
1945 * match with the pattern.
1946 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1947 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1948 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1949 *         "\u0301\u0325".
1950 * step 2: check if any of the generated substrings matches the pattern.
1951 * Internal method, status assumed to be success, user has to check status
1952 * before calling this method.
1953 * @param strsrch string search match
1954 * @param start offset of the first base character
1955 * @param end start of the last accent set
1956 * @param status only error status if any
1957 * @return USEARCH_DONE if a match is not found, otherwise return the ending
1958 *         offset of the match. Note this start includes all following accents.
1959 */
1960 static
1961 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
1962                                            int32_t    start,
1963                                            int32_t    end,
1964                                            UErrorCode    *status)
1965 {
1966     const UChar       *text       = strsrch->search->text;
1967           int32_t  tempend    = end;
1968
1969     UTF_BACK_1(text, 0, tempend);
1970     if (!(getFCD(text, &tempend, strsrch->search->textLength) &
1971                                                            LAST_BYTE_MASK_)) {
1972         // die... failed at a base character
1973         return USEARCH_DONE;
1974     }
1975     end = getNextBaseOffset(text, end, strsrch->search->textLength);
1976
1977     if (U_SUCCESS(*status)) {
1978         UChar       accents[INITIAL_ARRAY_SIZE_];
1979         int32_t offset = getPreviousBaseOffset(text, end);
1980         // normalizing the offensive string
1981         unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
1982                         INITIAL_ARRAY_SIZE_, status);
1983
1984         int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
1985         int32_t         accentsize = getUnblockedAccentIndex(accents,
1986                                                          accentsindex);
1987         int32_t         count      = (2 << (accentsize - 1)) - 2;
1988         UChar               buffer[INITIAL_ARRAY_SIZE_];
1989         UCollationElements *coleiter = strsrch->utilIter;
1990         while (U_SUCCESS(*status) && count > 0) {
1991             UChar *rearrange = strsrch->canonicalSuffixAccents;
1992             // copy the base characters
1993             for (int k = 0; k < accentsindex[0]; k ++) {
1994                 *rearrange ++ = accents[k];
1995             }
1996             // forming all possible canonical rearrangement by dropping
1997             // sets of accents
1998             for (int i = 0; i <= accentsize - 1; i ++) {
1999                 int32_t mask = 1 << (accentsize - i - 1);
2000                 if (count & mask) {
2001                     for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2002                         *rearrange ++ = accents[j];
2003                     }
2004                 }
2005             }
2006             *rearrange = 0;
2007             int32_t  matchsize = INITIAL_ARRAY_SIZE_;
2008             UChar   *match     = addToUCharArray(buffer, &matchsize,
2009                                            strsrch->canonicalPrefixAccents,
2010                                            strsrch->search->text + start,
2011                                            offset - start,
2012                                            strsrch->canonicalSuffixAccents,
2013                                            status);
2014
2015             // run the collator iterator through this match
2016             // if status is a failure ucol_setText does nothing
2017             ucol_setText(coleiter, match, matchsize, status);
2018             if (U_SUCCESS(*status)) {
2019                 if (checkCollationMatch(strsrch, coleiter)) {
2020                     if (match != buffer) {
2021                         uprv_free(match);
2022                     }
2023                     return end;
2024                 }
2025             }
2026             count --;
2027         }
2028     }
2029     return USEARCH_DONE;
2030 }
2031
2032 /**
2033 * Take the rearranged start accents and tries matching. If match failed at
2034 * a seperate following set of accents (seperated from the rearranged on by
2035 * at least a base character) then we rearrange the preceding accents and
2036 * tries matching again.
2037 * We allow skipping of the ends of the accent set if the ces do not match.
2038 * However if the failure is found before the accent set, it fails.
2039 * Internal method, status assumed to be success, caller has to check status
2040 * before calling this method.
2041 * @param strsrch string search data
2042 * @param textoffset of the ends of the rearranged accent
2043 * @param status output error status if any
2044 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2045 *         offset of the match. Note this start includes all following accents.
2046 */
2047 static
2048 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
2049                                            int32_t    textoffset,
2050                                            UErrorCode    *status)
2051 {
2052     const UChar       *text       = strsrch->search->text;
2053     const UCollator   *collator   = strsrch->collator;
2054           int32_t      safelength = 0;
2055           UChar       *safetext;
2056           int32_t      safetextlength;
2057           UChar        safebuffer[INITIAL_ARRAY_SIZE_];
2058           int32_t  safeoffset = textoffset;
2059
2060     if (textoffset &&
2061         ucol_unsafeCP(strsrch->canonicalPrefixAccents[
2062                                  u_strlen(strsrch->canonicalPrefixAccents) - 1
2063                                          ], collator)) {
2064         safeoffset     = getNextSafeOffset(collator, text, textoffset,
2065                                            strsrch->search->textLength);
2066         safelength     = safeoffset - textoffset;
2067         safetextlength = INITIAL_ARRAY_SIZE_;
2068         safetext       = addToUCharArray(safebuffer, &safetextlength,
2069                                          strsrch->canonicalPrefixAccents,
2070                                          text + textoffset, safelength,
2071                                          NULL, status);
2072     }
2073     else {
2074         safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
2075         safetext       = strsrch->canonicalPrefixAccents;
2076     }
2077
2078     UCollationElements *coleiter = strsrch->utilIter;
2079      // if status is a failure, ucol_setText does nothing
2080     ucol_setText(coleiter, safetext, safetextlength, status);
2081     // status checked in loop below
2082
2083     uint32_t *ce           = strsrch->pattern.CE;
2084     int32_t   celength     = strsrch->pattern.CELength;
2085     int       ceindex      = 0;
2086     UBool     isSafe       = TRUE; // safe zone indication flag for position
2087     int32_t   prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
2088
2089     while (ceindex < celength) {
2090         uint32_t textce = ucol_next(coleiter, status);
2091         if (U_FAILURE(*status)) {
2092             if (isSafe) {
2093                 cleanUpSafeText(strsrch, safetext, safebuffer);
2094             }
2095             return USEARCH_DONE;
2096         }
2097         if (textce == UCOL_NULLORDER) {
2098             // check if we have passed the safe buffer
2099             if (coleiter == strsrch->textIter) {
2100                 cleanUpSafeText(strsrch, safetext, safebuffer);
2101                 return USEARCH_DONE;
2102             }
2103             cleanUpSafeText(strsrch, safetext, safebuffer);
2104             safetext = safebuffer;
2105             coleiter = strsrch->textIter;
2106             setColEIterOffset(coleiter, safeoffset);
2107             // status checked at the start of the loop
2108             isSafe = FALSE;
2109             continue;
2110         }
2111         textce = getCE(strsrch, textce);
2112         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
2113             // do the beginning stuff
2114             int32_t failedoffset = ucol_getOffset(coleiter);
2115             if (isSafe && failedoffset <= prefixlength) {
2116                 // alas... no hope. failed at rearranged accent set
2117                 cleanUpSafeText(strsrch, safetext, safebuffer);
2118                 return USEARCH_DONE;
2119             }
2120             else {
2121                 if (isSafe) {
2122                     failedoffset = safeoffset - failedoffset;
2123                     cleanUpSafeText(strsrch, safetext, safebuffer);
2124                 }
2125
2126                 // try rearranging the end accents
2127                 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
2128                                         textoffset, failedoffset, status);
2129                 if (result != USEARCH_DONE) {
2130                     // if status is a failure, ucol_setOffset does nothing
2131                     setColEIterOffset(strsrch->textIter, result);
2132                 }
2133                 if (U_FAILURE(*status)) {
2134                     return USEARCH_DONE;
2135                 }
2136                 return result;
2137             }
2138         }
2139         if (textce == ce[ceindex]) {
2140             ceindex ++;
2141         }
2142     }
2143     // set offset here
2144     if (isSafe) {
2145         int32_t result      = ucol_getOffset(coleiter);
2146         // sets the text iterator here with the correct expansion and offset
2147         int32_t     leftoverces = getExpansionSuffix(coleiter);
2148         cleanUpSafeText(strsrch, safetext, safebuffer);
2149         if (result <= prefixlength) {
2150             result = textoffset;
2151         }
2152         else {
2153             result = textoffset + (safeoffset - result);
2154         }
2155         setColEIterOffset(strsrch->textIter, result);
2156         setExpansionSuffix(strsrch->textIter, leftoverces);
2157         return result;
2158     }
2159
2160     return ucol_getOffset(coleiter);
2161 }
2162
2163 /**
2164 * Trying out the substring and sees if it can be a canonical match.
2165 * This will try normalizing the starting accents and arranging them into
2166 * canonical equivalents and check their corresponding ces with the pattern ce.
2167 * Prefix accents in the text will be grouped according to their combining
2168 * class and the groups will be mixed and matched to try find the perfect
2169 * match with the pattern.
2170 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2171 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2172 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2173 *         "\u0301\u0325".
2174 * step 2: check if any of the generated substrings matches the pattern.
2175 * Internal method, status assumed to be success, caller has to check status
2176 * before calling this method.
2177 * @param strsrch string search data
2178 * @param textoffset start offset in the collation element text that starts
2179 *                   with the accents to be rearranged
2180 * @param status output error status if any
2181 * @return TRUE if the match is valid, FALSE otherwise
2182 */
2183 static
2184 UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
2185                                int32_t    textoffset,
2186                                UErrorCode    *status)
2187 {
2188     const UChar       *text       = strsrch->search->text;
2189           int32_t  temp       = textoffset;
2190           int32_t      textlength = strsrch->search->textLength;
2191     if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
2192         UCollationElements *coleiter = strsrch->textIter;
2193         int32_t         offset   = ucol_getOffset(coleiter);
2194         if (strsrch->pattern.hasSuffixAccents) {
2195             offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
2196                                                     offset, status);
2197             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
2198                 setColEIterOffset(coleiter, offset);
2199                 return TRUE;
2200             }
2201         }
2202         return FALSE;
2203     }
2204
2205     if (!strsrch->pattern.hasPrefixAccents) {
2206         return FALSE;
2207     }
2208
2209     UChar       accents[INITIAL_ARRAY_SIZE_];
2210     // offset to the last base character in substring to search
2211     int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
2212     // normalizing the offensive string
2213     unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
2214                                0, accents, INITIAL_ARRAY_SIZE_, status);
2215     // status checked in loop
2216
2217     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2218     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
2219
2220     // 2 power n - 1 minus the full set of accents
2221     int32_t  count = (2 << (size - 1)) - 2;
2222     while (U_SUCCESS(*status) && count > 0) {
2223         UChar *rearrange = strsrch->canonicalPrefixAccents;
2224         // copy the base characters
2225         for (int k = 0; k < accentsindex[0]; k ++) {
2226             *rearrange ++ = accents[k];
2227         }
2228         // forming all possible canonical rearrangement by dropping
2229         // sets of accents
2230         for (int i = 0; i <= size - 1; i ++) {
2231             int32_t mask = 1 << (size - i - 1);
2232             if (count & mask) {
2233                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2234                     *rearrange ++ = accents[j];
2235                 }
2236             }
2237         }
2238         *rearrange = 0;
2239         int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
2240                                                           baseoffset, status);
2241         if (offset != USEARCH_DONE) {
2242             return TRUE; // match found
2243         }
2244         count --;
2245     }
2246     return FALSE;
2247 }
2248
2249 /**
2250 * Checks match for contraction.
2251 * If the match starts with a partial contraction we fail.
2252 * Internal method, status assumed to be success, caller has to check status
2253 * before calling this method.
2254 * @param strsrch string search data
2255 * @param start offset of potential match, to be modified if necessary
2256 * @param end offset of potential match, to be modified if necessary
2257 * @param status only error status if any
2258 * @return TRUE if match passes the contraction test, FALSE otherwise
2259 */
2260 static
2261 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
2262                                      int32_t   *start,
2263                                      int32_t   *end, UErrorCode  *status)
2264 {
2265           UCollationElements *coleiter   = strsrch->textIter;
2266           int32_t             textlength = strsrch->search->textLength;
2267           int32_t         temp       = *end;
2268     const UCollator          *collator   = strsrch->collator;
2269     const UChar              *text       = strsrch->search->text;
2270         // This part checks if either if the start of the match contains potential
2271     // contraction. If so we'll have to iterate through them
2272         // Since we used ucol_next while previously looking for the potential
2273         // match, this guarantees that our end will not be a partial contraction,
2274         // or a partial supplementary character.
2275     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2276         int32_t expansion  = getExpansionSuffix(coleiter);
2277         UBool   expandflag = expansion > 0;
2278         setColEIterOffset(coleiter, *end);
2279         while (expansion > 0) {
2280             // getting rid of the redundant ce
2281             // since forward contraction/expansion may have extra ces
2282             // if we are in the normalization buffer, hasAccentsBeforeMatch
2283             // would have taken care of it.
2284             // E.g. the character \u01FA will have an expansion of 3, but if
2285             // we are only looking for A ring A\u030A, we'll have to skip the
2286             // last ce in the expansion buffer
2287             ucol_previous(coleiter, status);
2288                         if (U_FAILURE(*status)) {
2289                                 return FALSE;
2290                         }
2291             if (ucol_getOffset(coleiter) != temp) {
2292                 *end = temp;
2293                 temp  = ucol_getOffset(coleiter);
2294             }
2295             expansion --;
2296         }
2297
2298         uint32_t *patternce       = strsrch->pattern.CE;
2299         int32_t   patterncelength = strsrch->pattern.CELength;
2300         int32_t   count           = patterncelength;
2301         while (count > 0) {
2302             uint32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2303             // status checked below, note that if status is a failure
2304             // ucol_previous returns UCOL_NULLORDER
2305             if (ce == UCOL_IGNORABLE) {
2306                 continue;
2307             }
2308             if (expandflag && count == 0 &&
2309                 getColElemIterOffset(coleiter, FALSE) != temp) {
2310                 *end = temp;
2311                 temp  = ucol_getOffset(coleiter);
2312             }
2313             if (count == patterncelength &&
2314                 ce != patternce[patterncelength - 1]) {
2315                 // accents may have extra starting ces, this occurs when a
2316                 // pure accent pattern is matched without rearrangement
2317                 uint32_t    expected = patternce[patterncelength - 1];
2318                 UTF_BACK_1(text, 0, *end);
2319                 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
2320                     ce = getCE(strsrch, ucol_previous(coleiter, status));
2321                     while (U_SUCCESS(*status) && ce != expected &&
2322                            ce != UCOL_NULLORDER &&
2323                            ucol_getOffset(coleiter) <= *start) {
2324                         ce = getCE(strsrch, ucol_previous(coleiter, status));
2325                     }
2326                 }
2327             }
2328             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2329                 (*start) --;
2330                 *start = getPreviousBaseOffset(text, *start);
2331                 return FALSE;
2332             }
2333             count --;
2334         }
2335     }
2336     return TRUE;
2337 }
2338
2339 /**
2340 * Checks and sets the match information if found.
2341 * Checks
2342 * <ul>
2343 * <li> the potential match does not repeat the previous match
2344 * <li> boundaries are correct
2345 * <li> potential match does not end in the middle of a contraction
2346 * <li> identical matches
2347 * <\ul>
2348 * Otherwise the offset will be shifted to the next character.
2349 * Internal method, status assumed to be success, caller has to check status
2350 * before calling this method.
2351 * @param strsrch string search data
2352 * @param textoffset offset in the collation element text. the returned value
2353 *        will be the truncated start offset of the match or the new start
2354 *        search offset.
2355 * @param status only error status if any
2356 * @return TRUE if the match is valid, FALSE otherwise
2357 */
2358 static
2359 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
2360                                          int32_t   *textoffset,
2361                                          UErrorCode    *status)
2362 {
2363     // to ensure that the start and ends are not composite characters
2364     UCollationElements *coleiter = strsrch->textIter;
2365     // if we have a canonical accent match
2366     if ((strsrch->pattern.hasSuffixAccents &&
2367         strsrch->canonicalSuffixAccents[0]) ||
2368         (strsrch->pattern.hasPrefixAccents &&
2369         strsrch->canonicalPrefixAccents[0])) {
2370         strsrch->search->matchedIndex  = *textoffset;
2371         strsrch->search->matchedLength =
2372             getNextUStringSearchBaseOffset(strsrch,
2373                                       getColElemIterOffset(coleiter, FALSE))
2374             - *textoffset;
2375         return TRUE;
2376     }
2377
2378     int32_t end = ucol_getOffset(coleiter);
2379     if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
2380                                                 status) ||
2381          U_FAILURE(*status)) {
2382         return FALSE;
2383     }
2384
2385     end = getNextUStringSearchBaseOffset(strsrch, end);
2386     // this totally matches, however we need to check if it is repeating
2387     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2388         !isBreakUnit(strsrch, *textoffset, end) ||
2389         !checkIdentical(strsrch, *textoffset, end)) {
2390         (*textoffset) --;
2391         *textoffset = getPreviousBaseOffset(strsrch->search->text,
2392                                             *textoffset);
2393         return FALSE;
2394     }
2395
2396     strsrch->search->matchedIndex  = *textoffset;
2397     strsrch->search->matchedLength = end - *textoffset;
2398     return TRUE;
2399 }
2400
2401 // constructors and destructor -------------------------------------------
2402
2403 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
2404                                           int32_t         patternlength,
2405                                     const UChar          *text,
2406                                           int32_t         textlength,
2407                                     const char           *locale,
2408                                           UBreakIterator *breakiter,
2409                                           UErrorCode     *status)
2410 {
2411     if (U_FAILURE(*status)) {
2412         return NULL;
2413     }
2414 #if UCONFIG_NO_BREAK_ITERATION
2415     if (breakiter != NULL) {
2416         *status = U_UNSUPPORTED_ERROR;
2417         return NULL;
2418     }
2419 #endif
2420     if (locale) {
2421         // ucol_open internally checks for status
2422         UCollator     *collator = ucol_open(locale, status);
2423         // pattern, text checks are done in usearch_openFromCollator
2424         UStringSearch *result   = usearch_openFromCollator(pattern,
2425                                               patternlength, text, textlength,
2426                                               collator, breakiter, status);
2427
2428         if (result == NULL || U_FAILURE(*status)) {
2429             if (collator) {
2430                 ucol_close(collator);
2431             }
2432             return NULL;
2433         }
2434         else {
2435             result->ownCollator = TRUE;
2436         }
2437         return result;
2438     }
2439     *status = U_ILLEGAL_ARGUMENT_ERROR;
2440     return NULL;
2441 }
2442
2443 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
2444                                   const UChar          *pattern,
2445                                         int32_t         patternlength,
2446                                   const UChar          *text,
2447                                         int32_t         textlength,
2448                                   const UCollator      *collator,
2449                                         UBreakIterator *breakiter,
2450                                         UErrorCode     *status)
2451 {
2452     if (U_FAILURE(*status)) {
2453         return NULL;
2454     }
2455 #if UCONFIG_NO_BREAK_ITERATION
2456     if (breakiter != NULL) {
2457         *status = U_UNSUPPORTED_ERROR;
2458         return NULL;
2459     }
2460 #endif
2461     if (pattern == NULL || text == NULL || collator == NULL) {
2462         *status = U_ILLEGAL_ARGUMENT_ERROR;
2463     }
2464
2465     if (U_SUCCESS(*status)) {
2466         initializeFCD(status);
2467         if (U_FAILURE(*status)) {
2468             return NULL;
2469         }
2470
2471         UStringSearch *result;
2472         if (textlength == -1) {
2473             textlength = u_strlen(text);
2474         }
2475         if (patternlength == -1) {
2476             patternlength = u_strlen(pattern);
2477         }
2478         if (textlength <= 0 || patternlength <= 0) {
2479             *status = U_ILLEGAL_ARGUMENT_ERROR;
2480             return NULL;
2481         }
2482
2483         result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
2484         if (result == NULL) {
2485             *status = U_MEMORY_ALLOCATION_ERROR;
2486             return NULL;
2487         }
2488
2489         result->collator    = collator;
2490         result->strength    = ucol_getStrength(collator);
2491         result->ceMask      = getMask(result->strength);
2492         result->toShift     =
2493              ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2494                                                             UCOL_SHIFTED;
2495         result->variableTop = ucol_getVariableTop(collator, status);
2496
2497         if (U_FAILURE(*status)) {
2498             uprv_free(result);
2499             return NULL;
2500         }
2501
2502         result->search             = (USearch *)uprv_malloc(sizeof(USearch));
2503         if (result->search == NULL) {
2504             *status = U_MEMORY_ALLOCATION_ERROR;
2505             uprv_free(result);
2506             return NULL;
2507         }
2508
2509         result->search->text       = text;
2510         result->search->textLength = textlength;
2511
2512         result->pattern.text       = pattern;
2513         result->pattern.textLength = patternlength;
2514         result->pattern.CE         = NULL;
2515
2516         result->search->breakIter  = breakiter;
2517 #if !UCONFIG_NO_BREAK_ITERATION
2518         if (breakiter) {
2519             ubrk_setText(breakiter, text, textlength, status);
2520         }
2521 #endif
2522
2523         result->ownCollator           = FALSE;
2524         result->search->matchedLength = 0;
2525         result->search->matchedIndex  = USEARCH_DONE;
2526         result->textIter              = ucol_openElements(collator, text,
2527                                                           textlength, status);
2528         if (U_FAILURE(*status)) {
2529             usearch_close(result);
2530             return NULL;
2531         }
2532
2533         result->utilIter              = NULL;
2534
2535         result->search->isOverlap          = FALSE;
2536         result->search->isCanonicalMatch   = FALSE;
2537         result->search->isForwardSearching = TRUE;
2538         result->search->reset              = TRUE;
2539
2540         initialize(result, status);
2541
2542         if (U_FAILURE(*status)) {
2543             usearch_close(result);
2544             return NULL;
2545         }
2546
2547         return result;
2548     }
2549     return NULL;
2550 }
2551
2552 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
2553 {
2554     if (strsrch) {
2555         if (strsrch->pattern.CE != strsrch->pattern.CEBuffer &&
2556             strsrch->pattern.CE) {
2557             uprv_free(strsrch->pattern.CE);
2558         }
2559         ucol_closeElements(strsrch->textIter);
2560         ucol_closeElements(strsrch->utilIter);
2561         if (strsrch->ownCollator && strsrch->collator) {
2562             ucol_close((UCollator *)strsrch->collator);
2563         }
2564         uprv_free(strsrch->search);
2565         uprv_free(strsrch);
2566     }
2567 }
2568
2569 // set and get methods --------------------------------------------------
2570
2571 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
2572                                         int32_t    position,
2573                                         UErrorCode    *status)
2574 {
2575     if (U_SUCCESS(*status) && strsrch) {
2576         if (isOutOfBounds(strsrch->search->textLength, position)) {
2577             *status = U_INDEX_OUTOFBOUNDS_ERROR;
2578         }
2579         else {
2580             setColEIterOffset(strsrch->textIter, position);
2581         }
2582         strsrch->search->matchedIndex  = USEARCH_DONE;
2583         strsrch->search->matchedLength = 0;
2584         strsrch->search->reset         = FALSE;
2585     }
2586 }
2587
2588 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
2589 {
2590     if (strsrch) {
2591         int32_t result = ucol_getOffset(strsrch->textIter);
2592         if (isOutOfBounds(strsrch->search->textLength, result)) {
2593             return USEARCH_DONE;
2594         }
2595         return result;
2596     }
2597     return USEARCH_DONE;
2598 }
2599
2600 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
2601                                  USearchAttribute attribute,
2602                                  USearchAttributeValue value,
2603                                  UErrorCode *status)
2604 {
2605     if (U_SUCCESS(*status) && strsrch) {
2606         switch (attribute)
2607         {
2608         case USEARCH_OVERLAP :
2609             strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
2610             break;
2611         case USEARCH_CANONICAL_MATCH :
2612             strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
2613                                                                       FALSE);
2614             break;
2615         case USEARCH_ATTRIBUTE_COUNT :
2616         default:
2617             *status = U_ILLEGAL_ARGUMENT_ERROR;
2618         }
2619     }
2620     if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
2621         *status = U_ILLEGAL_ARGUMENT_ERROR;
2622     }
2623 }
2624
2625 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
2626                                                 const UStringSearch *strsrch,
2627                                                 USearchAttribute attribute)
2628 {
2629     if (strsrch) {
2630         switch (attribute) {
2631         case USEARCH_OVERLAP :
2632             return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
2633                                                         USEARCH_OFF);
2634         case USEARCH_CANONICAL_MATCH :
2635             return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
2636                                                                USEARCH_OFF);
2637         case USEARCH_ATTRIBUTE_COUNT :
2638             return USEARCH_DEFAULT;
2639         }
2640     }
2641     return USEARCH_DEFAULT;
2642 }
2643
2644 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
2645                                                 const UStringSearch *strsrch)
2646 {
2647     if (strsrch == NULL) {
2648         return USEARCH_DONE;
2649     }
2650     return strsrch->search->matchedIndex;
2651 }
2652
2653
2654 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
2655                                             UChar         *result,
2656                                             int32_t        resultCapacity,
2657                                             UErrorCode    *status)
2658 {
2659     if (U_FAILURE(*status)) {
2660         return USEARCH_DONE;
2661     }
2662     if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
2663         result == NULL)) {
2664         *status = U_ILLEGAL_ARGUMENT_ERROR;
2665         return USEARCH_DONE;
2666     }
2667
2668     int32_t     copylength = strsrch->search->matchedLength;
2669     int32_t copyindex  = strsrch->search->matchedIndex;
2670     if (copyindex == USEARCH_DONE) {
2671         u_terminateUChars(result, resultCapacity, 0, status);
2672         return USEARCH_DONE;
2673     }
2674
2675     if (resultCapacity < copylength) {
2676         copylength = resultCapacity;
2677     }
2678     if (copylength > 0) {
2679         uprv_memcpy(result, strsrch->search->text + copyindex,
2680                     copylength * sizeof(UChar));
2681     }
2682     return u_terminateUChars(result, resultCapacity,
2683                              strsrch->search->matchedLength, status);
2684 }
2685
2686 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
2687                                               const UStringSearch *strsrch)
2688 {
2689     if (strsrch) {
2690         return strsrch->search->matchedLength;
2691     }
2692     return USEARCH_DONE;
2693 }
2694
2695 #if !UCONFIG_NO_BREAK_ITERATION
2696
2697 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch  *strsrch,
2698                                                UBreakIterator *breakiter,
2699                                                UErrorCode     *status)
2700 {
2701     if (U_SUCCESS(*status) && strsrch) {
2702         strsrch->search->breakIter = breakiter;
2703         if (breakiter) {
2704             ubrk_setText(breakiter, strsrch->search->text,
2705                          strsrch->search->textLength, status);
2706         }
2707     }
2708 }
2709
2710 U_CAPI const UBreakIterator* U_EXPORT2
2711 usearch_getBreakIterator(const UStringSearch *strsrch)
2712 {
2713     if (strsrch) {
2714         return strsrch->search->breakIter;
2715     }
2716     return NULL;
2717 }
2718
2719 #endif
2720
2721 U_CAPI void U_EXPORT2 usearch_setText(      UStringSearch *strsrch,
2722                                       const UChar         *text,
2723                                             int32_t        textlength,
2724                                             UErrorCode    *status)
2725 {
2726     if (U_SUCCESS(*status)) {
2727         if (strsrch == NULL || text == NULL || textlength < -1 ||
2728             textlength == 0) {
2729             *status = U_ILLEGAL_ARGUMENT_ERROR;
2730         }
2731         else {
2732             if (textlength == -1) {
2733                 textlength = u_strlen(text);
2734             }
2735             strsrch->search->text       = text;
2736             strsrch->search->textLength = textlength;
2737             ucol_setText(strsrch->textIter, text, textlength, status);
2738             strsrch->search->matchedIndex  = USEARCH_DONE;
2739             strsrch->search->matchedLength = 0;
2740             strsrch->search->reset         = TRUE;
2741 #if !UCONFIG_NO_BREAK_ITERATION
2742                         if (strsrch->search->breakIter != NULL) {
2743                                 ubrk_setText(strsrch->search->breakIter, text,
2744                                                          textlength, status);
2745                         }
2746 #endif
2747         }
2748     }
2749 }
2750
2751 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
2752                                                      int32_t       *length)
2753 {
2754     if (strsrch) {
2755         *length = strsrch->search->textLength;
2756         return strsrch->search->text;
2757     }
2758     return NULL;
2759 }
2760
2761 U_CAPI void U_EXPORT2 usearch_setCollator(      UStringSearch *strsrch,
2762                                           const UCollator     *collator,
2763                                                 UErrorCode    *status)
2764 {
2765     if (U_SUCCESS(*status)) {
2766         if (collator == NULL) {
2767             *status = U_ILLEGAL_ARGUMENT_ERROR;
2768             return;
2769         }
2770         if (strsrch) {
2771             if (strsrch->ownCollator && (strsrch->collator != collator)) {
2772                 ucol_close((UCollator *)strsrch->collator);
2773                 strsrch->ownCollator = FALSE;
2774             }
2775             strsrch->collator    = collator;
2776             strsrch->strength    = ucol_getStrength(collator);
2777             strsrch->ceMask      = getMask(strsrch->strength);
2778             // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
2779             strsrch->toShift     =
2780                ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2781                                                                 UCOL_SHIFTED;
2782             // if status is a failure, ucol_getVariableTop returns 0
2783             strsrch->variableTop = ucol_getVariableTop(collator, status);
2784             if (U_SUCCESS(*status)) {
2785                 initialize(strsrch, status);
2786                 if (U_SUCCESS(*status)) {
2787                     uprv_init_collIterate(collator, strsrch->search->text,
2788                                           strsrch->search->textLength,
2789                                           &(strsrch->textIter->iteratordata_));
2790                                         strsrch->utilIter->iteratordata_.coll = collator;
2791                 }
2792             }
2793         }
2794     }
2795 }
2796
2797 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
2798 {
2799     if (strsrch) {
2800         return (UCollator *)strsrch->collator;
2801     }
2802     return NULL;
2803 }
2804
2805 U_CAPI void U_EXPORT2 usearch_setPattern(      UStringSearch *strsrch,
2806                                          const UChar         *pattern,
2807                                                int32_t        patternlength,
2808                                                UErrorCode    *status)
2809 {
2810     if (U_SUCCESS(*status)) {
2811         if (strsrch == NULL || pattern == NULL) {
2812             *status = U_ILLEGAL_ARGUMENT_ERROR;
2813         }
2814         else {
2815             if (patternlength == -1) {
2816                 patternlength = u_strlen(pattern);
2817             }
2818             if (patternlength == 0) {
2819                 *status = U_ILLEGAL_ARGUMENT_ERROR;
2820                 return;
2821             }
2822             strsrch->pattern.text       = pattern;
2823             strsrch->pattern.textLength = patternlength;
2824             initialize(strsrch, status);
2825         }
2826     }
2827 }
2828
2829 U_CAPI const UChar* U_EXPORT2
2830 usearch_getPattern(const UStringSearch *strsrch,
2831                    int32_t       *length)
2832 {
2833     if (strsrch) {
2834         *length = strsrch->pattern.textLength;
2835         return strsrch->pattern.text;
2836     }
2837     return NULL;
2838 }
2839
// miscellaneous methods --------------------------------------------------
2841
2842 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
2843                                            UErrorCode    *status)
2844 {
2845     if (strsrch && U_SUCCESS(*status)) {
2846         strsrch->search->isForwardSearching = TRUE;
2847         usearch_setOffset(strsrch, 0, status);
2848         if (U_SUCCESS(*status)) {
2849             return usearch_next(strsrch, status);
2850         }
2851     }
2852     return USEARCH_DONE;
2853 }
2854
2855 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
2856                                                int32_t    position,
2857                                                UErrorCode    *status)
2858 {
2859     if (strsrch && U_SUCCESS(*status)) {
2860         strsrch->search->isForwardSearching = TRUE;
2861         // position checked in usearch_setOffset
2862         usearch_setOffset(strsrch, position, status);
2863         if (U_SUCCESS(*status)) {
2864             return usearch_next(strsrch, status);
2865         }
2866     }
2867     return USEARCH_DONE;
2868 }
2869
2870 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
2871                                           UErrorCode    *status)
2872 {
2873     if (strsrch && U_SUCCESS(*status)) {
2874         strsrch->search->isForwardSearching = FALSE;
2875         usearch_setOffset(strsrch, strsrch->search->textLength, status);
2876         if (U_SUCCESS(*status)) {
2877             return usearch_previous(strsrch, status);
2878         }
2879     }
2880     return USEARCH_DONE;
2881 }
2882
2883 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
2884                                                int32_t    position,
2885                                                UErrorCode    *status)
2886 {
2887     if (strsrch && U_SUCCESS(*status)) {
2888         strsrch->search->isForwardSearching = FALSE;
2889         // position checked in usearch_setOffset
2890         usearch_setOffset(strsrch, position, status);
2891         if (U_SUCCESS(*status)) {
2892             return usearch_previous(strsrch, status);
2893         }
2894     }
2895     return USEARCH_DONE;
2896 }
2897
2898 /**
2899 * If a direction switch is required, we'll count the number of ces till the
2900 * beginning of the collation element iterator and iterate forwards that
2901 * number of times. This is so that we get to the correct point within the
2902 * string to continue the search in. Imagine when we are in the middle of the
* normalization buffer when the change in direction is requested. arrrgghh....
2904 * After searching the offset within the collation element iterator will be
2905 * shifted to the start of the match. If a match is not found, the offset would
2906 * have been set to the end of the text string in the collation element
2907 * iterator.
2908 * Okay, here's my take on normalization buffer. The only time when there can
* be 2 matches within the same normalization buffer is when the pattern consists
2910 * of all accents. But since the offset returned is from the text string, we
2911 * should not confuse the caller by returning the second match within the
2912 * same normalization buffer. If we do, the 2 results will have the same match
2913 * offsets, and that'll be confusing. I'll return the next match that doesn't
2914 * fall within the same normalization buffer. Note this does not affect the
2915 * results of matches spanning the text and the normalization buffer.
2916 * The position to start searching is taken from the collation element
2917 * iterator. Callers of this API would have to set the offset in the collation
2918 * element iterator before using this method.
2919 */
2920 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
2921                                           UErrorCode    *status)
2922 {
2923     if (U_SUCCESS(*status) && strsrch) {
2924         int32_t  offset     = usearch_getOffset(strsrch);
2925         USearch     *search     = strsrch->search;
2926         search->reset           = FALSE;
2927         int32_t      textlength = search->textLength;
2928         int32_t  matchedindex = search->matchedIndex;
2929         if (search->isForwardSearching) {
2930             if (offset == textlength || matchedindex == textlength ||
2931                 (!search->isOverlap &&
2932                     (offset + strsrch->pattern.defaultShiftSize > textlength ||
2933                     (matchedindex != USEARCH_DONE &&
2934                     matchedindex + search->matchedLength >= textlength)))) {
2935                 // not enough characters to match
2936                 setMatchNotFound(strsrch);
2937                 return USEARCH_DONE;
2938             }
2939         }
2940         else {
2941             // switching direction.
2942             // if matchedIndex == USEARCH_DONE, it means that either a
2943             // setOffset has been called or that previous ran off the text
2944             // string. the iterator would have been set to offset 0 if a
2945             // match is not found.
2946             search->isForwardSearching = TRUE;
2947             if (matchedindex != USEARCH_DONE) {
2948                 // there's no need to set the collation element iterator
2949                 // the next call to next will set the offset.
2950                 return matchedindex;
2951             }
2952         }
2953
2954         if (U_SUCCESS(*status)) {
2955             if (strsrch->pattern.CELength == 0) {
2956                 if (matchedindex == USEARCH_DONE) {
2957                     search->matchedIndex = offset;
2958                 }
2959                 else { // moves by codepoints
2960                     UTF_FWD_1(search->text, search->matchedIndex, textlength);
2961                 }
2962
2963                 search->matchedLength = 0;
2964                 setColEIterOffset(strsrch->textIter, search->matchedIndex);
2965                 // status checked below
2966                 if (search->matchedIndex == textlength) {
2967                     search->matchedIndex = USEARCH_DONE;
2968                 }
2969             }
2970             else {
2971                                 if (search->matchedLength > 0) {
2972                                         // if matchlength is 0 we are at the start of the iteration
2973                                         int offset = ucol_getOffset(strsrch->textIter);
2974                                         if (search->isOverlap) {
2975                                                 ucol_setOffset(strsrch->textIter, offset + 1, status);
2976                                         }
2977                                         else {
2978                                                 ucol_setOffset(strsrch->textIter,
2979                                                                    offset + search->matchedLength, status);
2980                                         }
2981                                 }
2982                                 if (search->isCanonicalMatch) {
2983                                         // can't use exact here since extra accents are allowed.
2984                                         usearch_handleNextCanonical(strsrch, status);
2985                                 }
2986                                 else {
2987                                         usearch_handleNextExact(strsrch, status);
2988                                 }
2989                         }
2990
2991             if (U_FAILURE(*status)) {
2992                 return USEARCH_DONE;
2993             }
2994
2995             return search->matchedIndex;
2996         }
2997     }
2998     return USEARCH_DONE;
2999 }
3000
3001 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
3002                                               UErrorCode *status)
3003 {
3004     if (U_SUCCESS(*status) && strsrch) {
3005         int32_t offset;
3006         USearch *search = strsrch->search;
3007         if (search->reset) {
3008             offset                     = search->textLength;
3009             search->isForwardSearching = FALSE;
3010             search->reset              = FALSE;
3011             setColEIterOffset(strsrch->textIter, offset);
3012         }
3013         else {
3014             offset = usearch_getOffset(strsrch);
3015         }
3016
3017         int32_t matchedindex = search->matchedIndex;
3018         if (search->isForwardSearching == TRUE) {
3019             // switching direction.
3020             // if matchedIndex == USEARCH_DONE, it means that either a
3021             // setOffset has been called or that next ran off the text
3022             // string. the iterator would have been set to offset textLength if
3023             // a match is not found.
3024             search->isForwardSearching = FALSE;
3025             if (matchedindex != USEARCH_DONE) {
3026                 return matchedindex;
3027             }
3028         }
3029         else {
3030             if (offset == 0 || matchedindex == 0 ||
3031                 (!search->isOverlap &&
3032                     (offset < strsrch->pattern.defaultShiftSize ||
3033                     (matchedindex != USEARCH_DONE &&
3034                     matchedindex < strsrch->pattern.defaultShiftSize)))) {
3035                 // not enough characters to match
3036                 setMatchNotFound(strsrch);
3037                 return USEARCH_DONE;
3038             }
3039         }
3040
3041         if (U_SUCCESS(*status)) {
3042             if (strsrch->pattern.CELength == 0) {
3043                 search->matchedIndex =
3044                       (matchedindex == USEARCH_DONE ? offset : matchedindex);
3045                 if (search->matchedIndex == 0) {
3046                     setMatchNotFound(strsrch);
3047                     // status checked below
3048                 }
3049                 else { // move by codepoints
3050                     UTF_BACK_1(search->text, 0, search->matchedIndex);
3051                     setColEIterOffset(strsrch->textIter, search->matchedIndex);
3052                     // status checked below
3053                     search->matchedLength = 0;
3054                 }
3055             }
3056             else {
3057                 if (strsrch->search->isCanonicalMatch) {
3058                     // can't use exact here since extra accents are allowed.
3059                     usearch_handlePreviousCanonical(strsrch, status);
3060                     // status checked below
3061                 }
3062                 else {
3063                     usearch_handlePreviousExact(strsrch, status);
3064                     // status checked below
3065                 }
3066             }
3067
3068             if (U_FAILURE(*status)) {
3069                 return USEARCH_DONE;
3070             }
3071
3072             return search->matchedIndex;
3073         }
3074     }
3075     return USEARCH_DONE;
3076 }
3077
3078
3079
3080 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
3081 {
3082     /*
3083     reset is setting the attributes that are already in
3084     string search, hence all attributes in the collator should
3085     be retrieved without any problems
3086     */
3087     if (strsrch) {
3088         UErrorCode status            = U_ZERO_ERROR;
3089         UBool      sameCollAttribute = TRUE;
3090         uint32_t   ceMask;
3091         UBool      shift;
3092         uint32_t   varTop;
3093
3094         strsrch->strength    = ucol_getStrength(strsrch->collator);
3095         ceMask = getMask(strsrch->strength);
3096         if (strsrch->ceMask != ceMask) {
3097             strsrch->ceMask = ceMask;
3098             sameCollAttribute = FALSE;
3099         }
3100         // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3101         shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
3102                                   &status) == UCOL_SHIFTED;
3103         if (strsrch->toShift != shift) {
3104             strsrch->toShift  = shift;
3105             sameCollAttribute = FALSE;
3106         }
3107
3108         // if status is a failure, ucol_getVariableTop returns 0
3109         varTop = ucol_getVariableTop(strsrch->collator, &status);
3110         if (strsrch->variableTop != varTop) {
3111             strsrch->variableTop = varTop;
3112             sameCollAttribute    = FALSE;
3113         }
3114         if (!sameCollAttribute) {
3115             initialize(strsrch, &status);
3116         }
3117         uprv_init_collIterate(strsrch->collator, strsrch->search->text,
3118                               strsrch->search->textLength,
3119                               &(strsrch->textIter->iteratordata_));
3120         strsrch->search->matchedLength      = 0;
3121         strsrch->search->matchedIndex       = USEARCH_DONE;
3122         strsrch->search->isOverlap          = FALSE;
3123         strsrch->search->isCanonicalMatch   = FALSE;
3124         strsrch->search->isForwardSearching = TRUE;
3125         strsrch->search->reset              = TRUE;
3126     }
3127 }
3128
3129 // internal use methods declared in usrchimp.h -----------------------------
3130
3131 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
3132 {
3133     if (U_FAILURE(*status)) {
3134         setMatchNotFound(strsrch);
3135         return FALSE;
3136     }
3137
3138         UCollationElements *coleiter        = strsrch->textIter;
3139     int32_t             textlength      = strsrch->search->textLength;
3140     uint32_t           *patternce       = strsrch->pattern.CE;
3141     int32_t             patterncelength = strsrch->pattern.CELength;
3142     int32_t             textoffset      = ucol_getOffset(coleiter);
3143
3144         // status used in setting coleiter offset, since offset is checked in
3145         // shiftForward before setting the coleiter offset, status never
3146         // a failure
3147     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
3148                               patterncelength);
3149     while (textoffset <= textlength)
3150     {
3151         uint32_t    patternceindex = patterncelength - 1;
3152         uint32_t    targetce;
3153         UBool       found          = FALSE;
3154         uint32_t    lastce         = UCOL_NULLORDER;
3155
3156                 setColEIterOffset(coleiter, textoffset);
3157
3158         while (TRUE) {
3159             // finding the last pattern ce match, imagine composite characters
3160             // for example: search for pattern A in text \u00C0
3161             // we'll have to skip \u0300 the grave first before we get to A
3162             targetce = ucol_previous(coleiter, status);
3163             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3164                 found = FALSE;
3165                 break;
3166             }
3167             targetce = getCE(strsrch, targetce);
3168             if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
3169                 // this is for the text \u0315\u0300 that requires
3170                 // normalization and pattern \u0300, where \u0315 is ignorable
3171                 continue;
3172             }
3173             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
3174                 lastce = targetce;
3175             }
3176             if (targetce == patternce[patternceindex]) {
3177                 // the first ce can be a contraction
3178                 found = TRUE;
3179                 break;
3180             }
3181             if (!hasExpansion(coleiter)) {
3182                 found = FALSE;
3183                 break;
3184             }
3185         }
3186
3187         targetce = lastce;
3188
3189         while (found && patternceindex > 0) {
3190             targetce    = ucol_previous(coleiter, status);
3191             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3192                 found = FALSE;
3193                 break;
3194             }
3195             targetce    = getCE(strsrch, targetce);
3196             if (targetce == UCOL_IGNORABLE) {
3197                 continue;
3198             }
3199
3200             patternceindex --;
3201             found = found && targetce == patternce[patternceindex];
3202         }
3203
3204         if (!found) {
3205                         if (U_FAILURE(*status)) {
3206                                 break;
3207                         }
3208             textoffset = shiftForward(strsrch, textoffset, targetce,
3209                                       patternceindex);
3210             // status checked at loop.
3211             patternceindex = patterncelength;
3212             continue;
3213         }
3214
3215                 if (checkNextExactMatch(strsrch, &textoffset, status)) {
3216             // status checked in ucol_setOffset
3217             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
3218                         return TRUE;
3219         }
3220     }
3221     setMatchNotFound(strsrch);
3222         return FALSE;
3223 }
3224
3225 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
3226 {
3227     if (U_FAILURE(*status)) {
3228         setMatchNotFound(strsrch);
3229         return FALSE;
3230     }
3231
3232     UCollationElements *coleiter        = strsrch->textIter;
3233     int32_t             textlength      = strsrch->search->textLength;
3234     uint32_t           *patternce       = strsrch->pattern.CE;
3235     int32_t             patterncelength = strsrch->pattern.CELength;
3236     int32_t         textoffset      = ucol_getOffset(coleiter);
3237     UBool               hasPatternAccents =
3238        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
3239
3240     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
3241                               patterncelength);
3242     strsrch->canonicalPrefixAccents[0] = 0;
3243     strsrch->canonicalSuffixAccents[0] = 0;
3244
3245     while (textoffset <= textlength)
3246     {
3247         int32_t     patternceindex = patterncelength - 1;
3248         uint32_t    targetce;
3249         UBool       found          = FALSE;
3250         uint32_t    lastce         = UCOL_NULLORDER;
3251
3252                 setColEIterOffset(coleiter, textoffset);
3253
3254         while (TRUE) {
3255             // finding the last pattern ce match, imagine composite characters
3256             // for example: search for pattern A in text \u00C0
3257             // we'll have to skip \u0300 the grave first before we get to A
3258             targetce = ucol_previous(coleiter, status);
3259             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3260                 found = FALSE;
3261                 break;
3262             }
3263             targetce = getCE(strsrch, targetce);
3264             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
3265                 lastce = targetce;
3266             }
3267             if (targetce == patternce[patternceindex]) {
3268                 // the first ce can be a contraction
3269                 found = TRUE;
3270                 break;
3271             }
3272             if (!hasExpansion(coleiter)) {
3273                 found = FALSE;
3274                 break;
3275             }
3276         }
3277         targetce = lastce;
3278
3279         while (found && patternceindex > 0) {
3280             targetce    = ucol_previous(coleiter, status);
3281             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3282                 found = FALSE;
3283                 break;
3284             }
3285             targetce    = getCE(strsrch, targetce);
3286             if (targetce == UCOL_IGNORABLE) {
3287                 continue;
3288             }
3289
3290             patternceindex --;
3291             found = found && targetce == patternce[patternceindex];
3292         }
3293
3294         // initializing the rearranged accent array
3295         if (hasPatternAccents && !found) {
3296             strsrch->canonicalPrefixAccents[0] = 0;
3297             strsrch->canonicalSuffixAccents[0] = 0;
3298                         if (U_FAILURE(*status)) {
3299                                 break;
3300                         }
3301             found = doNextCanonicalMatch(strsrch, textoffset, status);
3302         }
3303
3304         if (!found) {
3305                         if (U_FAILURE(*status)) {
3306                                 break;
3307                         }
3308             textoffset = shiftForward(strsrch, textoffset, targetce,
3309                                       patternceindex);
3310             // status checked at loop
3311             patternceindex = patterncelength;
3312             continue;
3313         }
3314
3315         if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
3316             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
3317             return TRUE;
3318         }
3319     }
3320     setMatchNotFound(strsrch);
3321     return FALSE;
3322 }
3323
3324 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
3325 {
3326     if (U_FAILURE(*status)) {
3327         setMatchNotFound(strsrch);
3328         return FALSE;
3329     }
3330
3331     UCollationElements *coleiter        = strsrch->textIter;
3332     uint32_t           *patternce       = strsrch->pattern.CE;
3333     int32_t             patterncelength = strsrch->pattern.CELength;
3334     int32_t         textoffset      = ucol_getOffset(coleiter);
3335
3336     // shifting it check for setting offset
3337     // if setOffset is called previously or there was no previous match, we
3338     // leave the offset as it is.
3339     if (strsrch->search->matchedIndex != USEARCH_DONE) {
3340         textoffset = strsrch->search->matchedIndex;
3341     }
3342
3343     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
3344                               patterncelength);
3345
3346     while (textoffset >= 0)
3347     {
3348         int32_t     patternceindex = 1;
3349         uint32_t    targetce;
3350         UBool       found          = FALSE;
3351         uint32_t    firstce        = UCOL_NULLORDER;
3352
3353                 // if status is a failure, ucol_setOffset does nothing
3354         setColEIterOffset(coleiter, textoffset);
3355
3356         while (TRUE) {
3357             // finding the first pattern ce match, imagine composite
3358             // characters. for example: search for pattern \u0300 in text
3359             // \u00C0, we'll have to skip A first before we get to
3360             // \u0300 the grave accent
3361             targetce = ucol_next(coleiter, status);
3362             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3363                 found = FALSE;
3364                 break;
3365             }
3366             targetce = getCE(strsrch, targetce);
3367             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
3368                 firstce = targetce;
3369             }
3370             if (targetce == UCOL_IGNORABLE) {
3371                 continue;
3372             }
3373             if (targetce == patternce[0]) {
3374                 found = TRUE;
3375                 break;
3376             }
3377             if (!hasExpansion(coleiter)) {
3378                 // checking for accents in composite character
3379                 found = FALSE;
3380                 break;
3381             }
3382         }
3383
3384         targetce = firstce;
3385
3386         while (found && (patternceindex < patterncelength)) {
3387             targetce    = ucol_next(coleiter, status);
3388             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3389                 found = FALSE;
3390                 break;
3391             }
3392             targetce    = getCE(strsrch, targetce);
3393             if (targetce == UCOL_IGNORABLE) {
3394                 continue;
3395             }
3396
3397             found = found && targetce == patternce[patternceindex];
3398             patternceindex ++;
3399         }
3400
3401         if (!found) {
3402                         if (U_FAILURE(*status)) {
3403                                 break;
3404                         }
3405             textoffset = reverseShift(strsrch, textoffset, targetce,
3406                                       patternceindex);
3407             patternceindex = 0;
3408             continue;
3409         }
3410
3411         if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
3412             setColEIterOffset(coleiter, textoffset);
3413             return TRUE;
3414         }
3415     }
3416     setMatchNotFound(strsrch);
3417     return FALSE;
3418 }
3419
3420 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
3421                                       UErrorCode    *status)
3422 {
3423     if (U_FAILURE(*status)) {
3424         setMatchNotFound(strsrch);
3425         return FALSE;
3426     }
3427
3428     UCollationElements *coleiter        = strsrch->textIter;
3429     uint32_t           *patternce       = strsrch->pattern.CE;
3430     int32_t             patterncelength = strsrch->pattern.CELength;
3431     int32_t         textoffset      = ucol_getOffset(coleiter);
3432     UBool               hasPatternAccents =
3433        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
3434
3435     // shifting it check for setting offset
3436     // if setOffset is called previously or there was no previous match, we
3437     // leave the offset as it is.
3438     if (strsrch->search->matchedIndex != USEARCH_DONE) {
3439         textoffset = strsrch->search->matchedIndex;
3440     }
3441
3442     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
3443                               patterncelength);
3444     strsrch->canonicalPrefixAccents[0] = 0;
3445     strsrch->canonicalSuffixAccents[0] = 0;
3446
3447     while (textoffset >= 0)
3448     {
3449         int32_t     patternceindex = 1;
3450         uint32_t    targetce;
3451         UBool       found          = FALSE;
3452         uint32_t    firstce        = UCOL_NULLORDER;
3453
3454         setColEIterOffset(coleiter, textoffset);
3455         while (TRUE) {
3456             // finding the first pattern ce match, imagine composite
3457             // characters. for example: search for pattern \u0300 in text
3458             // \u00C0, we'll have to skip A first before we get to
3459             // \u0300 the grave accent
3460             targetce = ucol_next(coleiter, status);
3461             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3462                 found = FALSE;
3463                 break;
3464             }
3465             targetce = getCE(strsrch, targetce);
3466             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
3467                 firstce = targetce;
3468             }
3469
3470             if (targetce == patternce[0]) {
3471                 // the first ce can be a contraction
3472                 found = TRUE;
3473                 break;
3474             }
3475             if (!hasExpansion(coleiter)) {
3476                 // checking for accents in composite character
3477                 found = FALSE;
3478                 break;
3479             }
3480         }
3481
3482         targetce = firstce;
3483
3484         while (found && patternceindex < patterncelength) {
3485             targetce    = ucol_next(coleiter, status);
3486             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3487                 found = FALSE;
3488                 break;
3489             }
3490             targetce = getCE(strsrch, targetce);
3491             if (targetce == UCOL_IGNORABLE) {
3492                 continue;
3493             }
3494
3495             found = found && targetce == patternce[patternceindex];
3496             patternceindex ++;
3497         }
3498
3499         // initializing the rearranged accent array
3500         if (hasPatternAccents && !found) {
3501             strsrch->canonicalPrefixAccents[0] = 0;
3502             strsrch->canonicalSuffixAccents[0] = 0;
3503                         if (U_FAILURE(*status)) {
3504                 break;
3505             }
3506             found = doPreviousCanonicalMatch(strsrch, textoffset, status);
3507         }
3508
3509         if (!found) {
3510                         if (U_FAILURE(*status)) {
3511                 break;
3512             }
3513             textoffset = reverseShift(strsrch, textoffset, targetce,
3514                                       patternceindex);
3515             patternceindex = 0;
3516             continue;
3517         }
3518
3519         if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
3520             setColEIterOffset(coleiter, textoffset);
3521             return TRUE;
3522         }
3523     }
3524     setMatchNotFound(strsrch);
3525     return FALSE;
3526 }
3527
3528 #endif /* #if !UCONFIG_NO_COLLATION */