]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/usearch.cpp
ICU-551.51.4.tar.gz
[apple/icu.git] / icuSources / i18n / usearch.cpp
CommitLineData
b75a7d8f
A
1/*
2**********************************************************************
b331163b 3* Copyright (C) 2001-2015 IBM and others. All rights reserved.
b75a7d8f
A
4**********************************************************************
5* Date Name Description
6* 07/02/2001 synwee Creation.
7**********************************************************************
8*/
9
10#include "unicode/utypes.h"
11
46f4442e 12#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
b75a7d8f
A
13
14#include "unicode/usearch.h"
15#include "unicode/ustring.h"
16#include "unicode/uchar.h"
4388f060 17#include "unicode/utf16.h"
729e4ab9 18#include "normalizer2impl.h"
b75a7d8f
A
19#include "usrchimp.h"
20#include "cmemory.h"
374ca955 21#include "ucln_in.h"
46f4442e 22#include "uassert.h"
729e4ab9 23#include "ustr_imp.h"
46f4442e
A
24
25U_NAMESPACE_USE
26
27// don't use Boyer-Moore
729e4ab9 28// (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
46f4442e 29#define BOYER_MOORE 0
b75a7d8f
A
30
31// internal definition ---------------------------------------------------
32
33#define LAST_BYTE_MASK_ 0xFF
34#define SECOND_LAST_BYTE_SHIFT_ 8
35#define SUPPLEMENTARY_MIN_VALUE_ 0x10000
36
4388f060 37static const Normalizer2Impl *g_nfcImpl = NULL;
b75a7d8f
A
38
39// internal methods -------------------------------------------------
40
41/**
42* Fast collation element iterator setOffset.
43* This function does not check for bounds.
44* @param coleiter collation element iterator
45* @param offset to set
46*/
729e4ab9 47static
b75a7d8f
A
48inline void setColEIterOffset(UCollationElements *elems,
49 int32_t offset)
50{
57a6839d
A
51 // Note: Not "fast" any more after the 2013 collation rewrite.
52 // We do not want to expose more internals than necessary.
53 UErrorCode status = U_ZERO_ERROR;
54 ucol_setOffset(elems, offset, &status);
b75a7d8f
A
55}
56
57/**
58* Getting the mask for collation strength
59* @param strength collation strength
60* @return collation element mask
61*/
62static
729e4ab9 63inline uint32_t getMask(UCollationStrength strength)
b75a7d8f 64{
729e4ab9 65 switch (strength)
b75a7d8f
A
66 {
67 case UCOL_PRIMARY:
68 return UCOL_PRIMARYORDERMASK;
69 case UCOL_SECONDARY:
70 return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
71 default:
729e4ab9 72 return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
b75a7d8f
A
73 UCOL_PRIMARYORDERMASK;
74 }
75}
76
77/**
78* This is to squeeze the 21bit ces into a 256 table
79* @param ce collation element
80* @return collapsed version of the collation element
81*/
82static
729e4ab9 83inline int hash(uint32_t ce)
b75a7d8f
A
84{
85 // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
86 // well with the new collation where most of the latin 1 characters
87 // are of the value xx000xxx. their hashes will most of the time be 0
88 // to be discussed on the hash algo.
89 return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_;
90}
91
374ca955
A
92U_CDECL_BEGIN
93static UBool U_CALLCONV
94usearch_cleanup(void) {
4388f060 95 g_nfcImpl = NULL;
374ca955
A
96 return TRUE;
97}
98U_CDECL_END
99
b75a7d8f
A
100/**
101* Initializing the fcd tables.
102* Internal method, status assumed to be a success.
729e4ab9 103* @param status output error if any, caller to check status before calling
b75a7d8f
A
104* method, status assumed to be success when passed in.
105*/
106static
729e4ab9 107inline void initializeFCD(UErrorCode *status)
b75a7d8f 108{
4388f060
A
109 if (g_nfcImpl == NULL) {
110 g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
374ca955 111 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
b75a7d8f
A
112 }
113}
114
115/**
116* Gets the fcd value for a character at the argument index.
117* This method takes into accounts of the supplementary characters.
118* @param str UTF16 string where character for fcd retrieval resides
729e4ab9
A
119* @param offset position of the character whose fcd is to be retrieved, to be
120* overwritten with the next character position, taking
b75a7d8f
A
121* surrogate characters into consideration.
122* @param strlength length of the argument string
123* @return fcd value
124*/
125static
729e4ab9 126uint16_t getFCD(const UChar *str, int32_t *offset,
b75a7d8f
A
127 int32_t strlength)
128{
729e4ab9 129 const UChar *temp = str + *offset;
4388f060 130 uint16_t result = g_nfcImpl->nextFCD16(temp, str + strlength);
729e4ab9 131 *offset = (int32_t)(temp - str);
b75a7d8f
A
132 return result;
133}
134
135/**
729e4ab9 136* Getting the modified collation elements taking into account the collation
b75a7d8f
A
137* attributes
138* @param strsrch string search data
729e4ab9 139* @param sourcece
b75a7d8f
A
140* @return the modified collation element
141*/
142static
374ca955 143inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
b75a7d8f
A
144{
145 // note for tertiary we can't use the collator->tertiaryMask, that
146 // is a preprocessed mask that takes into account case options. since
147 // we are only concerned with exact matches, we don't need that.
148 sourcece &= strsrch->ceMask;
729e4ab9 149
b75a7d8f
A
150 if (strsrch->toShift) {
151 // alternate handling here, since only the 16 most significant digits
152 // is only used, we can safely do a compare without masking
153 // if the ce is a variable, we mask and get only the primary values
154 // no shifting to quartenary is required since all primary values
155 // less than variabletop will need to be masked off anyway.
156 if (strsrch->variableTop > sourcece) {
729e4ab9 157 if (strsrch->strength >= UCOL_QUATERNARY) {
b75a7d8f
A
158 sourcece &= UCOL_PRIMARYORDERMASK;
159 }
729e4ab9 160 else {
b75a7d8f
A
161 sourcece = UCOL_IGNORABLE;
162 }
163 }
729e4ab9
A
164 } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
165 sourcece = 0xFFFF;
b75a7d8f
A
166 }
167
168 return sourcece;
169}
170
729e4ab9 171/**
b75a7d8f
A
172* Allocate a memory and returns NULL if it failed.
173* Internal method, status assumed to be a success.
174* @param size to allocate
729e4ab9 175* @param status output error if any, caller to check status before calling
b75a7d8f
A
176* method, status assumed to be success when passed in.
177* @return newly allocated array, NULL otherwise
178*/
179static
729e4ab9 180inline void * allocateMemory(uint32_t size, UErrorCode *status)
b75a7d8f
A
181{
182 uint32_t *result = (uint32_t *)uprv_malloc(size);
183 if (result == NULL) {
184 *status = U_MEMORY_ALLOCATION_ERROR;
185 }
186 return result;
187}
188
189/**
190* Adds a uint32_t value to a destination array.
729e4ab9 191* Creates a new array if we run out of space. The caller will have to
b75a7d8f 192* manually deallocate the newly allocated array.
729e4ab9
A
193* Internal method, status assumed to be success, caller has to check status
194* before calling this method. destination not to be NULL and has at least
b75a7d8f
A
195* size destinationlength.
196* @param destination target array
197* @param offset destination offset to add value
198* @param destinationlength target array size, return value for the new size
199* @param value to be added
200* @param increments incremental size expected
729e4ab9 201* @param status output error if any, caller to check status before calling
b75a7d8f
A
202* method, status assumed to be success when passed in.
203* @return new destination array, destination if there was no new allocation
204*/
205static
729e4ab9
A
206inline int32_t * addTouint32_tArray(int32_t *destination,
207 uint32_t offset,
208 uint32_t *destinationlength,
374ca955 209 uint32_t value,
729e4ab9
A
210 uint32_t increments,
211 UErrorCode *status)
b75a7d8f
A
212{
213 uint32_t newlength = *destinationlength;
214 if (offset + 1 == newlength) {
215 newlength += increments;
374ca955
A
216 int32_t *temp = (int32_t *)allocateMemory(
217 sizeof(int32_t) * newlength, status);
b75a7d8f
A
218 if (U_FAILURE(*status)) {
219 return NULL;
220 }
374ca955 221 uprv_memcpy(temp, destination, sizeof(int32_t) * offset);
b75a7d8f
A
222 *destinationlength = newlength;
223 destination = temp;
224 }
225 destination[offset] = value;
226 return destination;
227}
228
46f4442e
A
229/**
230* Adds a uint64_t value to a destination array.
729e4ab9 231* Creates a new array if we run out of space. The caller will have to
46f4442e 232* manually deallocate the newly allocated array.
729e4ab9
A
233* Internal method, status assumed to be success, caller has to check status
234* before calling this method. destination not to be NULL and has at least
46f4442e
A
235* size destinationlength.
236* @param destination target array
237* @param offset destination offset to add value
238* @param destinationlength target array size, return value for the new size
239* @param value to be added
240* @param increments incremental size expected
729e4ab9 241* @param status output error if any, caller to check status before calling
46f4442e
A
242* method, status assumed to be success when passed in.
243* @return new destination array, destination if there was no new allocation
244*/
245static
729e4ab9
A
246inline int64_t * addTouint64_tArray(int64_t *destination,
247 uint32_t offset,
248 uint32_t *destinationlength,
46f4442e 249 uint64_t value,
729e4ab9
A
250 uint32_t increments,
251 UErrorCode *status)
46f4442e
A
252{
253 uint32_t newlength = *destinationlength;
254 if (offset + 1 == newlength) {
255 newlength += increments;
256 int64_t *temp = (int64_t *)allocateMemory(
257 sizeof(int64_t) * newlength, status);
729e4ab9 258
46f4442e
A
259 if (U_FAILURE(*status)) {
260 return NULL;
261 }
262
263 uprv_memcpy(temp, destination, sizeof(int64_t) * offset);
264 *destinationlength = newlength;
265 destination = temp;
266 }
267
268 destination[offset] = value;
269
270 return destination;
271}
272
b75a7d8f
A
273/**
274* Initializing the ce table for a pattern.
275* Stores non-ignorable collation keys.
729e4ab9
A
276* Table size will be estimated by the size of the pattern text. Table
277* expansion will be perform as we go along. Adding 1 to ensure that the table
b75a7d8f
A
278* size definitely increases.
279* Internal method, status assumed to be a success.
280* @param strsrch string search data
729e4ab9 281* @param status output error if any, caller to check status before calling
b75a7d8f 282* method, status assumed to be success when passed in.
729e4ab9 283* @return total number of expansions
b75a7d8f
A
284*/
285static
729e4ab9 286inline uint16_t initializePatternCETable(UStringSearch *strsrch,
b75a7d8f
A
287 UErrorCode *status)
288{
289 UPattern *pattern = &(strsrch->pattern);
290 uint32_t cetablesize = INITIAL_ARRAY_SIZE_;
b331163b 291 int32_t *cetable = pattern->cesBuffer;
b75a7d8f
A
292 uint32_t patternlength = pattern->textLength;
293 UCollationElements *coleiter = strsrch->utilIter;
729e4ab9 294
b75a7d8f 295 if (coleiter == NULL) {
729e4ab9 296 coleiter = ucol_openElements(strsrch->collator, pattern->text,
b75a7d8f 297 patternlength, status);
729e4ab9
A
298 // status will be checked in ucol_next(..) later and if it is an
299 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
b75a7d8f
A
300 // returned.
301 strsrch->utilIter = coleiter;
302 }
303 else {
57a6839d 304 ucol_setText(coleiter, pattern->text, pattern->textLength, status);
729e4ab9
A
305 }
306 if(U_FAILURE(*status)) {
307 return 0;
b75a7d8f 308 }
729e4ab9 309
b331163b
A
310 if (pattern->ces != cetable && pattern->ces) {
311 uprv_free(pattern->ces);
b75a7d8f 312 }
729e4ab9 313
b75a7d8f
A
314 uint16_t offset = 0;
315 uint16_t result = 0;
374ca955 316 int32_t ce;
b75a7d8f
A
317
318 while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
319 U_SUCCESS(*status)) {
320 uint32_t newce = getCE(strsrch, ce);
321 if (newce) {
729e4ab9 322 int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
b75a7d8f 323 newce,
729e4ab9 324 patternlength - ucol_getOffset(coleiter) + 1,
b75a7d8f
A
325 status);
326 if (U_FAILURE(*status)) {
327 return 0;
328 }
329 offset ++;
b331163b 330 if (cetable != temp && cetable != pattern->cesBuffer) {
b75a7d8f
A
331 uprv_free(cetable);
332 }
333 cetable = temp;
334 }
335 result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
336 }
337
338 cetable[offset] = 0;
b331163b
A
339 pattern->ces = cetable;
340 pattern->cesLength = offset;
b75a7d8f
A
341
342 return result;
343}
344
46f4442e
A
345/**
346* Initializing the pce table for a pattern.
347* Stores non-ignorable collation keys.
729e4ab9
A
348* Table size will be estimated by the size of the pattern text. Table
349* expansion will be perform as we go along. Adding 1 to ensure that the table
46f4442e
A
350* size definitely increases.
351* Internal method, status assumed to be a success.
352* @param strsrch string search data
729e4ab9 353* @param status output error if any, caller to check status before calling
46f4442e 354* method, status assumed to be success when passed in.
729e4ab9 355* @return total number of expansions
46f4442e
A
356*/
357static
729e4ab9 358inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
46f4442e
A
359 UErrorCode *status)
360{
361 UPattern *pattern = &(strsrch->pattern);
362 uint32_t pcetablesize = INITIAL_ARRAY_SIZE_;
b331163b 363 int64_t *pcetable = pattern->pcesBuffer;
46f4442e
A
364 uint32_t patternlength = pattern->textLength;
365 UCollationElements *coleiter = strsrch->utilIter;
729e4ab9 366
46f4442e 367 if (coleiter == NULL) {
729e4ab9 368 coleiter = ucol_openElements(strsrch->collator, pattern->text,
46f4442e 369 patternlength, status);
729e4ab9
A
370 // status will be checked in ucol_next(..) later and if it is an
371 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
46f4442e
A
372 // returned.
373 strsrch->utilIter = coleiter;
374 } else {
57a6839d 375 ucol_setText(coleiter, pattern->text, pattern->textLength, status);
729e4ab9
A
376 }
377 if(U_FAILURE(*status)) {
378 return 0;
46f4442e 379 }
729e4ab9 380
b331163b
A
381 if (pattern->pces != pcetable && pattern->pces != NULL) {
382 uprv_free(pattern->pces);
46f4442e 383 }
729e4ab9 384
46f4442e
A
385 uint16_t offset = 0;
386 uint16_t result = 0;
387 int64_t pce;
388
57a6839d 389 icu::UCollationPCE iter(coleiter);
46f4442e
A
390
391 // ** Should processed CEs be signed or unsigned?
729e4ab9 392 // ** (the rest of the code in this file seems to play fast-and-loose with
46f4442e 393 // ** whether a CE is signed or unsigned. For example, look at routine above this one.)
57a6839d 394 while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
46f4442e 395 U_SUCCESS(*status)) {
729e4ab9 396 int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
46f4442e 397 pce,
729e4ab9 398 patternlength - ucol_getOffset(coleiter) + 1,
46f4442e
A
399 status);
400
401 if (U_FAILURE(*status)) {
402 return 0;
403 }
404
405 offset += 1;
406
b331163b 407 if (pcetable != temp && pcetable != pattern->pcesBuffer) {
46f4442e
A
408 uprv_free(pcetable);
409 }
410
411 pcetable = temp;
412 //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
413 }
414
415 pcetable[offset] = 0;
b331163b
A
416 pattern->pces = pcetable;
417 pattern->pcesLength = offset;
46f4442e
A
418
419 return result;
420}
421
b75a7d8f
A
422/**
423* Initializes the pattern struct.
424* Internal method, status assumed to be success.
425* @param strsrch UStringSearch data storage
729e4ab9 426* @param status output error if any, caller to check status before calling
b75a7d8f
A
427* method, status assumed to be success when passed in.
428* @return expansionsize the total expansion size of the pattern
729e4ab9 429*/
b75a7d8f 430static
729e4ab9 431inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
b75a7d8f 432{
57a6839d 433 if (U_FAILURE(*status)) { return 0; }
b75a7d8f
A
434 UPattern *pattern = &(strsrch->pattern);
435 const UChar *patterntext = pattern->text;
436 int32_t length = pattern->textLength;
437 int32_t index = 0;
729e4ab9 438
46f4442e
A
439 // Since the strength is primary, accents are ignored in the pattern.
440 if (strsrch->strength == UCOL_PRIMARY) {
4388f060
A
441 pattern->hasPrefixAccents = 0;
442 pattern->hasSuffixAccents = 0;
46f4442e 443 } else {
4388f060
A
444 pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
445 SECOND_LAST_BYTE_SHIFT_;
446 index = length;
447 U16_BACK_1(patterntext, 0, index);
448 pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
449 LAST_BYTE_MASK_;
46f4442e
A
450 }
451
452 // ** HACK **
b331163b
A
453 if (strsrch->pattern.pces != NULL) {
454 if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
455 uprv_free(strsrch->pattern.pces);
46f4442e
A
456 }
457
b331163b 458 strsrch->pattern.pces = NULL;
46f4442e 459 }
b75a7d8f 460
b75a7d8f 461 // since intializePattern is an internal method status is a success.
729e4ab9 462 return initializePatternCETable(strsrch, status);
b75a7d8f
A
463}
464
465/**
466* Initializing shift tables, with the default values.
467* If a corresponding default value is 0, the shift table is not set.
729e4ab9 468* @param shift table for forwards shift
b75a7d8f
A
469* @param backshift table for backwards shift
470* @param cetable table containing pattern ce
471* @param cesize size of the pattern ces
472* @param expansionsize total size of the expansions
473* @param defaultforward the default forward value
474* @param defaultbackward the default backward value
475*/
476static
729e4ab9
A
477inline void setShiftTable(int16_t shift[], int16_t backshift[],
478 int32_t *cetable, int32_t cesize,
b75a7d8f
A
479 int16_t expansionsize,
480 int16_t defaultforward,
481 int16_t defaultbackward)
482{
729e4ab9 483 // estimate the value to shift. to do that we estimate the smallest
b75a7d8f 484 // number of characters to give the relevant ces, ie approximately
729e4ab9 485 // the number of ces minus their expansion, since expansions can come
b75a7d8f
A
486 // from a character.
487 int32_t count;
488 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
489 shift[count] = defaultforward;
490 }
491 cesize --; // down to the last index
492 for (count = 0; count < cesize; count ++) {
493 // number of ces from right of array to the count
494 int temp = defaultforward - count - 1;
495 shift[hash(cetable[count])] = temp > 1 ? temp : 1;
496 }
497 shift[hash(cetable[cesize])] = 1;
498 // for ignorables we just shift by one. see test examples.
499 shift[hash(0)] = 1;
729e4ab9 500
b75a7d8f
A
501 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
502 backshift[count] = defaultbackward;
503 }
504 for (count = cesize; count > 0; count --) {
505 // the original value count does not seem to work
729e4ab9 506 backshift[hash(cetable[count])] = count > expansionsize ?
b75a7d8f
A
507 (int16_t)(count - expansionsize) : 1;
508 }
509 backshift[hash(cetable[0])] = 1;
510 backshift[hash(0)] = 1;
511}
512
513/**
514* Building of the pattern collation element list and the boyer moore strsrch
515* table.
516* The canonical match will only be performed after the default match fails.
517* For both cases we need to remember the size of the composed and decomposed
518* versions of the string. Since the Boyer-Moore shift calculations shifts by
729e4ab9
A
519* a number of characters in the text and tries to match the pattern from that
520* offset, the shift value can not be too large in case we miss some
521* characters. To choose a right shift size, we estimate the NFC form of the
522* and use its size as a shift guide. The NFC form should be the small
b75a7d8f
A
523* possible representation of the pattern. Anyways, we'll err on the smaller
524* shift size. Hence the calculation for minlength.
729e4ab9
A
525* Canonical match will be performed slightly differently. We'll split the
526* pattern into 3 parts, the prefix accents (PA), the middle string bounded by
527* the first and last base character (MS), the ending accents (EA). Matches
b75a7d8f
A
528* will be done on MS first, and only when we match MS then some processing
529* will be required for the prefix and end accents in order to determine if
729e4ab9
A
530* they match PA and EA. Hence the default shift values
531* for the canonical match will take the size of either end's accent into
b75a7d8f
A
532* consideration. Forwards search will take the end accents into consideration
533* for the default shift values and the backwards search will take the prefix
534* accents into consideration.
535* If pattern has no non-ignorable ce, we return a illegal argument error.
536* Internal method, status assumed to be success.
537* @param strsrch UStringSearch data storage
538* @param status for output errors if it occurs, status is assumed to be a
539* success when it is passed in.
729e4ab9 540*/
b75a7d8f 541static
729e4ab9 542inline void initialize(UStringSearch *strsrch, UErrorCode *status)
b75a7d8f 543{
729e4ab9 544 int16_t expandlength = initializePattern(strsrch, status);
b331163b 545 if (U_SUCCESS(*status) && strsrch->pattern.cesLength > 0) {
b75a7d8f 546 UPattern *pattern = &strsrch->pattern;
b331163b 547 int32_t cesize = pattern->cesLength;
b75a7d8f 548
729e4ab9 549 int16_t minlength = cesize > expandlength
374ca955 550 ? (int16_t)cesize - expandlength : 1;
b75a7d8f 551 pattern->defaultShiftSize = minlength;
b331163b 552 setShiftTable(pattern->shift, pattern->backShift, pattern->ces,
b75a7d8f
A
553 cesize, expandlength, minlength, minlength);
554 return;
555 }
556 strsrch->pattern.defaultShiftSize = 0;
557}
558
46f4442e
A
559#if BOYER_MOORE
560/**
729e4ab9 561* Check to make sure that the match length is at the end of the character by
46f4442e 562* using the breakiterator.
729e4ab9 563* @param strsrch string search data
46f4442e
A
564* @param start target text start offset
565* @param end target text end offset
566*/
567static
729e4ab9 568void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
46f4442e
A
569 int32_t *end)
570{
571#if !UCONFIG_NO_BREAK_ITERATION
572 UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
573 if (breakiterator) {
4388f060
A
574 int32_t matchend = *end;
575 //int32_t matchstart = *start;
729e4ab9 576
4388f060
A
577 if (!ubrk_isBoundary(breakiterator, matchend)) {
578 *end = ubrk_following(breakiterator, matchend);
46f4442e 579 }
729e4ab9 580
4388f060
A
581 /* Check the start of the matched text to make sure it doesn't have any accents
582 * before it. This code may not be necessary and so it is commented out */
583 /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
584 *start = ubrk_preceding(breakiterator, matchstart);
585 }*/
46f4442e
A
586 }
587#endif
588}
589
b75a7d8f 590/**
729e4ab9
A
591* Determine whether the target text in UStringSearch bounded by the offset
592* start and end is one or more whole units of text as
b75a7d8f 593* determined by the breakiterator in UStringSearch.
729e4ab9 594* @param strsrch string search data
b75a7d8f
A
595* @param start target text start offset
596* @param end target text end offset
597*/
598static
729e4ab9 599UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
b75a7d8f
A
600 int32_t end)
601{
602#if !UCONFIG_NO_BREAK_ITERATION
603 UBreakIterator *breakiterator = strsrch->search->breakIter;
46f4442e 604 //TODO: Add here.
b75a7d8f
A
605 if (breakiterator) {
606 int32_t startindex = ubrk_first(breakiterator);
607 int32_t endindex = ubrk_last(breakiterator);
729e4ab9 608
b75a7d8f
A
609 // out-of-range indexes are never boundary positions
610 if (start < startindex || start > endindex ||
611 end < startindex || end > endindex) {
612 return FALSE;
613 }
729e4ab9
A
614 // otherwise, we can use following() on the position before the
615 // specified one and return true of the position we get back is the
b75a7d8f 616 // one the user specified
729e4ab9
A
617 UBool result = (start == startindex ||
618 ubrk_following(breakiterator, start - 1) == start) &&
619 (end == endindex ||
b75a7d8f
A
620 ubrk_following(breakiterator, end - 1) == end);
621 if (result) {
622 // iterates the individual ces
623 UCollationElements *coleiter = strsrch->utilIter;
729e4ab9 624 const UChar *text = strsrch->search->text +
b75a7d8f
A
625 start;
626 UErrorCode status = U_ZERO_ERROR;
627 ucol_setText(coleiter, text, end - start, &status);
b331163b 628 for (int32_t count = 0; count < strsrch->pattern.cesLength;
b75a7d8f 629 count ++) {
374ca955 630 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
b75a7d8f
A
631 if (ce == UCOL_IGNORABLE) {
632 count --;
633 continue;
634 }
b331163b 635 if (U_FAILURE(status) || ce != strsrch->pattern.ces[count]) {
b75a7d8f
A
636 return FALSE;
637 }
638 }
374ca955 639 int32_t nextce = ucol_next(coleiter, &status);
b75a7d8f
A
640 while (ucol_getOffset(coleiter) == (end - start)
641 && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
642 nextce = ucol_next(coleiter, &status);
643 }
644 if (ucol_getOffset(coleiter) == (end - start)
645 && nextce != UCOL_NULLORDER) {
646 // extra collation elements at the end of the match
647 return FALSE;
648 }
649 }
650 return result;
651 }
652#endif
653 return TRUE;
654}
655
656/**
729e4ab9
A
657* Getting the next base character offset if current offset is an accent,
658* or the current offset if the current character contains a base character.
b75a7d8f
A
659* accents the following base character will be returned
660* @param text string
661* @param textoffset current offset
662* @param textlength length of text string
663* @return the next base character or the current offset
664* if the current character is contains a base character.
665*/
666static
729e4ab9 667inline int32_t getNextBaseOffset(const UChar *text,
b75a7d8f
A
668 int32_t textoffset,
669 int32_t textlength)
670{
671 if (textoffset < textlength) {
672 int32_t temp = textoffset;
673 if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
729e4ab9 674 while (temp < textlength) {
b75a7d8f 675 int32_t result = temp;
729e4ab9 676 if ((getFCD(text, &temp, textlength) >>
b75a7d8f
A
677 SECOND_LAST_BYTE_SHIFT_) == 0) {
678 return result;
679 }
680 }
681 return textlength;
682 }
683 }
684 return textoffset;
685}
686
687/**
688* Gets the next base character offset depending on the string search pattern
689* data
690* @param strsrch string search data
691* @param textoffset current offset, one offset away from the last character
692* to search for.
693* @return start index of the next base character or the current offset
694* if the current character is contains a base character.
695*/
696static
729e4ab9 697inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
b75a7d8f
A
698 int32_t textoffset)
699{
374ca955 700 int32_t textlength = strsrch->search->textLength;
729e4ab9 701 if (strsrch->pattern.hasSuffixAccents &&
b75a7d8f
A
702 textoffset < textlength) {
703 int32_t temp = textoffset;
704 const UChar *text = strsrch->search->text;
4388f060 705 U16_BACK_1(text, 0, temp);
b75a7d8f
A
706 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
707 return getNextBaseOffset(text, textoffset, textlength);
708 }
709 }
710 return textoffset;
711}
712
713/**
714* Shifting the collation element iterator position forward to prepare for
715* a following match. If the last character is a unsafe character, we'll only
716* shift by 1 to capture contractions, normalization etc.
717* Internal method, status assumed to be success.
718* @param text strsrch string search data
719* @param textoffset start text position to do search
720* @param ce the text ce which failed the match.
721* @param patternceindex index of the ce within the pattern ce buffer which
722* failed the match
723* @return final offset
724*/
725static
726inline int32_t shiftForward(UStringSearch *strsrch,
727 int32_t textoffset,
374ca955 728 int32_t ce,
b75a7d8f
A
729 int32_t patternceindex)
730{
374ca955 731 UPattern *pattern = &(strsrch->pattern);
b75a7d8f
A
732 if (ce != UCOL_NULLORDER) {
733 int32_t shift = pattern->shift[hash(ce)];
729e4ab9 734 // this is to adjust for characters in the middle of the
b75a7d8f 735 // substring for matching that failed.
b331163b 736 int32_t adjust = pattern->cesLength - patternceindex;
b75a7d8f
A
737 if (adjust > 1 && shift >= adjust) {
738 shift -= adjust - 1;
739 }
740 textoffset += shift;
741 }
742 else {
743 textoffset += pattern->defaultShiftSize;
744 }
729e4ab9 745
b75a7d8f
A
746 textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
747 // check for unsafe characters
729e4ab9 748 // * if it is the start or middle of a contraction: to be done after
b75a7d8f
A
749 // a initial match is found
750 // * thai or lao base consonant character: similar to contraction
751 // * high surrogate character: similar to contraction
752 // * next character is a accent: shift to the next base character
753 return textoffset;
754}
46f4442e 755#endif // #if BOYER_MOORE
b75a7d8f
A
756
757/**
729e4ab9 758* sets match not found
b75a7d8f
A
759* @param strsrch string search data
760*/
761static
729e4ab9 762inline void setMatchNotFound(UStringSearch *strsrch)
b75a7d8f
A
763{
764 // this method resets the match result regardless of the error status.
765 strsrch->search->matchedIndex = USEARCH_DONE;
766 strsrch->search->matchedLength = 0;
767 if (strsrch->search->isForwardSearching) {
768 setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
769 }
770 else {
771 setColEIterOffset(strsrch->textIter, 0);
772 }
773}
774
46f4442e 775#if BOYER_MOORE
b75a7d8f
A
776/**
777* Gets the offset to the next safe point in text.
778* ie. not the middle of a contraction, swappable characters or supplementary
779* characters.
780* @param collator collation sata
781* @param text string to work with
782* @param textoffset offset in string
783* @param textlength length of text string
784* @return offset to the next safe character
785*/
786static
729e4ab9 787inline int32_t getNextSafeOffset(const UCollator *collator,
b75a7d8f
A
788 const UChar *text,
789 int32_t textoffset,
790 int32_t textlength)
791{
792 int32_t result = textoffset; // first contraction character
793 while (result != textlength && ucol_unsafeCP(text[result], collator)) {
794 result ++;
795 }
729e4ab9 796 return result;
b75a7d8f
A
797}
798
729e4ab9 799/**
b75a7d8f
A
800* This checks for accents in the potential match started with a .
801* composite character.
729e4ab9
A
802* This is really painful... we have to check that composite character do not
803* have any extra accents. We have to normalize the potential match and find
b75a7d8f 804* the immediate decomposed character before the match.
729e4ab9 805* The first composite character would have been taken care of by the fcd
b75a7d8f 806* checks in checkForwardExactMatch.
729e4ab9
A
807* This is the slow path after the fcd of the first character and
808* the last character has been checked by checkForwardExactMatch and we
b75a7d8f
A
809* determine that the potential match has extra non-ignorable preceding
810* ces.
729e4ab9 811* E.g. looking for \u0301 acute in \u01FA A ring above and acute,
b75a7d8f
A
812* checkExtraMatchAccent should fail since there is a middle ring in \u01FA
813* Note here that accents checking are slow and cautioned in the API docs.
814* Internal method, status assumed to be a success, caller should check status
815* before calling this method
816* @param strsrch string search data
817* @param start index of the potential unfriendly composite character
818* @param end index of the potential unfriendly composite character
819* @param status output error status if any.
820* @return TRUE if there is non-ignorable accents before at the beginning
821* of the match, FALSE otherwise.
822*/
823
824static
825UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
729e4ab9 826 int32_t end,
b75a7d8f
A
827 UErrorCode *status)
828{
829 UBool result = FALSE;
830 if (strsrch->pattern.hasPrefixAccents) {
831 int32_t length = end - start;
832 int32_t offset = 0;
833 const UChar *text = strsrch->search->text + start;
729e4ab9 834
4388f060 835 U16_FWD_1(text, offset, length);
b75a7d8f
A
836 // we are only concerned with the first composite character
837 if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
729e4ab9 838 int32_t safeoffset = getNextSafeOffset(strsrch->collator,
b75a7d8f
A
839 text, 0, length);
840 if (safeoffset != length) {
841 safeoffset ++;
842 }
843 UChar *norm = NULL;
844 UChar buffer[INITIAL_ARRAY_SIZE_];
729e4ab9
A
845 int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
846 buffer, INITIAL_ARRAY_SIZE_,
847 status);
b75a7d8f
A
848 if (U_FAILURE(*status)) {
849 return FALSE;
850 }
851 if (size >= INITIAL_ARRAY_SIZE_) {
852 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
853 status);
729e4ab9 854 // if allocation failed, status will be set to
b75a7d8f
A
855 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
856 // checks for it.
729e4ab9 857 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
b75a7d8f
A
858 size, status);
859 if (U_FAILURE(*status) && norm != NULL) {
860 uprv_free(norm);
861 return FALSE;
862 }
863 }
864 else {
865 norm = buffer;
866 }
867
868 UCollationElements *coleiter = strsrch->utilIter;
869 ucol_setText(coleiter, norm, size, status);
b331163b 870 uint32_t firstce = strsrch->pattern.ces[0];
b75a7d8f
A
871 UBool ignorable = TRUE;
872 uint32_t ce = UCOL_IGNORABLE;
46f4442e 873 while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
b75a7d8f
A
874 offset = ucol_getOffset(coleiter);
875 if (ce != firstce && ce != UCOL_IGNORABLE) {
876 ignorable = FALSE;
877 }
878 ce = ucol_next(coleiter, status);
879 }
880 UChar32 codepoint;
4388f060 881 U16_PREV(norm, 0, offset, codepoint);
b75a7d8f
A
882 result = !ignorable && (u_getCombiningClass(codepoint) != 0);
883
884 if (norm != buffer) {
885 uprv_free(norm);
886 }
887 }
888 }
889
890 return result;
891}
892
893/**
729e4ab9 894* Used by exact matches, checks if there are accents before the match.
b75a7d8f 895* This is really painful... we have to check that composite characters at
729e4ab9
A
896* the start of the matches have to not have any extra accents.
897* We check the FCD of the character first, if it starts with an accent and
b75a7d8f 898* the first pattern ce does not match the first ce of the character, we bail.
729e4ab9
A
899* Otherwise we try normalizing the first composite
900* character and find the immediate decomposed character before the match to
b75a7d8f 901* see if it is an non-ignorable accent.
729e4ab9
A
902* Now normalizing the first composite character is enough because we ensure
903* that when the match is passed in here with extra beginning ces, the
b75a7d8f 904* first or last ce that match has to occur within the first character.
729e4ab9 905* E.g. looking for \u0301 acute in \u01FA A ring above and acute,
b75a7d8f
A
906* checkExtraMatchAccent should fail since there is a middle ring in \u01FA
907* Note here that accents checking are slow and cautioned in the API docs.
908* @param strsrch string search data
729e4ab9 909* @param start offset
b75a7d8f 910* @param end offset
729e4ab9 911* @return TRUE if there are accents on either side of the match,
b75a7d8f
A
912* FALSE otherwise
913*/
914static
915UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
729e4ab9 916 int32_t end)
b75a7d8f
A
917{
918 if (strsrch->pattern.hasPrefixAccents) {
919 UCollationElements *coleiter = strsrch->textIter;
920 UErrorCode status = U_ZERO_ERROR;
921 // we have been iterating forwards previously
922 uint32_t ignorable = TRUE;
b331163b 923 int32_t firstce = strsrch->pattern.ces[0];
b75a7d8f 924
374ca955
A
925 setColEIterOffset(coleiter, start);
926 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
927 if (U_FAILURE(status)) {
b75a7d8f
A
928 return TRUE;
929 }
930 while (ce != firstce) {
931 if (ce != UCOL_IGNORABLE) {
932 ignorable = FALSE;
933 }
934 ce = getCE(strsrch, ucol_next(coleiter, &status));
46f4442e 935 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
b75a7d8f
A
936 return TRUE;
937 }
938 }
374ca955 939 if (!ignorable && inNormBuf(coleiter)) {
b75a7d8f 940 // within normalization buffer, discontiguous handled here
374ca955 941 return TRUE;
b75a7d8f
A
942 }
943
374ca955 944 // within text
b75a7d8f 945 int32_t temp = start;
374ca955 946 // original code
729e4ab9
A
947 // accent = (getFCD(strsrch->search->text, &temp,
948 // strsrch->search->textLength)
949 // >> SECOND_LAST_BYTE_SHIFT_);
374ca955 950 // however this code does not work well with VC7 .net in release mode.
729e4ab9 951 // maybe the inlines for getFCD combined with shifting has bugs in
374ca955 952 // VC7. anyways this is a work around.
729e4ab9 953 UBool accent = getFCD(strsrch->search->text, &temp,
b75a7d8f
A
954 strsrch->search->textLength) > 0xFF;
955 if (!accent) {
374ca955 956 return checkExtraMatchAccents(strsrch, start, end, &status);
b75a7d8f 957 }
374ca955 958 if (!ignorable) {
b75a7d8f
A
959 return TRUE;
960 }
961 if (start > 0) {
962 temp = start;
4388f060 963 U16_BACK_1(strsrch->search->text, 0, temp);
729e4ab9 964 if (getFCD(strsrch->search->text, &temp,
b75a7d8f
A
965 strsrch->search->textLength) & LAST_BYTE_MASK_) {
966 setColEIterOffset(coleiter, start);
967 ce = ucol_previous(coleiter, &status);
729e4ab9 968 if (U_FAILURE(status) ||
b75a7d8f
A
969 (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
970 return TRUE;
971 }
972 }
973 }
974 }
729e4ab9 975
b75a7d8f
A
976 return FALSE;
977}
978
979/**
980* Used by exact matches, checks if there are accents bounding the match.
981* Note this is the initial boundary check. If the potential match
982* starts or ends with composite characters, the accents in those
983* characters will be determined later.
729e4ab9 984* Not doing backwards iteration here, since discontiguos contraction for
b75a7d8f 985* backwards collation element iterator, use up too many characters.
729e4ab9 986* E.g. looking for \u030A ring in \u01FA A ring above and acute,
b75a7d8f
A
987* should fail since there is a acute at the end of \u01FA
988* Note here that accents checking are slow and cautioned in the API docs.
989* @param strsrch string search data
990* @param start offset of match
991* @param end end offset of the match
729e4ab9 992* @return TRUE if there are accents on either side of the match,
b75a7d8f
A
993* FALSE otherwise
994*/
995static
729e4ab9
A
996UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
997 int32_t end)
b75a7d8f
A
998{
999 if (strsrch->pattern.hasSuffixAccents) {
1000 const UChar *text = strsrch->search->text;
1001 int32_t temp = end;
1002 int32_t textlength = strsrch->search->textLength;
4388f060 1003 U16_BACK_1(text, 0, temp);
b75a7d8f 1004 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
b331163b 1005 int32_t firstce = strsrch->pattern.ces[0];
b75a7d8f
A
1006 UCollationElements *coleiter = strsrch->textIter;
1007 UErrorCode status = U_ZERO_ERROR;
4388f060 1008 int32_t ce;
b75a7d8f 1009 setColEIterOffset(coleiter, start);
46f4442e
A
1010 while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
1011 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
b75a7d8f
A
1012 return TRUE;
1013 }
1014 }
1015 int32_t count = 1;
b331163b 1016 while (count < strsrch->pattern.cesLength) {
729e4ab9 1017 if (getCE(strsrch, ucol_next(coleiter, &status))
b75a7d8f
A
1018 == UCOL_IGNORABLE) {
1019 // Thai can give an ignorable here.
1020 count --;
1021 }
1022 if (U_FAILURE(status)) {
1023 return TRUE;
1024 }
1025 count ++;
1026 }
729e4ab9 1027
4388f060 1028 ce = ucol_next(coleiter, &status);
b75a7d8f
A
1029 if (U_FAILURE(status)) {
1030 return TRUE;
1031 }
46f4442e 1032 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
4388f060 1033 ce = getCE(strsrch, ce);
46f4442e 1034 }
b75a7d8f
A
1035 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1036 if (ucol_getOffset(coleiter) <= end) {
1037 return TRUE;
1038 }
1039 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
1040 return TRUE;
1041 }
1042 }
1043 }
1044 }
1045 return FALSE;
1046}
46f4442e 1047#endif // #if BOYER_MOORE
b75a7d8f
A
1048
1049/**
1050* Checks if the offset runs out of the text string
729e4ab9 1051* @param offset
b75a7d8f
A
1052* @param textlength of the text string
1053* @return TRUE if offset is out of bounds, FALSE otherwise
1054*/
1055static
1056inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
1057{
1058 return offset < 0 || offset > textlength;
1059}
1060
1061/**
1062* Checks for identical match
1063* @param strsrch string search data
1064* @param start offset of possible match
1065* @param end offset of possible match
1066* @return TRUE if identical match is found
1067*/
1068static
729e4ab9
A
1069inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
1070 int32_t end)
b75a7d8f 1071{
b75a7d8f
A
1072 if (strsrch->strength != UCOL_IDENTICAL) {
1073 return TRUE;
1074 }
1075
729e4ab9
A
1076 // Note: We could use Normalizer::compare() or similar, but for short strings
1077 // which may not be in FCD it might be faster to just NFD them.
1078 UErrorCode status = U_ZERO_ERROR;
1079 UnicodeString t2, p2;
1080 strsrch->nfd->normalize(
1081 UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
1082 strsrch->nfd->normalize(
1083 UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
73c04bcf 1084 // return FALSE if NFD failed
729e4ab9 1085 return U_SUCCESS(status) && t2 == p2;
b75a7d8f
A
1086}
1087
729e4ab9 1088#if BOYER_MOORE
b75a7d8f
A
1089/**
1090* Checks to see if the match is repeated
1091* @param strsrch string search data
1092* @param start new match start index
1093* @param end new match end index
1094* @return TRUE if the the match is repeated, FALSE otherwise
1095*/
1096static
1097inline UBool checkRepeatedMatch(UStringSearch *strsrch,
1098 int32_t start,
1099 int32_t end)
1100{
1101 int32_t lastmatchindex = strsrch->search->matchedIndex;
1102 UBool result;
1103 if (lastmatchindex == USEARCH_DONE) {
1104 return FALSE;
1105 }
1106 if (strsrch->search->isForwardSearching) {
1107 result = start <= lastmatchindex;
1108 }
1109 else {
1110 result = start >= lastmatchindex;
1111 }
374ca955 1112 if (!result && !strsrch->search->isOverlap) {
b75a7d8f
A
1113 if (strsrch->search->isForwardSearching) {
1114 result = start < lastmatchindex + strsrch->search->matchedLength;
1115 }
1116 else {
1117 result = end > lastmatchindex;
1118 }
1119 }
1120 return result;
1121}
1122
1123/**
1124* Gets the collation element iterator's current offset.
1125* @param coleiter collation element iterator
1126* @param forwards flag TRUE if we are moving in th forwards direction
729e4ab9 1127* @return current offset
b75a7d8f
A
1128*/
1129static
1130inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
1131 UBool forwards)
1132{
1133 int32_t result = ucol_getOffset(coleiter);
1134 // intricacies of the the backwards collation element iterator
46f4442e 1135 if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
b75a7d8f
A
1136 result ++;
1137 }
1138 return result;
1139}
1140
1141/**
729e4ab9 1142* Checks match for contraction.
b75a7d8f
A
1143* If the match ends with a partial contraction we fail.
1144* If the match starts too far off (because of backwards iteration) we try to
1145* chip off the extra characters depending on whether a breakiterator has
1146* been used.
729e4ab9 1147* Internal method, error assumed to be success, caller has to check status
b75a7d8f
A
1148* before calling this method.
1149* @param strsrch string search data
1150* @param start offset of potential match, to be modified if necessary
1151* @param end offset of potential match, to be modified if necessary
1152* @param status output error status if any
1153* @return TRUE if match passes the contraction test, FALSE otherwise
1154*/
1155
1156static
729e4ab9
A
1157UBool checkNextExactContractionMatch(UStringSearch *strsrch,
1158 int32_t *start,
1159 int32_t *end, UErrorCode *status)
b75a7d8f
A
1160{
1161 UCollationElements *coleiter = strsrch->textIter;
1162 int32_t textlength = strsrch->search->textLength;
46f4442e 1163 int32_t temp = *start;
b75a7d8f
A
1164 const UCollator *collator = strsrch->collator;
1165 const UChar *text = strsrch->search->text;
729e4ab9 1166 // This part checks if either ends of the match contains potential
b75a7d8f 1167 // contraction. If so we'll have to iterate through them
374ca955
A
1168 // The start contraction needs to be checked since ucol_previous dumps
1169 // all characters till the first safe character into the buffer.
729e4ab9
A
1170 // *start + 1 is used to test for the unsafe characters instead of *start
1171 // because ucol_prev takes all unsafe characters till the first safe
1172 // character ie *start. so by testing *start + 1, we can estimate if
1173 // excess prefix characters has been included in the potential search
374ca955 1174 // results.
729e4ab9
A
1175 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1176 (*start + 1 < textlength
b75a7d8f
A
1177 && ucol_unsafeCP(text[*start + 1], collator))) {
1178 int32_t expansion = getExpansionPrefix(coleiter);
1179 UBool expandflag = expansion > 0;
1180 setColEIterOffset(coleiter, *start);
1181 while (expansion > 0) {
1182 // getting rid of the redundant ce, caused by setOffset.
729e4ab9
A
1183 // since backward contraction/expansion may have extra ces if we
1184 // are in the normalization buffer, hasAccentsBeforeMatch would
b75a7d8f
A
1185 // have taken care of it.
1186 // E.g. the character \u01FA will have an expansion of 3, but if
1187 // we are only looking for acute and ring \u030A and \u0301, we'll
1188 // have to skip the first ce in the expansion buffer.
1189 ucol_next(coleiter, status);
374ca955
A
1190 if (U_FAILURE(*status)) {
1191 return FALSE;
1192 }
b75a7d8f
A
1193 if (ucol_getOffset(coleiter) != temp) {
1194 *start = temp;
1195 temp = ucol_getOffset(coleiter);
1196 }
1197 expansion --;
1198 }
1199
b331163b
A
1200 int32_t *patternce = strsrch->pattern.ces;
1201 int32_t patterncelength = strsrch->pattern.cesLength;
b75a7d8f
A
1202 int32_t count = 0;
1203 while (count < patterncelength) {
374ca955 1204 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
b75a7d8f
A
1205 if (ce == UCOL_IGNORABLE) {
1206 continue;
1207 }
1208 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1209 *start = temp;
1210 temp = ucol_getOffset(coleiter);
1211 }
1212 if (U_FAILURE(*status) || ce != patternce[count]) {
1213 (*end) ++;
729e4ab9 1214 *end = getNextUStringSearchBaseOffset(strsrch, *end);
b75a7d8f
A
1215 return FALSE;
1216 }
1217 count ++;
1218 }
729e4ab9 1219 }
b75a7d8f
A
1220 return TRUE;
1221}
1222
1223/**
1224* Checks and sets the match information if found.
729e4ab9 1225* Checks
b75a7d8f
A
1226* <ul>
1227* <li> the potential match does not repeat the previous match
1228* <li> boundaries are correct
1229* <li> exact matches has no extra accents
1230* <li> identical matchesb
1231* <li> potential match does not end in the middle of a contraction
1232* <\ul>
1233* Otherwise the offset will be shifted to the next character.
729e4ab9 1234* Internal method, status assumed to be success, caller has to check status
b75a7d8f
A
1235* before calling this method.
1236* @param strsrch string search data
1237* @param textoffset offset in the collation element text. the returned value
729e4ab9 1238* will be the truncated end offset of the match or the new start
b75a7d8f
A
1239* search offset.
1240* @param status output error status if any
1241* @return TRUE if the match is valid, FALSE otherwise
1242*/
1243static
729e4ab9 1244inline UBool checkNextExactMatch(UStringSearch *strsrch,
b75a7d8f
A
1245 int32_t *textoffset, UErrorCode *status)
1246{
1247 UCollationElements *coleiter = strsrch->textIter;
729e4ab9
A
1248 int32_t start = getColElemIterOffset(coleiter, FALSE);
1249
374ca955
A
1250 if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
1251 return FALSE;
b75a7d8f
A
1252 }
1253
1254 // this totally matches, however we need to check if it is repeating
1255 if (!isBreakUnit(strsrch, start, *textoffset) ||
729e4ab9
A
1256 checkRepeatedMatch(strsrch, start, *textoffset) ||
1257 hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
b75a7d8f
A
1258 !checkIdentical(strsrch, start, *textoffset) ||
1259 hasAccentsAfterMatch(strsrch, start, *textoffset)) {
374ca955
A
1260
1261 (*textoffset) ++;
729e4ab9 1262 *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
374ca955 1263 return FALSE;
b75a7d8f 1264 }
46f4442e
A
1265
1266 //Add breakiterator boundary check for primary strength search.
1267 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
4388f060 1268 checkBreakBoundary(strsrch, &start, textoffset);
46f4442e 1269 }
729e4ab9 1270
b75a7d8f
A
1271 // totally match, we will get rid of the ending ignorables.
1272 strsrch->search->matchedIndex = start;
1273 strsrch->search->matchedLength = *textoffset - start;
374ca955 1274 return TRUE;
b75a7d8f
A
1275}
1276
1277/**
729e4ab9 1278* Getting the previous base character offset, or the current offset if the
b75a7d8f
A
1279* current character is a base character
1280* @param text string
1281* @param textoffset one offset after the current character
729e4ab9 1282* @return the offset of the next character after the base character or the first
b75a7d8f
A
1283* composed character with accents
1284*/
1285static
729e4ab9 1286inline int32_t getPreviousBaseOffset(const UChar *text,
b75a7d8f
A
1287 int32_t textoffset)
1288{
1289 if (textoffset > 0) {
46f4442e 1290 for (;;) {
b75a7d8f 1291 int32_t result = textoffset;
4388f060 1292 U16_BACK_1(text, 0, textoffset);
b75a7d8f
A
1293 int32_t temp = textoffset;
1294 uint16_t fcd = getFCD(text, &temp, result);
1295 if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1296 if (fcd & LAST_BYTE_MASK_) {
1297 return textoffset;
1298 }
1299 return result;
1300 }
1301 if (textoffset == 0) {
1302 return 0;
1303 }
1304 }
1305 }
1306 return textoffset;
1307}
1308
1309/**
1310* Getting the indexes of the accents that are not blocked in the argument
1311* accent array
1312* @param accents array of accents in nfd terminated by a 0.
1313* @param accentsindex array of indexes of the accents that are not blocked
1314*/
1315static
1316inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
1317{
1318 int32_t index = 0;
1319 int32_t length = u_strlen(accents);
1320 UChar32 codepoint = 0;
1321 int cclass = 0;
1322 int result = 0;
1323 int32_t temp;
1324 while (index < length) {
1325 temp = index;
4388f060 1326 U16_NEXT(accents, index, length, codepoint);
b75a7d8f
A
1327 if (u_getCombiningClass(codepoint) != cclass) {
1328 cclass = u_getCombiningClass(codepoint);
1329 accentsindex[result] = temp;
1330 result ++;
1331 }
1332 }
1333 accentsindex[result] = length;
1334 return result;
1335}
1336
1337/**
1338* Appends 3 UChar arrays to a destination array.
729e4ab9 1339* Creates a new array if we run out of space. The caller will have to
b75a7d8f 1340* manually deallocate the newly allocated array.
729e4ab9
A
1341* Internal method, status assumed to be success, caller has to check status
1342* before calling this method. destination not to be NULL and has at least
b75a7d8f
A
1343* size destinationlength.
1344* @param destination target array
1345* @param destinationlength target array size, returning the appended length
1346* @param source1 null-terminated first array
1347* @param source2 second array
1348* @param source2length length of seond array
1349* @param source3 null-terminated third array
1350* @param status error status if any
1351* @return new destination array, destination if there was no new allocation
1352*/
1353static
729e4ab9
A
1354inline UChar * addToUCharArray( UChar *destination,
1355 int32_t *destinationlength,
1356 const UChar *source1,
b75a7d8f 1357 const UChar *source2,
729e4ab9
A
1358 int32_t source2length,
1359 const UChar *source3,
1360 UErrorCode *status)
b75a7d8f
A
1361{
1362 int32_t source1length = source1 ? u_strlen(source1) : 0;
729e4ab9
A
1363 int32_t source3length = source3 ? u_strlen(source3) : 0;
1364 if (*destinationlength < source1length + source2length + source3length +
1365 1)
b75a7d8f
A
1366 {
1367 destination = (UChar *)allocateMemory(
1368 (source1length + source2length + source3length + 1) * sizeof(UChar),
1369 status);
729e4ab9 1370 // if error allocating memory, status will be
b75a7d8f
A
1371 // U_MEMORY_ALLOCATION_ERROR
1372 if (U_FAILURE(*status)) {
1373 *destinationlength = 0;
1374 return NULL;
1375 }
1376 }
1377 if (source1length != 0) {
1378 uprv_memcpy(destination, source1, sizeof(UChar) * source1length);
1379 }
1380 if (source2length != 0) {
729e4ab9 1381 uprv_memcpy(destination + source1length, source2,
b75a7d8f
A
1382 sizeof(UChar) * source2length);
1383 }
1384 if (source3length != 0) {
729e4ab9 1385 uprv_memcpy(destination + source1length + source2length, source3,
b75a7d8f
A
1386 sizeof(UChar) * source3length);
1387 }
1388 *destinationlength = source1length + source2length + source3length;
1389 return destination;
1390}
1391
1392/**
1393* Running through a collation element iterator to see if the contents matches
1394* pattern in string search data
1395* @param strsrch string search data
1396* @param coleiter collation element iterator
1397* @return TRUE if a match if found, FALSE otherwise
1398*/
1399static
729e4ab9 1400inline UBool checkCollationMatch(const UStringSearch *strsrch,
b75a7d8f
A
1401 UCollationElements *coleiter)
1402{
b331163b
A
1403 int patternceindex = strsrch->pattern.cesLength;
1404 int32_t *patternce = strsrch->pattern.ces;
b75a7d8f
A
1405 UErrorCode status = U_ZERO_ERROR;
1406 while (patternceindex > 0) {
374ca955 1407 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
b75a7d8f
A
1408 if (ce == UCOL_IGNORABLE) {
1409 continue;
1410 }
1411 if (U_FAILURE(status) || ce != *patternce) {
1412 return FALSE;
1413 }
1414 patternce ++;
1415 patternceindex --;
1416 }
1417 return TRUE;
1418}
1419
1420/**
1421* Rearranges the front accents to try matching.
729e4ab9
A
1422* Prefix accents in the text will be grouped according to their combining
1423* class and the groups will be mixed and matched to try find the perfect
b75a7d8f
A
1424* match with the pattern.
1425* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1426* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
729e4ab9 1427* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
b75a7d8f
A
1428* "\u0301\u0325".
1429* step 2: check if any of the generated substrings matches the pattern.
1430* Internal method, status is assumed to be success, caller has to check status
1431* before calling this method.
1432* @param strsrch string search match
1433* @param start first offset of the accents to start searching
1434* @param end start of the last accent set
1435* @param status output error status if any
1436* @return USEARCH_DONE if a match is not found, otherwise return the starting
1437* offset of the match. Note this start includes all preceding accents.
1438*/
1439static
729e4ab9 1440int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
b75a7d8f 1441 int32_t start,
729e4ab9 1442 int32_t end,
b75a7d8f
A
1443 UErrorCode *status)
1444{
1445 const UChar *text = strsrch->search->text;
1446 int32_t textlength = strsrch->search->textLength;
1447 int32_t tempstart = start;
1448
1449 if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
1450 // die... failed at a base character
1451 return USEARCH_DONE;
1452 }
1453
1454 int32_t offset = getNextBaseOffset(text, tempstart, textlength);
1455 start = getPreviousBaseOffset(text, tempstart);
1456
1457 UChar accents[INITIAL_ARRAY_SIZE_];
1458 // normalizing the offensive string
729e4ab9
A
1459 unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
1460 INITIAL_ARRAY_SIZE_, status);
b75a7d8f
A
1461 if (U_FAILURE(*status)) {
1462 return USEARCH_DONE;
1463 }
729e4ab9
A
1464
1465 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1466 int32_t accentsize = getUnblockedAccentIndex(accents,
b75a7d8f 1467 accentsindex);
729e4ab9 1468 int32_t count = (2 << (accentsize - 1)) - 1;
b75a7d8f
A
1469 UChar buffer[INITIAL_ARRAY_SIZE_];
1470 UCollationElements *coleiter = strsrch->utilIter;
1471 while (U_SUCCESS(*status) && count > 0) {
1472 UChar *rearrange = strsrch->canonicalPrefixAccents;
1473 // copy the base characters
1474 for (int k = 0; k < accentsindex[0]; k ++) {
1475 *rearrange ++ = accents[k];
1476 }
1477 // forming all possible canonical rearrangement by dropping
1478 // sets of accents
1479 for (int i = 0; i <= accentsize - 1; i ++) {
1480 int32_t mask = 1 << (accentsize - i - 1);
1481 if (count & mask) {
1482 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1483 *rearrange ++ = accents[j];
1484 }
1485 }
1486 }
1487 *rearrange = 0;
1488 int32_t matchsize = INITIAL_ARRAY_SIZE_;
1489 UChar *match = addToUCharArray(buffer, &matchsize,
1490 strsrch->canonicalPrefixAccents,
1491 strsrch->search->text + offset,
1492 end - offset,
1493 strsrch->canonicalSuffixAccents,
1494 status);
729e4ab9 1495
b75a7d8f
A
1496 // if status is a failure, ucol_setText does nothing.
1497 // run the collator iterator through this match
1498 ucol_setText(coleiter, match, matchsize, status);
1499 if (U_SUCCESS(*status)) {
1500 if (checkCollationMatch(strsrch, coleiter)) {
1501 if (match != buffer) {
1502 uprv_free(match);
1503 }
1504 return start;
1505 }
1506 }
1507 count --;
1508 }
1509 return USEARCH_DONE;
1510}
1511
1512/**
1513* Gets the offset to the safe point in text before textoffset.
1514* ie. not the middle of a contraction, swappable characters or supplementary
1515* characters.
1516* @param collator collation sata
1517* @param text string to work with
1518* @param textoffset offset in string
1519* @param textlength length of text string
1520* @return offset to the previous safe character
1521*/
1522static
729e4ab9 1523inline uint32_t getPreviousSafeOffset(const UCollator *collator,
b75a7d8f
A
1524 const UChar *text,
1525 int32_t textoffset)
1526{
1527 int32_t result = textoffset; // first contraction character
1528 while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
1529 result --;
1530 }
1531 if (result != 0) {
1532 // the first contraction character is consider unsafe here
1533 result --;
1534 }
729e4ab9 1535 return result;
b75a7d8f
A
1536}
1537
1538/**
1539* Cleaning up after we passed the safe zone
1540* @param strsrch string search data
1541* @param safetext safe text array
1542* @param safebuffer safe text buffer
1543* @param coleiter collation element iterator for safe text
1544*/
1545static
1546inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
1547 UChar *safebuffer)
1548{
729e4ab9 1549 if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
b75a7d8f
A
1550 {
1551 uprv_free(safetext);
1552 }
1553}
1554
1555/**
1556* Take the rearranged end accents and tries matching. If match failed at
1557* a seperate preceding set of accents (seperated from the rearranged on by
729e4ab9 1558* at least a base character) then we rearrange the preceding accents and
b75a7d8f 1559* tries matching again.
729e4ab9 1560* We allow skipping of the ends of the accent set if the ces do not match.
b75a7d8f
A
1561* However if the failure is found before the accent set, it fails.
1562* Internal method, status assumed to be success, caller has to check status
1563* before calling this method.
1564* @param strsrch string search data
1565* @param textoffset of the start of the rearranged accent
1566* @param status output error status if any
1567* @return USEARCH_DONE if a match is not found, otherwise return the starting
1568* offset of the match. Note this start includes all preceding accents.
1569*/
1570static
729e4ab9 1571int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
b75a7d8f
A
1572 int32_t textoffset,
1573 UErrorCode *status)
1574{
1575 const UChar *text = strsrch->search->text;
1576 const UCollator *collator = strsrch->collator;
1577 int32_t safelength = 0;
1578 UChar *safetext;
1579 int32_t safetextlength;
1580 UChar safebuffer[INITIAL_ARRAY_SIZE_];
1581 UCollationElements *coleiter = strsrch->utilIter;
1582 int32_t safeoffset = textoffset;
1583
729e4ab9 1584 if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
b75a7d8f
A
1585 collator)) {
1586 safeoffset = getPreviousSafeOffset(collator, text, textoffset);
1587 safelength = textoffset - safeoffset;
1588 safetextlength = INITIAL_ARRAY_SIZE_;
729e4ab9
A
1589 safetext = addToUCharArray(safebuffer, &safetextlength, NULL,
1590 text + safeoffset, safelength,
1591 strsrch->canonicalSuffixAccents,
b75a7d8f
A
1592 status);
1593 }
1594 else {
1595 safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
1596 safetext = strsrch->canonicalSuffixAccents;
1597 }
1598
1599 // if status is a failure, ucol_setText does nothing
1600 ucol_setText(coleiter, safetext, safetextlength, status);
1601 // status checked in loop below
1602
b331163b
A
1603 int32_t *ce = strsrch->pattern.ces;
1604 int32_t celength = strsrch->pattern.cesLength;
b75a7d8f
A
1605 int ceindex = celength - 1;
1606 UBool isSafe = TRUE; // indication flag for position in safe zone
729e4ab9 1607
b75a7d8f 1608 while (ceindex >= 0) {
374ca955 1609 int32_t textce = ucol_previous(coleiter, status);
b75a7d8f
A
1610 if (U_FAILURE(*status)) {
1611 if (isSafe) {
1612 cleanUpSafeText(strsrch, safetext, safebuffer);
1613 }
1614 return USEARCH_DONE;
1615 }
1616 if (textce == UCOL_NULLORDER) {
1617 // check if we have passed the safe buffer
1618 if (coleiter == strsrch->textIter) {
1619 cleanUpSafeText(strsrch, safetext, safebuffer);
1620 return USEARCH_DONE;
1621 }
1622 cleanUpSafeText(strsrch, safetext, safebuffer);
1623 safetext = safebuffer;
1624 coleiter = strsrch->textIter;
1625 setColEIterOffset(coleiter, safeoffset);
1626 // status checked at the start of the loop
1627 isSafe = FALSE;
1628 continue;
1629 }
1630 textce = getCE(strsrch, textce);
1631 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
1632 // do the beginning stuff
1633 int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
1634 if (isSafe && failedoffset >= safelength) {
1635 // alas... no hope. failed at rearranged accent set
1636 cleanUpSafeText(strsrch, safetext, safebuffer);
1637 return USEARCH_DONE;
1638 }
1639 else {
1640 if (isSafe) {
1641 failedoffset += safeoffset;
1642 cleanUpSafeText(strsrch, safetext, safebuffer);
1643 }
729e4ab9 1644
b75a7d8f 1645 // try rearranging the front accents
729e4ab9 1646 int32_t result = doNextCanonicalPrefixMatch(strsrch,
b75a7d8f
A
1647 failedoffset, textoffset, status);
1648 if (result != USEARCH_DONE) {
1649 // if status is a failure, ucol_setOffset does nothing
1650 setColEIterOffset(strsrch->textIter, result);
1651 }
1652 if (U_FAILURE(*status)) {
1653 return USEARCH_DONE;
1654 }
1655 return result;
1656 }
1657 }
1658 if (textce == ce[ceindex]) {
1659 ceindex --;
1660 }
1661 }
1662 // set offset here
1663 if (isSafe) {
1664 int32_t result = getColElemIterOffset(coleiter, FALSE);
1665 // sets the text iterator here with the correct expansion and offset
1666 int32_t leftoverces = getExpansionPrefix(coleiter);
1667 cleanUpSafeText(strsrch, safetext, safebuffer);
729e4ab9 1668 if (result >= safelength) {
b75a7d8f
A
1669 result = textoffset;
1670 }
1671 else {
1672 result += safeoffset;
1673 }
1674 setColEIterOffset(strsrch->textIter, result);
729e4ab9 1675 strsrch->textIter->iteratordata_.toReturn =
b75a7d8f
A
1676 setExpansionPrefix(strsrch->textIter, leftoverces);
1677 return result;
1678 }
729e4ab9
A
1679
1680 return ucol_getOffset(coleiter);
b75a7d8f
A
1681}
1682
1683/**
1684* Trying out the substring and sees if it can be a canonical match.
1685* This will try normalizing the end accents and arranging them into canonical
1686* equivalents and check their corresponding ces with the pattern ce.
729e4ab9
A
1687* Suffix accents in the text will be grouped according to their combining
1688* class and the groups will be mixed and matched to try find the perfect
b75a7d8f
A
1689* match with the pattern.
1690* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1691* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
729e4ab9 1692* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
b75a7d8f
A
1693* "\u0301\u0325".
1694* step 2: check if any of the generated substrings matches the pattern.
729e4ab9 1695* Internal method, status assumed to be success, caller has to check status
b75a7d8f
A
1696* before calling this method.
1697* @param strsrch string search data
729e4ab9 1698* @param textoffset end offset in the collation element text that ends with
b75a7d8f
A
1699* the accents to be rearranged
1700* @param status error status if any
1701* @return TRUE if the match is valid, FALSE otherwise
1702*/
1703static
729e4ab9
A
1704UBool doNextCanonicalMatch(UStringSearch *strsrch,
1705 int32_t textoffset,
b75a7d8f
A
1706 UErrorCode *status)
1707{
1708 const UChar *text = strsrch->search->text;
1709 int32_t temp = textoffset;
4388f060 1710 U16_BACK_1(text, 0, temp);
b75a7d8f
A
1711 if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
1712 UCollationElements *coleiter = strsrch->textIter;
1713 int32_t offset = getColElemIterOffset(coleiter, FALSE);
1714 if (strsrch->pattern.hasPrefixAccents) {
729e4ab9 1715 offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
b75a7d8f
A
1716 status);
1717 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
1718 setColEIterOffset(coleiter, offset);
1719 return TRUE;
1720 }
1721 }
1722 return FALSE;
1723 }
1724
1725 if (!strsrch->pattern.hasSuffixAccents) {
1726 return FALSE;
1727 }
1728
1729 UChar accents[INITIAL_ARRAY_SIZE_];
1730 // offset to the last base character in substring to search
1731 int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
1732 // normalizing the offensive string
729e4ab9
A
1733 unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
1734 0, accents, INITIAL_ARRAY_SIZE_, status);
b75a7d8f 1735 // status checked in loop below
729e4ab9 1736
b75a7d8f
A
1737 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1738 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
1739
374ca955
A
1740 // 2 power n - 1 plus the full set of accents
1741 int32_t count = (2 << (size - 1)) - 1;
b75a7d8f
A
1742 while (U_SUCCESS(*status) && count > 0) {
1743 UChar *rearrange = strsrch->canonicalSuffixAccents;
1744 // copy the base characters
1745 for (int k = 0; k < accentsindex[0]; k ++) {
1746 *rearrange ++ = accents[k];
1747 }
1748 // forming all possible canonical rearrangement by dropping
1749 // sets of accents
1750 for (int i = 0; i <= size - 1; i ++) {
1751 int32_t mask = 1 << (size - i - 1);
1752 if (count & mask) {
1753 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1754 *rearrange ++ = accents[j];
1755 }
1756 }
1757 }
1758 *rearrange = 0;
729e4ab9 1759 int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
b75a7d8f
A
1760 status);
1761 if (offset != USEARCH_DONE) {
1762 return TRUE; // match found
1763 }
1764 count --;
1765 }
1766 return FALSE;
1767}
1768
1769/**
729e4ab9 1770* Gets the previous base character offset depending on the string search
b75a7d8f
A
1771* pattern data
1772* @param strsrch string search data
1773* @param textoffset current offset, current character
1774* @return the offset of the next character after this base character or itself
1775* if it is a composed character with accents
1776*/
1777static
729e4ab9 1778inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
b75a7d8f
A
1779 int32_t textoffset)
1780{
1781 if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
1782 const UChar *text = strsrch->search->text;
1783 int32_t offset = textoffset;
729e4ab9 1784 if (getFCD(text, &offset, strsrch->search->textLength) >>
b75a7d8f
A
1785 SECOND_LAST_BYTE_SHIFT_) {
1786 return getPreviousBaseOffset(text, textoffset);
1787 }
1788 }
1789 return textoffset;
1790}
1791
1792/**
729e4ab9 1793* Checks match for contraction.
b75a7d8f
A
1794* If the match ends with a partial contraction we fail.
1795* If the match starts too far off (because of backwards iteration) we try to
1796* chip off the extra characters
729e4ab9 1797* Internal method, status assumed to be success, caller has to check status
b75a7d8f
A
1798* before calling this method.
1799* @param strsrch string search data
1800* @param start offset of potential match, to be modified if necessary
1801* @param end offset of potential match, to be modified if necessary
1802* @param status output error status if any
1803* @return TRUE if match passes the contraction test, FALSE otherwise
1804*/
1805static
729e4ab9
A
1806UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
1807 int32_t *start,
1808 int32_t *end,
1809 UErrorCode *status)
b75a7d8f
A
1810{
1811 UCollationElements *coleiter = strsrch->textIter;
1812 int32_t textlength = strsrch->search->textLength;
1813 int32_t temp = *start;
1814 const UCollator *collator = strsrch->collator;
1815 const UChar *text = strsrch->search->text;
729e4ab9 1816 // This part checks if either ends of the match contains potential
b75a7d8f 1817 // contraction. If so we'll have to iterate through them
729e4ab9
A
1818 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1819 (*start + 1 < textlength
b75a7d8f
A
1820 && ucol_unsafeCP(text[*start + 1], collator))) {
1821 int32_t expansion = getExpansionPrefix(coleiter);
1822 UBool expandflag = expansion > 0;
1823 setColEIterOffset(coleiter, *start);
1824 while (expansion > 0) {
1825 // getting rid of the redundant ce, caused by setOffset.
729e4ab9
A
1826 // since backward contraction/expansion may have extra ces if we
1827 // are in the normalization buffer, hasAccentsBeforeMatch would
b75a7d8f
A
1828 // have taken care of it.
1829 // E.g. the character \u01FA will have an expansion of 3, but if
1830 // we are only looking for acute and ring \u030A and \u0301, we'll
1831 // have to skip the first ce in the expansion buffer.
1832 ucol_next(coleiter, status);
374ca955
A
1833 if (U_FAILURE(*status)) {
1834 return FALSE;
1835 }
b75a7d8f
A
1836 if (ucol_getOffset(coleiter) != temp) {
1837 *start = temp;
1838 temp = ucol_getOffset(coleiter);
1839 }
1840 expansion --;
1841 }
1842
b331163b
A
1843 int32_t *patternce = strsrch->pattern.ces;
1844 int32_t patterncelength = strsrch->pattern.cesLength;
b75a7d8f
A
1845 int32_t count = 0;
1846 int32_t textlength = strsrch->search->textLength;
1847 while (count < patterncelength) {
374ca955 1848 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
b75a7d8f
A
1849 // status checked below, note that if status is a failure
1850 // ucol_next returns UCOL_NULLORDER
1851 if (ce == UCOL_IGNORABLE) {
1852 continue;
1853 }
1854 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1855 *start = temp;
1856 temp = ucol_getOffset(coleiter);
1857 }
1858
1859 if (count == 0 && ce != patternce[0]) {
729e4ab9 1860 // accents may have extra starting ces, this occurs when a
b75a7d8f
A
1861 // pure accent pattern is matched without rearrangement
1862 // text \u0325\u0300 and looking for \u0300
729e4ab9 1863 int32_t expected = patternce[0];
b75a7d8f
A
1864 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
1865 ce = getCE(strsrch, ucol_next(coleiter, status));
729e4ab9 1866 while (U_SUCCESS(*status) && ce != expected &&
b75a7d8f
A
1867 ce != UCOL_NULLORDER &&
1868 ucol_getOffset(coleiter) <= *end) {
1869 ce = getCE(strsrch, ucol_next(coleiter, status));
1870 }
1871 }
1872 }
1873 if (U_FAILURE(*status) || ce != patternce[count]) {
1874 (*end) ++;
729e4ab9 1875 *end = getNextUStringSearchBaseOffset(strsrch, *end);
b75a7d8f
A
1876 return FALSE;
1877 }
1878 count ++;
1879 }
729e4ab9 1880 }
b75a7d8f
A
1881 return TRUE;
1882}
1883
1884/**
1885* Checks and sets the match information if found.
729e4ab9 1886* Checks
b75a7d8f
A
1887* <ul>
1888* <li> the potential match does not repeat the previous match
1889* <li> boundaries are correct
1890* <li> potential match does not end in the middle of a contraction
1891* <li> identical matches
1892* <\ul>
1893* Otherwise the offset will be shifted to the next character.
729e4ab9 1894* Internal method, status assumed to be success, caller has to check the
b75a7d8f
A
1895* status before calling this method.
1896* @param strsrch string search data
1897* @param textoffset offset in the collation element text. the returned value
729e4ab9 1898* will be the truncated end offset of the match or the new start
b75a7d8f
A
1899* search offset.
1900* @param status output error status if any
1901* @return TRUE if the match is valid, FALSE otherwise
1902*/
1903static
729e4ab9
A
1904inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
1905 int32_t *textoffset,
b75a7d8f
A
1906 UErrorCode *status)
1907{
1908 // to ensure that the start and ends are not composite characters
1909 UCollationElements *coleiter = strsrch->textIter;
1910 // if we have a canonical accent match
729e4ab9
A
1911 if ((strsrch->pattern.hasSuffixAccents &&
1912 strsrch->canonicalSuffixAccents[0]) ||
1913 (strsrch->pattern.hasPrefixAccents &&
b75a7d8f
A
1914 strsrch->canonicalPrefixAccents[0])) {
1915 strsrch->search->matchedIndex = getPreviousUStringSearchBaseOffset(
1916 strsrch,
1917 ucol_getOffset(coleiter));
729e4ab9 1918 strsrch->search->matchedLength = *textoffset -
b75a7d8f
A
1919 strsrch->search->matchedIndex;
1920 return TRUE;
1921 }
1922
1923 int32_t start = getColElemIterOffset(coleiter, FALSE);
729e4ab9 1924 if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
b75a7d8f
A
1925 status) || U_FAILURE(*status)) {
1926 return FALSE;
1927 }
729e4ab9 1928
b75a7d8f
A
1929 start = getPreviousUStringSearchBaseOffset(strsrch, start);
1930 // this totally matches, however we need to check if it is repeating
729e4ab9
A
1931 if (checkRepeatedMatch(strsrch, start, *textoffset) ||
1932 !isBreakUnit(strsrch, start, *textoffset) ||
b75a7d8f
A
1933 !checkIdentical(strsrch, start, *textoffset)) {
1934 (*textoffset) ++;
729e4ab9 1935 *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
b75a7d8f
A
1936 strsrch->search->textLength);
1937 return FALSE;
1938 }
729e4ab9 1939
b75a7d8f
A
1940 strsrch->search->matchedIndex = start;
1941 strsrch->search->matchedLength = *textoffset - start;
1942 return TRUE;
1943}
1944
1945/**
1946* Shifting the collation element iterator position forward to prepare for
1947* a preceding match. If the first character is a unsafe character, we'll only
1948* shift by 1 to capture contractions, normalization etc.
729e4ab9 1949* Internal method, status assumed to be success, caller has to check status
b75a7d8f
A
1950* before calling this method.
1951* @param text strsrch string search data
1952* @param textoffset start text position to do search
1953* @param ce the text ce which failed the match.
1954* @param patternceindex index of the ce within the pattern ce buffer which
1955* failed the match
1956* @return final offset
1957*/
1958static
1959inline int32_t reverseShift(UStringSearch *strsrch,
1960 int32_t textoffset,
374ca955 1961 int32_t ce,
b75a7d8f 1962 int32_t patternceindex)
729e4ab9 1963{
b75a7d8f
A
1964 if (strsrch->search->isOverlap) {
1965 if (textoffset != strsrch->search->textLength) {
1966 textoffset --;
1967 }
1968 else {
1969 textoffset -= strsrch->pattern.defaultShiftSize;
1970 }
1971 }
1972 else {
1973 if (ce != UCOL_NULLORDER) {
1974 int32_t shift = strsrch->pattern.backShift[hash(ce)];
729e4ab9
A
1975
1976 // this is to adjust for characters in the middle of the substring
b75a7d8f
A
1977 // for matching that failed.
1978 int32_t adjust = patternceindex;
1979 if (adjust > 1 && shift > adjust) {
1980 shift -= adjust - 1;
1981 }
1982 textoffset -= shift;
1983 }
1984 else {
1985 textoffset -= strsrch->pattern.defaultShiftSize;
1986 }
729e4ab9 1987 }
b75a7d8f
A
1988 textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
1989 return textoffset;
1990}
1991
1992/**
729e4ab9 1993* Checks match for contraction.
b75a7d8f 1994* If the match starts with a partial contraction we fail.
729e4ab9 1995* Internal method, status assumed to be success, caller has to check status
b75a7d8f
A
1996* before calling this method.
1997* @param strsrch string search data
1998* @param start offset of potential match, to be modified if necessary
1999* @param end offset of potential match, to be modified if necessary
2000* @param status output error status if any
2001* @return TRUE if match passes the contraction test, FALSE otherwise
2002*/
2003static
729e4ab9
A
2004UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
2005 int32_t *start,
2006 int32_t *end, UErrorCode *status)
b75a7d8f
A
2007{
2008 UCollationElements *coleiter = strsrch->textIter;
2009 int32_t textlength = strsrch->search->textLength;
2010 int32_t temp = *end;
2011 const UCollator *collator = strsrch->collator;
2012 const UChar *text = strsrch->search->text;
729e4ab9 2013 // This part checks if either if the start of the match contains potential
b75a7d8f 2014 // contraction. If so we'll have to iterate through them
729e4ab9 2015 // Since we used ucol_next while previously looking for the potential
374ca955
A
2016 // match, this guarantees that our end will not be a partial contraction,
2017 // or a partial supplementary character.
b75a7d8f
A
2018 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2019 int32_t expansion = getExpansionSuffix(coleiter);
2020 UBool expandflag = expansion > 0;
2021 setColEIterOffset(coleiter, *end);
2022 while (U_SUCCESS(*status) && expansion > 0) {
2023 // getting rid of the redundant ce
2024 // since forward contraction/expansion may have extra ces
2025 // if we are in the normalization buffer, hasAccentsBeforeMatch
2026 // would have taken care of it.
2027 // E.g. the character \u01FA will have an expansion of 3, but if
729e4ab9 2028 // we are only looking for A ring A\u030A, we'll have to skip the
b75a7d8f
A
2029 // last ce in the expansion buffer
2030 ucol_previous(coleiter, status);
374ca955
A
2031 if (U_FAILURE(*status)) {
2032 return FALSE;
2033 }
b75a7d8f
A
2034 if (ucol_getOffset(coleiter) != temp) {
2035 *end = temp;
2036 temp = ucol_getOffset(coleiter);
2037 }
2038 expansion --;
2039 }
2040
b331163b
A
2041 int32_t *patternce = strsrch->pattern.ces;
2042 int32_t patterncelength = strsrch->pattern.cesLength;
b75a7d8f
A
2043 int32_t count = patterncelength;
2044 while (count > 0) {
374ca955 2045 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
b75a7d8f
A
2046 // status checked below, note that if status is a failure
2047 // ucol_previous returns UCOL_NULLORDER
2048 if (ce == UCOL_IGNORABLE) {
2049 continue;
2050 }
729e4ab9 2051 if (expandflag && count == 0 &&
b75a7d8f
A
2052 getColElemIterOffset(coleiter, FALSE) != temp) {
2053 *end = temp;
2054 temp = ucol_getOffset(coleiter);
2055 }
2056 if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2057 (*start) --;
2058 *start = getPreviousBaseOffset(text, *start);
2059 return FALSE;
2060 }
2061 count --;
2062 }
729e4ab9 2063 }
b75a7d8f
A
2064 return TRUE;
2065}
2066
2067/**
2068* Checks and sets the match information if found.
729e4ab9 2069* Checks
b75a7d8f
A
2070* <ul>
2071* <li> the current match does not repeat the last match
2072* <li> boundaries are correct
2073* <li> exact matches has no extra accents
2074* <li> identical matches
2075* <\ul>
2076* Otherwise the offset will be shifted to the preceding character.
2077* Internal method, status assumed to be success, caller has to check status
2078* before calling this method.
2079* @param strsrch string search data
729e4ab9 2080* @param collator
b75a7d8f
A
2081* @param coleiter collation element iterator
2082* @param text string
2083* @param textoffset offset in the collation element text. the returned value
729e4ab9 2084* will be the truncated start offset of the match or the new start
b75a7d8f
A
2085* search offset.
2086* @param status output error status if any
2087* @return TRUE if the match is valid, FALSE otherwise
2088*/
2089static
729e4ab9
A
2090inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
2091 int32_t *textoffset,
b75a7d8f
A
2092 UErrorCode *status)
2093{
2094 // to ensure that the start and ends are not composite characters
729e4ab9 2095 int32_t end = ucol_getOffset(strsrch->textIter);
b75a7d8f
A
2096 if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
2097 || U_FAILURE(*status)) {
2098 return FALSE;
2099 }
729e4ab9 2100
b75a7d8f
A
2101 // this totally matches, however we need to check if it is repeating
2102 // the old match
729e4ab9 2103 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
b75a7d8f
A
2104 !isBreakUnit(strsrch, *textoffset, end) ||
2105 hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
729e4ab9 2106 !checkIdentical(strsrch, *textoffset, end) ||
b75a7d8f
A
2107 hasAccentsAfterMatch(strsrch, *textoffset, end)) {
2108 (*textoffset) --;
729e4ab9 2109 *textoffset = getPreviousBaseOffset(strsrch->search->text,
b75a7d8f
A
2110 *textoffset);
2111 return FALSE;
2112 }
729e4ab9 2113
46f4442e
A
2114 //Add breakiterator boundary check for primary strength search.
2115 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
4388f060 2116 checkBreakBoundary(strsrch, textoffset, &end);
46f4442e 2117 }
729e4ab9 2118
b75a7d8f
A
2119 strsrch->search->matchedIndex = *textoffset;
2120 strsrch->search->matchedLength = end - *textoffset;
2121 return TRUE;
2122}
2123
2124/**
2125* Rearranges the end accents to try matching.
729e4ab9
A
2126* Suffix accents in the text will be grouped according to their combining
2127* class and the groups will be mixed and matched to try find the perfect
b75a7d8f
A
2128* match with the pattern.
2129* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2130* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
729e4ab9 2131* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
b75a7d8f
A
2132* "\u0301\u0325".
2133* step 2: check if any of the generated substrings matches the pattern.
729e4ab9 2134* Internal method, status assumed to be success, user has to check status
b75a7d8f
A
2135* before calling this method.
2136* @param strsrch string search match
2137* @param start offset of the first base character
2138* @param end start of the last accent set
2139* @param status only error status if any
2140* @return USEARCH_DONE if a match is not found, otherwise return the ending
2141* offset of the match. Note this start includes all following accents.
2142*/
2143static
729e4ab9 2144int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
b75a7d8f 2145 int32_t start,
729e4ab9 2146 int32_t end,
b75a7d8f
A
2147 UErrorCode *status)
2148{
2149 const UChar *text = strsrch->search->text;
2150 int32_t tempend = end;
2151
4388f060 2152 U16_BACK_1(text, 0, tempend);
729e4ab9 2153 if (!(getFCD(text, &tempend, strsrch->search->textLength) &
b75a7d8f
A
2154 LAST_BYTE_MASK_)) {
2155 // die... failed at a base character
2156 return USEARCH_DONE;
2157 }
2158 end = getNextBaseOffset(text, end, strsrch->search->textLength);
2159
2160 if (U_SUCCESS(*status)) {
2161 UChar accents[INITIAL_ARRAY_SIZE_];
2162 int32_t offset = getPreviousBaseOffset(text, end);
2163 // normalizing the offensive string
729e4ab9
A
2164 unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
2165 INITIAL_ARRAY_SIZE_, status);
2166
2167 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2168 int32_t accentsize = getUnblockedAccentIndex(accents,
b75a7d8f 2169 accentsindex);
729e4ab9 2170 int32_t count = (2 << (accentsize - 1)) - 1;
b75a7d8f
A
2171 UChar buffer[INITIAL_ARRAY_SIZE_];
2172 UCollationElements *coleiter = strsrch->utilIter;
2173 while (U_SUCCESS(*status) && count > 0) {
2174 UChar *rearrange = strsrch->canonicalSuffixAccents;
2175 // copy the base characters
2176 for (int k = 0; k < accentsindex[0]; k ++) {
2177 *rearrange ++ = accents[k];
2178 }
2179 // forming all possible canonical rearrangement by dropping
2180 // sets of accents
2181 for (int i = 0; i <= accentsize - 1; i ++) {
2182 int32_t mask = 1 << (accentsize - i - 1);
2183 if (count & mask) {
2184 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2185 *rearrange ++ = accents[j];
2186 }
2187 }
2188 }
2189 *rearrange = 0;
2190 int32_t matchsize = INITIAL_ARRAY_SIZE_;
2191 UChar *match = addToUCharArray(buffer, &matchsize,
2192 strsrch->canonicalPrefixAccents,
2193 strsrch->search->text + start,
2194 offset - start,
2195 strsrch->canonicalSuffixAccents,
2196 status);
729e4ab9 2197
b75a7d8f
A
2198 // run the collator iterator through this match
2199 // if status is a failure ucol_setText does nothing
2200 ucol_setText(coleiter, match, matchsize, status);
2201 if (U_SUCCESS(*status)) {
2202 if (checkCollationMatch(strsrch, coleiter)) {
2203 if (match != buffer) {
2204 uprv_free(match);
2205 }
2206 return end;
2207 }
2208 }
2209 count --;
2210 }
2211 }
2212 return USEARCH_DONE;
2213}
2214
2215/**
2216* Take the rearranged start accents and tries matching. If match failed at
2217* a seperate following set of accents (seperated from the rearranged on by
729e4ab9 2218* at least a base character) then we rearrange the preceding accents and
b75a7d8f 2219* tries matching again.
729e4ab9 2220* We allow skipping of the ends of the accent set if the ces do not match.
b75a7d8f 2221* However if the failure is found before the accent set, it fails.
729e4ab9 2222* Internal method, status assumed to be success, caller has to check status
b75a7d8f
A
2223* before calling this method.
2224* @param strsrch string search data
2225* @param textoffset of the ends of the rearranged accent
2226* @param status output error status if any
2227* @return USEARCH_DONE if a match is not found, otherwise return the ending
2228* offset of the match. Note this start includes all following accents.
2229*/
2230static
729e4ab9 2231int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
b75a7d8f
A
2232 int32_t textoffset,
2233 UErrorCode *status)
2234{
2235 const UChar *text = strsrch->search->text;
2236 const UCollator *collator = strsrch->collator;
2237 int32_t safelength = 0;
2238 UChar *safetext;
2239 int32_t safetextlength;
2240 UChar safebuffer[INITIAL_ARRAY_SIZE_];
2241 int32_t safeoffset = textoffset;
2242
729e4ab9 2243 if (textoffset &&
b75a7d8f
A
2244 ucol_unsafeCP(strsrch->canonicalPrefixAccents[
2245 u_strlen(strsrch->canonicalPrefixAccents) - 1
2246 ], collator)) {
729e4ab9 2247 safeoffset = getNextSafeOffset(collator, text, textoffset,
b75a7d8f
A
2248 strsrch->search->textLength);
2249 safelength = safeoffset - textoffset;
2250 safetextlength = INITIAL_ARRAY_SIZE_;
729e4ab9
A
2251 safetext = addToUCharArray(safebuffer, &safetextlength,
2252 strsrch->canonicalPrefixAccents,
2253 text + textoffset, safelength,
b75a7d8f
A
2254 NULL, status);
2255 }
2256 else {
2257 safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
2258 safetext = strsrch->canonicalPrefixAccents;
2259 }
2260
2261 UCollationElements *coleiter = strsrch->utilIter;
2262 // if status is a failure, ucol_setText does nothing
2263 ucol_setText(coleiter, safetext, safetextlength, status);
2264 // status checked in loop below
729e4ab9 2265
b331163b
A
2266 int32_t *ce = strsrch->pattern.ces;
2267 int32_t celength = strsrch->pattern.cesLength;
b75a7d8f
A
2268 int ceindex = 0;
2269 UBool isSafe = TRUE; // safe zone indication flag for position
2270 int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
729e4ab9 2271
b75a7d8f 2272 while (ceindex < celength) {
374ca955 2273 int32_t textce = ucol_next(coleiter, status);
b75a7d8f
A
2274 if (U_FAILURE(*status)) {
2275 if (isSafe) {
2276 cleanUpSafeText(strsrch, safetext, safebuffer);
2277 }
2278 return USEARCH_DONE;
2279 }
2280 if (textce == UCOL_NULLORDER) {
2281 // check if we have passed the safe buffer
2282 if (coleiter == strsrch->textIter) {
2283 cleanUpSafeText(strsrch, safetext, safebuffer);
2284 return USEARCH_DONE;
2285 }
2286 cleanUpSafeText(strsrch, safetext, safebuffer);
2287 safetext = safebuffer;
2288 coleiter = strsrch->textIter;
2289 setColEIterOffset(coleiter, safeoffset);
2290 // status checked at the start of the loop
2291 isSafe = FALSE;
2292 continue;
2293 }
2294 textce = getCE(strsrch, textce);
2295 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
2296 // do the beginning stuff
2297 int32_t failedoffset = ucol_getOffset(coleiter);
2298 if (isSafe && failedoffset <= prefixlength) {
2299 // alas... no hope. failed at rearranged accent set
2300 cleanUpSafeText(strsrch, safetext, safebuffer);
2301 return USEARCH_DONE;
2302 }
2303 else {
2304 if (isSafe) {
2305 failedoffset = safeoffset - failedoffset;
2306 cleanUpSafeText(strsrch, safetext, safebuffer);
2307 }
729e4ab9 2308
b75a7d8f 2309 // try rearranging the end accents
729e4ab9 2310 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
b75a7d8f
A
2311 textoffset, failedoffset, status);
2312 if (result != USEARCH_DONE) {
2313 // if status is a failure, ucol_setOffset does nothing
2314 setColEIterOffset(strsrch->textIter, result);
2315 }
2316 if (U_FAILURE(*status)) {
2317 return USEARCH_DONE;
2318 }
2319 return result;
2320 }
2321 }
2322 if (textce == ce[ceindex]) {
2323 ceindex ++;
2324 }
2325 }
2326 // set offset here
2327 if (isSafe) {
2328 int32_t result = ucol_getOffset(coleiter);
2329 // sets the text iterator here with the correct expansion and offset
2330 int32_t leftoverces = getExpansionSuffix(coleiter);
2331 cleanUpSafeText(strsrch, safetext, safebuffer);
729e4ab9 2332 if (result <= prefixlength) {
b75a7d8f
A
2333 result = textoffset;
2334 }
2335 else {
2336 result = textoffset + (safeoffset - result);
2337 }
2338 setColEIterOffset(strsrch->textIter, result);
2339 setExpansionSuffix(strsrch->textIter, leftoverces);
2340 return result;
2341 }
729e4ab9
A
2342
2343 return ucol_getOffset(coleiter);
b75a7d8f
A
2344}
2345
2346/**
2347* Trying out the substring and sees if it can be a canonical match.
729e4ab9 2348* This will try normalizing the starting accents and arranging them into
b75a7d8f 2349* canonical equivalents and check their corresponding ces with the pattern ce.
729e4ab9
A
2350* Prefix accents in the text will be grouped according to their combining
2351* class and the groups will be mixed and matched to try find the perfect
b75a7d8f
A
2352* match with the pattern.
2353* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2354* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
729e4ab9 2355* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
b75a7d8f
A
2356* "\u0301\u0325".
2357* step 2: check if any of the generated substrings matches the pattern.
2358* Internal method, status assumed to be success, caller has to check status
2359* before calling this method.
2360* @param strsrch string search data
729e4ab9 2361* @param textoffset start offset in the collation element text that starts
b75a7d8f
A
2362* with the accents to be rearranged
2363* @param status output error status if any
2364* @return TRUE if the match is valid, FALSE otherwise
2365*/
2366static
729e4ab9
A
2367UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
2368 int32_t textoffset,
b75a7d8f
A
2369 UErrorCode *status)
2370{
2371 const UChar *text = strsrch->search->text;
2372 int32_t temp = textoffset;
2373 int32_t textlength = strsrch->search->textLength;
2374 if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
2375 UCollationElements *coleiter = strsrch->textIter;
2376 int32_t offset = ucol_getOffset(coleiter);
2377 if (strsrch->pattern.hasSuffixAccents) {
729e4ab9 2378 offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
b75a7d8f
A
2379 offset, status);
2380 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
2381 setColEIterOffset(coleiter, offset);
2382 return TRUE;
2383 }
2384 }
2385 return FALSE;
2386 }
2387
2388 if (!strsrch->pattern.hasPrefixAccents) {
2389 return FALSE;
2390 }
2391
2392 UChar accents[INITIAL_ARRAY_SIZE_];
2393 // offset to the last base character in substring to search
2394 int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
2395 // normalizing the offensive string
729e4ab9
A
2396 unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
2397 0, accents, INITIAL_ARRAY_SIZE_, status);
b75a7d8f 2398 // status checked in loop
729e4ab9 2399
b75a7d8f
A
2400 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2401 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
2402
374ca955 2403 // 2 power n - 1 plus the full set of accents
729e4ab9 2404 int32_t count = (2 << (size - 1)) - 1;
b75a7d8f
A
2405 while (U_SUCCESS(*status) && count > 0) {
2406 UChar *rearrange = strsrch->canonicalPrefixAccents;
2407 // copy the base characters
2408 for (int k = 0; k < accentsindex[0]; k ++) {
2409 *rearrange ++ = accents[k];
2410 }
2411 // forming all possible canonical rearrangement by dropping
2412 // sets of accents
2413 for (int i = 0; i <= size - 1; i ++) {
2414 int32_t mask = 1 << (size - i - 1);
2415 if (count & mask) {
2416 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2417 *rearrange ++ = accents[j];
2418 }
2419 }
2420 }
2421 *rearrange = 0;
729e4ab9 2422 int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
b75a7d8f
A
2423 baseoffset, status);
2424 if (offset != USEARCH_DONE) {
2425 return TRUE; // match found
2426 }
2427 count --;
2428 }
2429 return FALSE;
2430}
2431
2432/**
729e4ab9 2433* Checks match for contraction.
b75a7d8f
A
2434* If the match starts with a partial contraction we fail.
2435* Internal method, status assumed to be success, caller has to check status
2436* before calling this method.
2437* @param strsrch string search data
2438* @param start offset of potential match, to be modified if necessary
2439* @param end offset of potential match, to be modified if necessary
2440* @param status only error status if any
2441* @return TRUE if match passes the contraction test, FALSE otherwise
2442*/
2443static
729e4ab9
A
2444UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
2445 int32_t *start,
2446 int32_t *end, UErrorCode *status)
b75a7d8f
A
2447{
2448 UCollationElements *coleiter = strsrch->textIter;
2449 int32_t textlength = strsrch->search->textLength;
2450 int32_t temp = *end;
2451 const UCollator *collator = strsrch->collator;
2452 const UChar *text = strsrch->search->text;
729e4ab9 2453 // This part checks if either if the start of the match contains potential
b75a7d8f 2454 // contraction. If so we'll have to iterate through them
729e4ab9 2455 // Since we used ucol_next while previously looking for the potential
374ca955
A
2456 // match, this guarantees that our end will not be a partial contraction,
2457 // or a partial supplementary character.
b75a7d8f
A
2458 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2459 int32_t expansion = getExpansionSuffix(coleiter);
2460 UBool expandflag = expansion > 0;
2461 setColEIterOffset(coleiter, *end);
2462 while (expansion > 0) {
2463 // getting rid of the redundant ce
2464 // since forward contraction/expansion may have extra ces
2465 // if we are in the normalization buffer, hasAccentsBeforeMatch
2466 // would have taken care of it.
2467 // E.g. the character \u01FA will have an expansion of 3, but if
729e4ab9 2468 // we are only looking for A ring A\u030A, we'll have to skip the
b75a7d8f
A
2469 // last ce in the expansion buffer
2470 ucol_previous(coleiter, status);
374ca955
A
2471 if (U_FAILURE(*status)) {
2472 return FALSE;
2473 }
b75a7d8f
A
2474 if (ucol_getOffset(coleiter) != temp) {
2475 *end = temp;
2476 temp = ucol_getOffset(coleiter);
2477 }
2478 expansion --;
2479 }
2480
b331163b
A
2481 int32_t *patternce = strsrch->pattern.ces;
2482 int32_t patterncelength = strsrch->pattern.cesLength;
b75a7d8f
A
2483 int32_t count = patterncelength;
2484 while (count > 0) {
374ca955 2485 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
b75a7d8f
A
2486 // status checked below, note that if status is a failure
2487 // ucol_previous returns UCOL_NULLORDER
2488 if (ce == UCOL_IGNORABLE) {
2489 continue;
2490 }
729e4ab9 2491 if (expandflag && count == 0 &&
b75a7d8f
A
2492 getColElemIterOffset(coleiter, FALSE) != temp) {
2493 *end = temp;
2494 temp = ucol_getOffset(coleiter);
2495 }
729e4ab9 2496 if (count == patterncelength &&
b75a7d8f 2497 ce != patternce[patterncelength - 1]) {
729e4ab9 2498 // accents may have extra starting ces, this occurs when a
b75a7d8f 2499 // pure accent pattern is matched without rearrangement
374ca955 2500 int32_t expected = patternce[patterncelength - 1];
4388f060 2501 U16_BACK_1(text, 0, *end);
b75a7d8f
A
2502 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
2503 ce = getCE(strsrch, ucol_previous(coleiter, status));
729e4ab9 2504 while (U_SUCCESS(*status) && ce != expected &&
b75a7d8f
A
2505 ce != UCOL_NULLORDER &&
2506 ucol_getOffset(coleiter) <= *start) {
2507 ce = getCE(strsrch, ucol_previous(coleiter, status));
2508 }
2509 }
2510 }
2511 if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2512 (*start) --;
2513 *start = getPreviousBaseOffset(text, *start);
2514 return FALSE;
2515 }
2516 count --;
2517 }
729e4ab9 2518 }
b75a7d8f
A
2519 return TRUE;
2520}
2521
2522/**
2523* Checks and sets the match information if found.
729e4ab9 2524* Checks
b75a7d8f
A
2525* <ul>
2526* <li> the potential match does not repeat the previous match
2527* <li> boundaries are correct
2528* <li> potential match does not end in the middle of a contraction
2529* <li> identical matches
2530* <\ul>
2531* Otherwise the offset will be shifted to the next character.
2532* Internal method, status assumed to be success, caller has to check status
2533* before calling this method.
2534* @param strsrch string search data
2535* @param textoffset offset in the collation element text. the returned value
729e4ab9 2536* will be the truncated start offset of the match or the new start
b75a7d8f
A
2537* search offset.
2538* @param status only error status if any
2539* @return TRUE if the match is valid, FALSE otherwise
2540*/
2541static
729e4ab9
A
2542inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
2543 int32_t *textoffset,
b75a7d8f
A
2544 UErrorCode *status)
2545{
2546 // to ensure that the start and ends are not composite characters
2547 UCollationElements *coleiter = strsrch->textIter;
2548 // if we have a canonical accent match
729e4ab9
A
2549 if ((strsrch->pattern.hasSuffixAccents &&
2550 strsrch->canonicalSuffixAccents[0]) ||
2551 (strsrch->pattern.hasPrefixAccents &&
b75a7d8f
A
2552 strsrch->canonicalPrefixAccents[0])) {
2553 strsrch->search->matchedIndex = *textoffset;
729e4ab9
A
2554 strsrch->search->matchedLength =
2555 getNextUStringSearchBaseOffset(strsrch,
b75a7d8f
A
2556 getColElemIterOffset(coleiter, FALSE))
2557 - *textoffset;
2558 return TRUE;
2559 }
2560
2561 int32_t end = ucol_getOffset(coleiter);
2562 if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
729e4ab9 2563 status) ||
b75a7d8f
A
2564 U_FAILURE(*status)) {
2565 return FALSE;
2566 }
2567
2568 end = getNextUStringSearchBaseOffset(strsrch, end);
2569 // this totally matches, however we need to check if it is repeating
729e4ab9
A
2570 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2571 !isBreakUnit(strsrch, *textoffset, end) ||
b75a7d8f
A
2572 !checkIdentical(strsrch, *textoffset, end)) {
2573 (*textoffset) --;
729e4ab9 2574 *textoffset = getPreviousBaseOffset(strsrch->search->text,
b75a7d8f
A
2575 *textoffset);
2576 return FALSE;
2577 }
729e4ab9 2578
b75a7d8f
A
2579 strsrch->search->matchedIndex = *textoffset;
2580 strsrch->search->matchedLength = end - *textoffset;
2581 return TRUE;
2582}
46f4442e 2583#endif // #if BOYER_MOORE
b75a7d8f
A
2584
2585// constructors and destructor -------------------------------------------
2586
729e4ab9
A
2587U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
2588 int32_t patternlength,
2589 const UChar *text,
b75a7d8f
A
2590 int32_t textlength,
2591 const char *locale,
2592 UBreakIterator *breakiter,
729e4ab9 2593 UErrorCode *status)
b75a7d8f
A
2594{
2595 if (U_FAILURE(*status)) {
2596 return NULL;
2597 }
2598#if UCONFIG_NO_BREAK_ITERATION
2599 if (breakiter != NULL) {
2600 *status = U_UNSUPPORTED_ERROR;
2601 return NULL;
2602 }
2603#endif
2604 if (locale) {
2605 // ucol_open internally checks for status
2606 UCollator *collator = ucol_open(locale, status);
2607 // pattern, text checks are done in usearch_openFromCollator
729e4ab9
A
2608 UStringSearch *result = usearch_openFromCollator(pattern,
2609 patternlength, text, textlength,
b75a7d8f
A
2610 collator, breakiter, status);
2611
2612 if (result == NULL || U_FAILURE(*status)) {
2613 if (collator) {
2614 ucol_close(collator);
2615 }
2616 return NULL;
2617 }
2618 else {
2619 result->ownCollator = TRUE;
2620 }
2621 return result;
2622 }
2623 *status = U_ILLEGAL_ARGUMENT_ERROR;
2624 return NULL;
2625}
2626
2627U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
729e4ab9 2628 const UChar *pattern,
b75a7d8f 2629 int32_t patternlength,
729e4ab9 2630 const UChar *text,
b75a7d8f
A
2631 int32_t textlength,
2632 const UCollator *collator,
2633 UBreakIterator *breakiter,
729e4ab9 2634 UErrorCode *status)
b75a7d8f
A
2635{
2636 if (U_FAILURE(*status)) {
2637 return NULL;
2638 }
2639#if UCONFIG_NO_BREAK_ITERATION
2640 if (breakiter != NULL) {
2641 *status = U_UNSUPPORTED_ERROR;
2642 return NULL;
2643 }
2644#endif
2645 if (pattern == NULL || text == NULL || collator == NULL) {
2646 *status = U_ILLEGAL_ARGUMENT_ERROR;
73c04bcf 2647 return NULL;
b75a7d8f
A
2648 }
2649
374ca955
A
2650 // string search does not really work when numeric collation is turned on
2651 if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
2652 *status = U_UNSUPPORTED_ERROR;
73c04bcf 2653 return NULL;
374ca955
A
2654 }
2655
b75a7d8f
A
2656 if (U_SUCCESS(*status)) {
2657 initializeFCD(status);
2658 if (U_FAILURE(*status)) {
2659 return NULL;
2660 }
2661
2662 UStringSearch *result;
2663 if (textlength == -1) {
2664 textlength = u_strlen(text);
2665 }
2666 if (patternlength == -1) {
2667 patternlength = u_strlen(pattern);
2668 }
2669 if (textlength <= 0 || patternlength <= 0) {
2670 *status = U_ILLEGAL_ARGUMENT_ERROR;
2671 return NULL;
2672 }
729e4ab9 2673
b75a7d8f
A
2674 result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
2675 if (result == NULL) {
2676 *status = U_MEMORY_ALLOCATION_ERROR;
2677 return NULL;
2678 }
2679
2680 result->collator = collator;
2681 result->strength = ucol_getStrength(collator);
2682 result->ceMask = getMask(result->strength);
729e4ab9
A
2683 result->toShift =
2684 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
b75a7d8f
A
2685 UCOL_SHIFTED;
2686 result->variableTop = ucol_getVariableTop(collator, status);
2687
b331163b 2688 result->nfd = Normalizer2::getNFDInstance(*status);
729e4ab9 2689
b75a7d8f
A
2690 if (U_FAILURE(*status)) {
2691 uprv_free(result);
2692 return NULL;
2693 }
2694
2695 result->search = (USearch *)uprv_malloc(sizeof(USearch));
2696 if (result->search == NULL) {
2697 *status = U_MEMORY_ALLOCATION_ERROR;
2698 uprv_free(result);
2699 return NULL;
2700 }
2701
2702 result->search->text = text;
2703 result->search->textLength = textlength;
2704
2705 result->pattern.text = pattern;
2706 result->pattern.textLength = patternlength;
b331163b
A
2707 result->pattern.ces = NULL;
2708 result->pattern.pces = NULL;
729e4ab9 2709
b75a7d8f
A
2710 result->search->breakIter = breakiter;
2711#if !UCONFIG_NO_BREAK_ITERATION
729e4ab9 2712 result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
b75a7d8f 2713 if (breakiter) {
4388f060 2714 ubrk_setText(breakiter, text, textlength, status);
b75a7d8f
A
2715 }
2716#endif
2717
2718 result->ownCollator = FALSE;
2719 result->search->matchedLength = 0;
2720 result->search->matchedIndex = USEARCH_DONE;
46f4442e 2721 result->utilIter = NULL;
729e4ab9 2722 result->textIter = ucol_openElements(collator, text,
b75a7d8f 2723 textlength, status);
57a6839d 2724 result->textProcessedIter = NULL;
b75a7d8f
A
2725 if (U_FAILURE(*status)) {
2726 usearch_close(result);
2727 return NULL;
2728 }
2729
b75a7d8f
A
2730 result->search->isOverlap = FALSE;
2731 result->search->isCanonicalMatch = FALSE;
729e4ab9 2732 result->search->elementComparisonType = 0;
b75a7d8f
A
2733 result->search->isForwardSearching = TRUE;
2734 result->search->reset = TRUE;
729e4ab9 2735
b75a7d8f
A
2736 initialize(result, status);
2737
2738 if (U_FAILURE(*status)) {
2739 usearch_close(result);
2740 return NULL;
2741 }
2742
2743 return result;
2744 }
2745 return NULL;
2746}
2747
2748U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
2749{
2750 if (strsrch) {
b331163b
A
2751 if (strsrch->pattern.ces != strsrch->pattern.cesBuffer &&
2752 strsrch->pattern.ces) {
2753 uprv_free(strsrch->pattern.ces);
b75a7d8f 2754 }
46f4442e 2755
b331163b
A
2756 if (strsrch->pattern.pces != NULL &&
2757 strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
2758 uprv_free(strsrch->pattern.pces);
46f4442e
A
2759 }
2760
57a6839d 2761 delete strsrch->textProcessedIter;
b75a7d8f
A
2762 ucol_closeElements(strsrch->textIter);
2763 ucol_closeElements(strsrch->utilIter);
46f4442e 2764
b75a7d8f
A
2765 if (strsrch->ownCollator && strsrch->collator) {
2766 ucol_close((UCollator *)strsrch->collator);
2767 }
46f4442e
A
2768
2769#if !UCONFIG_NO_BREAK_ITERATION
2770 if (strsrch->search->internalBreakIter) {
4388f060 2771 ubrk_close(strsrch->search->internalBreakIter);
46f4442e
A
2772 }
2773#endif
2774
b75a7d8f
A
2775 uprv_free(strsrch->search);
2776 uprv_free(strsrch);
2777 }
2778}
2779
57a6839d
A
2780namespace {
2781
2782UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) {
2783 if (U_FAILURE(*status)) { return FALSE; }
2784 if (strsrch->textProcessedIter == NULL) {
2785 strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter);
2786 if (strsrch->textProcessedIter == NULL) {
2787 *status = U_MEMORY_ALLOCATION_ERROR;
2788 return FALSE;
2789 }
2790 } else {
2791 strsrch->textProcessedIter->init(strsrch->textIter);
2792 }
2793 return TRUE;
2794}
2795
2796}
2797
b75a7d8f
A
2798// set and get methods --------------------------------------------------
2799
729e4ab9 2800U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
b75a7d8f
A
2801 int32_t position,
2802 UErrorCode *status)
2803{
2804 if (U_SUCCESS(*status) && strsrch) {
2805 if (isOutOfBounds(strsrch->search->textLength, position)) {
2806 *status = U_INDEX_OUTOFBOUNDS_ERROR;
2807 }
2808 else {
2809 setColEIterOffset(strsrch->textIter, position);
2810 }
2811 strsrch->search->matchedIndex = USEARCH_DONE;
2812 strsrch->search->matchedLength = 0;
729e4ab9 2813 strsrch->search->reset = FALSE;
b75a7d8f
A
2814 }
2815}
2816
2817U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
2818{
2819 if (strsrch) {
2820 int32_t result = ucol_getOffset(strsrch->textIter);
2821 if (isOutOfBounds(strsrch->search->textLength, result)) {
2822 return USEARCH_DONE;
2823 }
2824 return result;
2825 }
2826 return USEARCH_DONE;
2827}
729e4ab9
A
2828
2829U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
b75a7d8f
A
2830 USearchAttribute attribute,
2831 USearchAttributeValue value,
2832 UErrorCode *status)
2833{
2834 if (U_SUCCESS(*status) && strsrch) {
2835 switch (attribute)
2836 {
2837 case USEARCH_OVERLAP :
2838 strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
2839 break;
2840 case USEARCH_CANONICAL_MATCH :
729e4ab9 2841 strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
b75a7d8f
A
2842 FALSE);
2843 break;
729e4ab9
A
2844 case USEARCH_ELEMENT_COMPARISON :
2845 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2846 strsrch->search->elementComparisonType = (int16_t)value;
2847 } else {
2848 strsrch->search->elementComparisonType = 0;
2849 }
2850 break;
b75a7d8f
A
2851 case USEARCH_ATTRIBUTE_COUNT :
2852 default:
2853 *status = U_ILLEGAL_ARGUMENT_ERROR;
2854 }
2855 }
2856 if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
2857 *status = U_ILLEGAL_ARGUMENT_ERROR;
2858 }
2859}
729e4ab9 2860
b75a7d8f
A
2861U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
2862 const UStringSearch *strsrch,
2863 USearchAttribute attribute)
2864{
2865 if (strsrch) {
2866 switch (attribute) {
2867 case USEARCH_OVERLAP :
729e4ab9 2868 return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
b75a7d8f
A
2869 USEARCH_OFF);
2870 case USEARCH_CANONICAL_MATCH :
729e4ab9 2871 return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
b75a7d8f 2872 USEARCH_OFF);
729e4ab9
A
2873 case USEARCH_ELEMENT_COMPARISON :
2874 {
2875 int16_t value = strsrch->search->elementComparisonType;
2876 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2877 return (USearchAttributeValue)value;
2878 } else {
2879 return USEARCH_STANDARD_ELEMENT_COMPARISON;
2880 }
2881 }
b75a7d8f
A
2882 case USEARCH_ATTRIBUTE_COUNT :
2883 return USEARCH_DEFAULT;
2884 }
2885 }
2886 return USEARCH_DEFAULT;
2887}
2888
2889U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
2890 const UStringSearch *strsrch)
2891{
2892 if (strsrch == NULL) {
2893 return USEARCH_DONE;
2894 }
2895 return strsrch->search->matchedIndex;
2896}
2897
2898
729e4ab9
A
2899U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
2900 UChar *result,
2901 int32_t resultCapacity,
b75a7d8f
A
2902 UErrorCode *status)
2903{
2904 if (U_FAILURE(*status)) {
2905 return USEARCH_DONE;
2906 }
729e4ab9 2907 if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
b75a7d8f
A
2908 result == NULL)) {
2909 *status = U_ILLEGAL_ARGUMENT_ERROR;
2910 return USEARCH_DONE;
2911 }
2912
2913 int32_t copylength = strsrch->search->matchedLength;
2914 int32_t copyindex = strsrch->search->matchedIndex;
2915 if (copyindex == USEARCH_DONE) {
2916 u_terminateUChars(result, resultCapacity, 0, status);
2917 return USEARCH_DONE;
2918 }
2919
2920 if (resultCapacity < copylength) {
2921 copylength = resultCapacity;
2922 }
2923 if (copylength > 0) {
729e4ab9 2924 uprv_memcpy(result, strsrch->search->text + copyindex,
b75a7d8f
A
2925 copylength * sizeof(UChar));
2926 }
729e4ab9 2927 return u_terminateUChars(result, resultCapacity,
b75a7d8f
A
2928 strsrch->search->matchedLength, status);
2929}
729e4ab9 2930
b75a7d8f
A
2931U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
2932 const UStringSearch *strsrch)
2933{
2934 if (strsrch) {
2935 return strsrch->search->matchedLength;
2936 }
2937 return USEARCH_DONE;
2938}
2939
2940#if !UCONFIG_NO_BREAK_ITERATION
2941
729e4ab9 2942U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch,
b75a7d8f
A
2943 UBreakIterator *breakiter,
2944 UErrorCode *status)
2945{
2946 if (U_SUCCESS(*status) && strsrch) {
4388f060 2947 strsrch->search->breakIter = breakiter;
b75a7d8f 2948 if (breakiter) {
729e4ab9 2949 ubrk_setText(breakiter, strsrch->search->text,
b75a7d8f
A
2950 strsrch->search->textLength, status);
2951 }
2952 }
2953}
2954
729e4ab9 2955U_CAPI const UBreakIterator* U_EXPORT2
b75a7d8f
A
2956usearch_getBreakIterator(const UStringSearch *strsrch)
2957{
2958 if (strsrch) {
2959 return strsrch->search->breakIter;
2960 }
2961 return NULL;
2962}
729e4ab9 2963
b75a7d8f 2964#endif
729e4ab9
A
2965
2966U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch,
b75a7d8f
A
2967 const UChar *text,
2968 int32_t textlength,
2969 UErrorCode *status)
2970{
2971 if (U_SUCCESS(*status)) {
729e4ab9 2972 if (strsrch == NULL || text == NULL || textlength < -1 ||
b75a7d8f
A
2973 textlength == 0) {
2974 *status = U_ILLEGAL_ARGUMENT_ERROR;
2975 }
2976 else {
2977 if (textlength == -1) {
2978 textlength = u_strlen(text);
2979 }
2980 strsrch->search->text = text;
2981 strsrch->search->textLength = textlength;
2982 ucol_setText(strsrch->textIter, text, textlength, status);
2983 strsrch->search->matchedIndex = USEARCH_DONE;
2984 strsrch->search->matchedLength = 0;
2985 strsrch->search->reset = TRUE;
2986#if !UCONFIG_NO_BREAK_ITERATION
374ca955 2987 if (strsrch->search->breakIter != NULL) {
729e4ab9 2988 ubrk_setText(strsrch->search->breakIter, text,
374ca955
A
2989 textlength, status);
2990 }
46f4442e 2991 ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
b75a7d8f
A
2992#endif
2993 }
2994 }
2995}
2996
729e4ab9 2997U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
b75a7d8f
A
2998 int32_t *length)
2999{
3000 if (strsrch) {
3001 *length = strsrch->search->textLength;
3002 return strsrch->search->text;
3003 }
3004 return NULL;
3005}
3006
729e4ab9 3007U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch,
b75a7d8f
A
3008 const UCollator *collator,
3009 UErrorCode *status)
3010{
3011 if (U_SUCCESS(*status)) {
3012 if (collator == NULL) {
3013 *status = U_ILLEGAL_ARGUMENT_ERROR;
3014 return;
3015 }
46f4442e 3016
b75a7d8f 3017 if (strsrch) {
57a6839d
A
3018 delete strsrch->textProcessedIter;
3019 strsrch->textProcessedIter = NULL;
3020 ucol_closeElements(strsrch->textIter);
3021 ucol_closeElements(strsrch->utilIter);
3022 strsrch->textIter = strsrch->utilIter = NULL;
b75a7d8f
A
3023 if (strsrch->ownCollator && (strsrch->collator != collator)) {
3024 ucol_close((UCollator *)strsrch->collator);
3025 strsrch->ownCollator = FALSE;
3026 }
3027 strsrch->collator = collator;
3028 strsrch->strength = ucol_getStrength(collator);
3029 strsrch->ceMask = getMask(strsrch->strength);
46f4442e 3030#if !UCONFIG_NO_BREAK_ITERATION
4388f060
A
3031 ubrk_close(strsrch->search->internalBreakIter);
3032 strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
3033 strsrch->search->text, strsrch->search->textLength, status);
46f4442e 3034#endif
b75a7d8f 3035 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
729e4ab9
A
3036 strsrch->toShift =
3037 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
b75a7d8f
A
3038 UCOL_SHIFTED;
3039 // if status is a failure, ucol_getVariableTop returns 0
3040 strsrch->variableTop = ucol_getVariableTop(collator, status);
57a6839d
A
3041 strsrch->textIter = ucol_openElements(collator,
3042 strsrch->search->text,
3043 strsrch->search->textLength,
3044 status);
3045 strsrch->utilIter = ucol_openElements(
3046 collator, strsrch->pattern.text, strsrch->pattern.textLength, status);
3047 // initialize() _after_ setting the iterators for the new collator.
3048 initialize(strsrch, status);
b75a7d8f 3049 }
46f4442e
A
3050
3051 // **** are these calls needed?
3052 // **** we call uprv_init_pce in initializePatternPCETable
b331163b 3053 // **** and the CEIBuffer constructor...
46f4442e
A
3054#if 0
3055 uprv_init_pce(strsrch->textIter);
3056 uprv_init_pce(strsrch->utilIter);
3057#endif
b75a7d8f
A
3058 }
3059}
3060
3061U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
3062{
3063 if (strsrch) {
3064 return (UCollator *)strsrch->collator;
3065 }
3066 return NULL;
3067}
3068
729e4ab9 3069U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch,
b75a7d8f
A
3070 const UChar *pattern,
3071 int32_t patternlength,
3072 UErrorCode *status)
3073{
3074 if (U_SUCCESS(*status)) {
3075 if (strsrch == NULL || pattern == NULL) {
3076 *status = U_ILLEGAL_ARGUMENT_ERROR;
3077 }
3078 else {
3079 if (patternlength == -1) {
3080 patternlength = u_strlen(pattern);
3081 }
3082 if (patternlength == 0) {
3083 *status = U_ILLEGAL_ARGUMENT_ERROR;
3084 return;
3085 }
3086 strsrch->pattern.text = pattern;
3087 strsrch->pattern.textLength = patternlength;
3088 initialize(strsrch, status);
3089 }
3090 }
3091}
3092
729e4ab9
A
3093U_CAPI const UChar* U_EXPORT2
3094usearch_getPattern(const UStringSearch *strsrch,
b75a7d8f
A
3095 int32_t *length)
3096{
3097 if (strsrch) {
3098 *length = strsrch->pattern.textLength;
3099 return strsrch->pattern.text;
3100 }
3101 return NULL;
3102}
3103
3104// miscellanous methods --------------------------------------------------
3105
729e4ab9
A
3106U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
3107 UErrorCode *status)
b75a7d8f
A
3108{
3109 if (strsrch && U_SUCCESS(*status)) {
3110 strsrch->search->isForwardSearching = TRUE;
3111 usearch_setOffset(strsrch, 0, status);
3112 if (U_SUCCESS(*status)) {
3113 return usearch_next(strsrch, status);
3114 }
3115 }
3116 return USEARCH_DONE;
3117}
3118
729e4ab9 3119U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
b75a7d8f
A
3120 int32_t position,
3121 UErrorCode *status)
3122{
3123 if (strsrch && U_SUCCESS(*status)) {
3124 strsrch->search->isForwardSearching = TRUE;
3125 // position checked in usearch_setOffset
3126 usearch_setOffset(strsrch, position, status);
3127 if (U_SUCCESS(*status)) {
729e4ab9 3128 return usearch_next(strsrch, status);
b75a7d8f
A
3129 }
3130 }
3131 return USEARCH_DONE;
3132}
729e4ab9
A
3133
3134U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
b75a7d8f
A
3135 UErrorCode *status)
3136{
3137 if (strsrch && U_SUCCESS(*status)) {
3138 strsrch->search->isForwardSearching = FALSE;
3139 usearch_setOffset(strsrch, strsrch->search->textLength, status);
3140 if (U_SUCCESS(*status)) {
3141 return usearch_previous(strsrch, status);
3142 }
3143 }
3144 return USEARCH_DONE;
3145}
3146
729e4ab9 3147U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
b75a7d8f
A
3148 int32_t position,
3149 UErrorCode *status)
3150{
3151 if (strsrch && U_SUCCESS(*status)) {
3152 strsrch->search->isForwardSearching = FALSE;
3153 // position checked in usearch_setOffset
3154 usearch_setOffset(strsrch, position, status);
3155 if (U_SUCCESS(*status)) {
729e4ab9 3156 return usearch_previous(strsrch, status);
b75a7d8f
A
3157 }
3158 }
3159 return USEARCH_DONE;
3160}
729e4ab9 3161
b75a7d8f 3162/**
729e4ab9
A
3163* If a direction switch is required, we'll count the number of ces till the
3164* beginning of the collation element iterator and iterate forwards that
3165* number of times. This is so that we get to the correct point within the
b75a7d8f
A
3166* string to continue the search in. Imagine when we are in the middle of the
3167* normalization buffer when the change in direction is request. arrrgghh....
3168* After searching the offset within the collation element iterator will be
3169* shifted to the start of the match. If a match is not found, the offset would
729e4ab9 3170* have been set to the end of the text string in the collation element
b75a7d8f
A
3171* iterator.
3172* Okay, here's my take on normalization buffer. The only time when there can
3173* be 2 matches within the same normalization is when the pattern is consists
3174* of all accents. But since the offset returned is from the text string, we
729e4ab9 3175* should not confuse the caller by returning the second match within the
b75a7d8f
A
3176* same normalization buffer. If we do, the 2 results will have the same match
3177* offsets, and that'll be confusing. I'll return the next match that doesn't
729e4ab9 3178* fall within the same normalization buffer. Note this does not affect the
b75a7d8f
A
3179* results of matches spanning the text and the normalization buffer.
3180* The position to start searching is taken from the collation element
3181* iterator. Callers of this API would have to set the offset in the collation
3182* element iterator before using this method.
3183*/
3184U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
3185 UErrorCode *status)
729e4ab9 3186{
b75a7d8f 3187 if (U_SUCCESS(*status) && strsrch) {
374ca955
A
3188 // note offset is either equivalent to the start of the previous match
3189 // or is set by the user
3190 int32_t offset = usearch_getOffset(strsrch);
3191 USearch *search = strsrch->search;
3192 search->reset = FALSE;
3193 int32_t textlength = search->textLength;
b75a7d8f 3194 if (search->isForwardSearching) {
46f4442e 3195#if BOYER_MOORE
374ca955 3196 if (offset == textlength
729e4ab9 3197 || (!search->isOverlap &&
b75a7d8f 3198 (offset + strsrch->pattern.defaultShiftSize > textlength ||
729e4ab9 3199 (search->matchedIndex != USEARCH_DONE &&
374ca955 3200 offset + search->matchedLength >= textlength)))) {
b75a7d8f
A
3201 // not enough characters to match
3202 setMatchNotFound(strsrch);
729e4ab9 3203 return USEARCH_DONE;
b75a7d8f 3204 }
46f4442e
A
3205#else
3206 if (offset == textlength ||
3207 (! search->isOverlap &&
3208 (search->matchedIndex != USEARCH_DONE &&
3209 offset + search->matchedLength > textlength))) {
3210 // not enough characters to match
3211 setMatchNotFound(strsrch);
3212 return USEARCH_DONE;
3213 }
3214#endif
b75a7d8f
A
3215 }
3216 else {
729e4ab9
A
3217 // switching direction.
3218 // if matchedIndex == USEARCH_DONE, it means that either a
b75a7d8f 3219 // setOffset has been called or that previous ran off the text
729e4ab9 3220 // string. the iterator would have been set to offset 0 if a
b75a7d8f
A
3221 // match is not found.
3222 search->isForwardSearching = TRUE;
374ca955 3223 if (search->matchedIndex != USEARCH_DONE) {
b75a7d8f
A
3224 // there's no need to set the collation element iterator
3225 // the next call to next will set the offset.
374ca955 3226 return search->matchedIndex;
b75a7d8f
A
3227 }
3228 }
3229
3230 if (U_SUCCESS(*status)) {
b331163b 3231 if (strsrch->pattern.cesLength == 0) {
374ca955 3232 if (search->matchedIndex == USEARCH_DONE) {
b75a7d8f
A
3233 search->matchedIndex = offset;
3234 }
3235 else { // moves by codepoints
4388f060 3236 U16_FWD_1(search->text, search->matchedIndex, textlength);
b75a7d8f 3237 }
729e4ab9 3238
b75a7d8f
A
3239 search->matchedLength = 0;
3240 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3241 // status checked below
3242 if (search->matchedIndex == textlength) {
3243 search->matchedIndex = USEARCH_DONE;
3244 }
3245 }
3246 else {
374ca955
A
3247 if (search->matchedLength > 0) {
3248 // if matchlength is 0 we are at the start of the iteration
3249 if (search->isOverlap) {
3250 ucol_setOffset(strsrch->textIter, offset + 1, status);
3251 }
3252 else {
729e4ab9 3253 ucol_setOffset(strsrch->textIter,
374ca955
A
3254 offset + search->matchedLength, status);
3255 }
3256 }
3257 else {
3258 // for boundary check purposes. this will ensure that the
3259 // next match will not preceed the current offset
3260 // note search->matchedIndex will always be set to something
3261 // in the code
3262 search->matchedIndex = offset - 1;
3263 }
3264
3265 if (search->isCanonicalMatch) {
3266 // can't use exact here since extra accents are allowed.
3267 usearch_handleNextCanonical(strsrch, status);
3268 }
3269 else {
3270 usearch_handleNextExact(strsrch, status);
3271 }
3272 }
3273
b75a7d8f
A
3274 if (U_FAILURE(*status)) {
3275 return USEARCH_DONE;
3276 }
374ca955 3277
46f4442e
A
3278#if !BOYER_MOORE
3279 if (search->matchedIndex == USEARCH_DONE) {
3280 ucol_setOffset(strsrch->textIter, search->textLength, status);
3281 } else {
3282 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
3283 }
3284#endif
3285
b75a7d8f
A
3286 return search->matchedIndex;
3287 }
3288 }
3289 return USEARCH_DONE;
3290}
3291
3292U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
3293 UErrorCode *status)
3294{
3295 if (U_SUCCESS(*status) && strsrch) {
3296 int32_t offset;
3297 USearch *search = strsrch->search;
3298 if (search->reset) {
3299 offset = search->textLength;
3300 search->isForwardSearching = FALSE;
3301 search->reset = FALSE;
3302 setColEIterOffset(strsrch->textIter, offset);
3303 }
3304 else {
3305 offset = usearch_getOffset(strsrch);
3306 }
729e4ab9 3307
b75a7d8f
A
3308 int32_t matchedindex = search->matchedIndex;
3309 if (search->isForwardSearching == TRUE) {
729e4ab9
A
3310 // switching direction.
3311 // if matchedIndex == USEARCH_DONE, it means that either a
b75a7d8f 3312 // setOffset has been called or that next ran off the text
729e4ab9 3313 // string. the iterator would have been set to offset textLength if
b75a7d8f
A
3314 // a match is not found.
3315 search->isForwardSearching = FALSE;
3316 if (matchedindex != USEARCH_DONE) {
3317 return matchedindex;
3318 }
3319 }
3320 else {
46f4442e 3321#if BOYER_MOORE
b75a7d8f 3322 if (offset == 0 || matchedindex == 0 ||
729e4ab9 3323 (!search->isOverlap &&
b75a7d8f 3324 (offset < strsrch->pattern.defaultShiftSize ||
729e4ab9 3325 (matchedindex != USEARCH_DONE &&
b75a7d8f
A
3326 matchedindex < strsrch->pattern.defaultShiftSize)))) {
3327 // not enough characters to match
3328 setMatchNotFound(strsrch);
729e4ab9 3329 return USEARCH_DONE;
b75a7d8f 3330 }
46f4442e
A
3331#else
3332 // Could check pattern length, but the
3333 // linear search will do the right thing
3334 if (offset == 0 || matchedindex == 0) {
3335 setMatchNotFound(strsrch);
3336 return USEARCH_DONE;
3337 }
3338#endif
b75a7d8f
A
3339 }
3340
3341 if (U_SUCCESS(*status)) {
b331163b 3342 if (strsrch->pattern.cesLength == 0) {
729e4ab9 3343 search->matchedIndex =
b75a7d8f
A
3344 (matchedindex == USEARCH_DONE ? offset : matchedindex);
3345 if (search->matchedIndex == 0) {
3346 setMatchNotFound(strsrch);
3347 // status checked below
3348 }
3349 else { // move by codepoints
4388f060 3350 U16_BACK_1(search->text, 0, search->matchedIndex);
b75a7d8f
A
3351 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3352 // status checked below
3353 search->matchedLength = 0;
3354 }
3355 }
3356 else {
3357 if (strsrch->search->isCanonicalMatch) {
3358 // can't use exact here since extra accents are allowed.
3359 usearch_handlePreviousCanonical(strsrch, status);
3360 // status checked below
3361 }
3362 else {
3363 usearch_handlePreviousExact(strsrch, status);
3364 // status checked below
3365 }
3366 }
3367
3368 if (U_FAILURE(*status)) {
3369 return USEARCH_DONE;
3370 }
729e4ab9 3371
b75a7d8f
A
3372 return search->matchedIndex;
3373 }
3374 }
3375 return USEARCH_DONE;
3376}
3377
3378
729e4ab9 3379
b75a7d8f
A
3380U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
3381{
729e4ab9
A
3382 /*
3383 reset is setting the attributes that are already in
b75a7d8f
A
3384 string search, hence all attributes in the collator should
3385 be retrieved without any problems
3386 */
3387 if (strsrch) {
3388 UErrorCode status = U_ZERO_ERROR;
3389 UBool sameCollAttribute = TRUE;
3390 uint32_t ceMask;
3391 UBool shift;
3392 uint32_t varTop;
3393
729e4ab9
A
3394 // **** hack to deal w/ how processed CEs encode quaternary ****
3395 UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
3396 if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
3397 (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
3398 sameCollAttribute = FALSE;
3399 }
3400
b75a7d8f
A
3401 strsrch->strength = ucol_getStrength(strsrch->collator);
3402 ceMask = getMask(strsrch->strength);
3403 if (strsrch->ceMask != ceMask) {
3404 strsrch->ceMask = ceMask;
3405 sameCollAttribute = FALSE;
3406 }
729e4ab9 3407
b75a7d8f 3408 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
729e4ab9 3409 shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
b75a7d8f
A
3410 &status) == UCOL_SHIFTED;
3411 if (strsrch->toShift != shift) {
3412 strsrch->toShift = shift;
3413 sameCollAttribute = FALSE;
3414 }
3415
3416 // if status is a failure, ucol_getVariableTop returns 0
3417 varTop = ucol_getVariableTop(strsrch->collator, &status);
3418 if (strsrch->variableTop != varTop) {
3419 strsrch->variableTop = varTop;
3420 sameCollAttribute = FALSE;
3421 }
3422 if (!sameCollAttribute) {
3423 initialize(strsrch, &status);
3424 }
57a6839d 3425 ucol_setText(strsrch->textIter, strsrch->search->text,
729e4ab9 3426 strsrch->search->textLength,
729e4ab9 3427 &status);
b75a7d8f
A
3428 strsrch->search->matchedLength = 0;
3429 strsrch->search->matchedIndex = USEARCH_DONE;
3430 strsrch->search->isOverlap = FALSE;
3431 strsrch->search->isCanonicalMatch = FALSE;
729e4ab9 3432 strsrch->search->elementComparisonType = 0;
b75a7d8f
A
3433 strsrch->search->isForwardSearching = TRUE;
3434 strsrch->search->reset = TRUE;
3435 }
3436}
3437
46f4442e
A
3438//
3439// CEI Collation Element + source text index.
3440// These structs are kept in the circular buffer.
3441//
3442struct CEI {
3443 int64_t ce;
3444 int32_t lowIndex;
3445 int32_t highIndex;
3446};
3447
3448U_NAMESPACE_BEGIN
3449
57a6839d 3450namespace {
46f4442e 3451//
b331163b 3452// CEIBuffer A circular buffer of CEs-with-index from the text being searched.
46f4442e 3453//
4388f060
A
3454#define DEFAULT_CEBUFFER_SIZE 96
3455#define CEBUFFER_EXTRA 32
3456// Some typical max values to make buffer size more reasonable for asymmetric search.
3457// #8694 is for a better long-term solution to allocation of this buffer.
3458#define MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
3459#define MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
3460#define MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
b331163b 3461struct CEIBuffer {
46f4442e
A
3462 CEI defBuf[DEFAULT_CEBUFFER_SIZE];
3463 CEI *buf;
3464 int32_t bufSize;
3465 int32_t firstIx;
3466 int32_t limitIx;
3467 UCollationElements *ceIter;
3468 UStringSearch *strSearch;
3469
3470
3471
b331163b
A
3472 CEIBuffer(UStringSearch *ss, UErrorCode *status);
3473 ~CEIBuffer();
46f4442e
A
3474 const CEI *get(int32_t index);
3475 const CEI *getPrevious(int32_t index);
3476};
3477
3478
b331163b 3479CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) {
46f4442e
A
3480 buf = defBuf;
3481 strSearch = ss;
b331163b 3482 bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA;
4388f060
A
3483 if (ss->search->elementComparisonType != 0) {
3484 const UChar * patText = ss->pattern.text;
3485 if (patText) {
3486 const UChar * patTextLimit = patText + ss->pattern.textLength;
3487 while ( patText < patTextLimit ) {
3488 UChar c = *patText++;
3489 if (MIGHT_BE_JAMO_L(c)) {
3490 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
3491 } else {
3492 // No check for surrogates, we might allocate slightly more buffer than necessary.
3493 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
3494 }
3495 }
3496 }
3497 }
46f4442e
A
3498 ceIter = ss->textIter;
3499 firstIx = 0;
3500 limitIx = 0;
3501
57a6839d 3502 if (!initTextProcessedIter(ss, status)) { return; }
46f4442e
A
3503
3504 if (bufSize>DEFAULT_CEBUFFER_SIZE) {
3505 buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
3506 if (buf == NULL) {
3507 *status = U_MEMORY_ALLOCATION_ERROR;
3508 }
3509 }
3510}
3511
3512// TODO: add a reset or init function so that allocated
3513// buffers can be retained & reused.
3514
b331163b 3515CEIBuffer::~CEIBuffer() {
46f4442e
A
3516 if (buf != defBuf) {
3517 uprv_free(buf);
3518 }
3519}
3520
3521
3522// Get the CE with the specified index.
3523// Index must be in the range
3524// n-history_size < index < n+1
3525// where n is the largest index to have been fetched by some previous call to this function.
3526// The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3527//
b331163b 3528const CEI *CEIBuffer::get(int32_t index) {
46f4442e
A
3529 int i = index % bufSize;
3530
3531 if (index>=firstIx && index<limitIx) {
3532 // The request was for an entry already in our buffer.
3533 // Just return it.
3534 return &buf[i];
3535 }
3536
3537 // Caller is requesting a new, never accessed before, CE.
3538 // Verify that it is the next one in sequence, which is all
3539 // that is allowed.
3540 if (index != limitIx) {
3541 U_ASSERT(FALSE);
3542
3543 return NULL;
3544 }
3545
3546 // Manage the circular CE buffer indexing
3547 limitIx++;
3548
3549 if (limitIx - firstIx >= bufSize) {
3550 // The buffer is full, knock out the lowest-indexed entry.
3551 firstIx++;
3552 }
3553
3554 UErrorCode status = U_ZERO_ERROR;
3555
57a6839d 3556 buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
46f4442e
A
3557
3558 return &buf[i];
3559}
3560
3561// Get the CE with the specified index.
3562// Index must be in the range
3563// n-history_size < index < n+1
3564// where n is the largest index to have been fetched by some previous call to this function.
3565// The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3566//
b331163b 3567const CEI *CEIBuffer::getPrevious(int32_t index) {
46f4442e
A
3568 int i = index % bufSize;
3569
3570 if (index>=firstIx && index<limitIx) {
3571 // The request was for an entry already in our buffer.
3572 // Just return it.
3573 return &buf[i];
3574 }
3575
3576 // Caller is requesting a new, never accessed before, CE.
3577 // Verify that it is the next one in sequence, which is all
3578 // that is allowed.
3579 if (index != limitIx) {
3580 U_ASSERT(FALSE);
3581
3582 return NULL;
3583 }
3584
3585 // Manage the circular CE buffer indexing
3586 limitIx++;
3587
3588 if (limitIx - firstIx >= bufSize) {
3589 // The buffer is full, knock out the lowest-indexed entry.
3590 firstIx++;
3591 }
3592
3593 UErrorCode status = U_ZERO_ERROR;
3594
57a6839d 3595 buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
46f4442e
A
3596
3597 return &buf[i];
3598}
3599
57a6839d
A
3600}
3601
46f4442e
A
3602U_NAMESPACE_END
3603
3604
3605// #define USEARCH_DEBUG
3606
3607#ifdef USEARCH_DEBUG
3608#include <stdio.h>
3609#include <stdlib.h>
3610#endif
3611
3612/*
3613 * Find the next break boundary after startIndex. If the UStringSearch object
3614 * has an external break iterator, use that. Otherwise use the internal character
3615 * break iterator.
3616 */
3617static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
3618#if 0
3619 const UChar *text = strsrch->search->text;
3620 int32_t textLen = strsrch->search->textLength;
729e4ab9 3621
46f4442e
A
3622 U_ASSERT(startIndex>=0);
3623 U_ASSERT(startIndex<=textLen);
729e4ab9 3624
46f4442e
A
3625 if (startIndex >= textLen) {
3626 return startIndex;
3627 }
3628
3629 UChar32 c;
3630 int32_t i = startIndex;
3631 U16_NEXT(text, i, textLen, c);
729e4ab9 3632
46f4442e
A
3633 // If we are on a control character, stop without looking for combining marks.
3634 // Control characters do not combine.
3635 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3636 if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
3637 return i;
3638 }
729e4ab9 3639
46f4442e
A
3640 // The initial character was not a control, and can thus accept trailing
3641 // combining characters. Advance over however many of them there are.
3642 int32_t indexOfLastCharChecked;
3643 for (;;) {
3644 indexOfLastCharChecked = i;
3645 if (i>=textLen) {
3646 break;
3647 }
3648 U16_NEXT(text, i, textLen, c);
3649 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3650 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3651 break;
3652 }
3653 }
3654 return indexOfLastCharChecked;
3655#elif !UCONFIG_NO_BREAK_ITERATION
3656 UBreakIterator *breakiterator = strsrch->search->breakIter;
3657
3658 if (breakiterator == NULL) {
3659 breakiterator = strsrch->search->internalBreakIter;
3660 }
3661
3662 if (breakiterator != NULL) {
4388f060 3663 return ubrk_following(breakiterator, startIndex);
46f4442e
A
3664 }
3665
3666 return startIndex;
3667#else
3668 // **** or should we use the original code? ****
3669 return startIndex;
3670#endif
3671
3672}
3673
3674/*
3675 * Returns TRUE if index is on a break boundary. If the UStringSearch
3676 * has an external break iterator, test using that, otherwise test
3677 * using the internal character break iterator.
3678 */
3679static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
3680#if 0
3681 const UChar *text = strsrch->search->text;
3682 int32_t textLen = strsrch->search->textLength;
729e4ab9 3683
46f4442e
A
3684 U_ASSERT(index>=0);
3685 U_ASSERT(index<=textLen);
729e4ab9 3686
46f4442e 3687 if (index>=textLen || index<=0) {
4388f060 3688 return TRUE;
46f4442e 3689 }
729e4ab9 3690
46f4442e
A
3691 // If the character at the current index is not a GRAPHEME_EXTEND
3692 // then we can not be within a combining sequence.
3693 UChar32 c;
3694 U16_GET(text, 0, index, textLen, c);
3695 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3696 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
4388f060 3697 return TRUE;
46f4442e 3698 }
729e4ab9 3699
46f4442e
A
3700 // We are at a combining mark. If the preceding character is anything
3701 // except a CONTROL, CR or LF, we are in a combining sequence.
729e4ab9 3702 U16_PREV(text, 0, index, c);
46f4442e 3703 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
729e4ab9 3704 UBool combining = !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
4388f060 3705 return !combining;
46f4442e
A
3706#elif !UCONFIG_NO_BREAK_ITERATION
3707 UBreakIterator *breakiterator = strsrch->search->breakIter;
3708
3709 if (breakiterator == NULL) {
3710 breakiterator = strsrch->search->internalBreakIter;
3711 }
3712
4388f060 3713 return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));
46f4442e
A
3714#else
3715 // **** or use the original code? ****
4388f060 3716 return TRUE;
46f4442e 3717#endif
729e4ab9 3718}
46f4442e
A
3719
3720#if 0
3721static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
3722{
3723#if !UCONFIG_NO_BREAK_ITERATION
3724 UBreakIterator *breakiterator = strsrch->search->breakIter;
3725
3726 if (breakiterator != NULL) {
3727 int32_t startindex = ubrk_first(breakiterator);
3728 int32_t endindex = ubrk_last(breakiterator);
729e4ab9 3729
46f4442e
A
3730 // out-of-range indexes are never boundary positions
3731 if (start < startindex || start > endindex ||
3732 end < startindex || end > endindex) {
3733 return FALSE;
3734 }
3735
729e4ab9 3736 return ubrk_isBoundary(breakiterator, start) &&
46f4442e
A
3737 ubrk_isBoundary(breakiterator, end);
3738 }
3739#endif
3740
3741 return TRUE;
3742}
3743#endif
3744
729e4ab9
A
3745typedef enum {
3746 U_CE_MATCH = -1,
3747 U_CE_NO_MATCH = 0,
3748 U_CE_SKIP_TARG,
3749 U_CE_SKIP_PATN
3750} UCompareCEsResult;
3751#define U_CE_LEVEL2_BASE 0x00000005
3752#define U_CE_LEVEL3_BASE 0x00050000
3753
3754static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
3755 if (targCE == patCE) {
3756 return U_CE_MATCH;
3757 }
3758 if (compareType == 0) {
3759 return U_CE_NO_MATCH;
3760 }
46f4442e 3761
729e4ab9
A
3762 int64_t targCEshifted = targCE >> 32;
3763 int64_t patCEshifted = patCE >> 32;
3764 int64_t mask;
3765
3766 mask = 0xFFFF0000;
3767 int32_t targLev1 = (int32_t)(targCEshifted & mask);
3768 int32_t patLev1 = (int32_t)(patCEshifted & mask);
3769 if ( targLev1 != patLev1 ) {
3770 if ( targLev1 == 0 ) {
3771 return U_CE_SKIP_TARG;
3772 }
3773 if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3774 return U_CE_SKIP_PATN;
3775 }
3776 return U_CE_NO_MATCH;
3777 }
3778
3779 mask = 0x0000FFFF;
3780 int32_t targLev2 = (int32_t)(targCEshifted & mask);
3781 int32_t patLev2 = (int32_t)(patCEshifted & mask);
3782 if ( targLev2 != patLev2 ) {
3783 if ( targLev2 == 0 ) {
3784 return U_CE_SKIP_TARG;
3785 }
3786 if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3787 return U_CE_SKIP_PATN;
3788 }
3789 return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
3790 U_CE_MATCH: U_CE_NO_MATCH;
3791 }
3792
3793 mask = 0xFFFF0000;
3794 int32_t targLev3 = (int32_t)(targCE & mask);
3795 int32_t patLev3 = (int32_t)(patCE & mask);
3796 if ( targLev3 != patLev3 ) {
3797 return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
3798 U_CE_MATCH: U_CE_NO_MATCH;
3799 }
3800
3801 return U_CE_MATCH;
3802}
3803
3804#if BOYER_MOORE
3805// TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
3806#endif
3807
46f4442e
A
3808U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
3809 int32_t startIdx,
3810 int32_t *matchStart,
3811 int32_t *matchLimit,
729e4ab9 3812 UErrorCode *status)
46f4442e
A
3813{
3814 if (U_FAILURE(*status)) {
3815 return FALSE;
3816 }
3817
3818 // TODO: reject search patterns beginning with a combining char.
3819
3820#ifdef USEARCH_DEBUG
3821 if (getenv("USEARCH_DEBUG") != NULL) {
3822 printf("Pattern CEs\n");
b331163b
A
3823 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
3824 printf(" %8x", strsrch->pattern.ces[ii]);
46f4442e
A
3825 }
3826 printf("\n");
3827 }
729e4ab9 3828
46f4442e
A
3829#endif
3830 // Input parameter sanity check.
3831 // TODO: should input indicies clip to the text length
3832 // in the same way that UText does.
b331163b 3833 if(strsrch->pattern.cesLength == 0 ||
46f4442e
A
3834 startIdx < 0 ||
3835 startIdx > strsrch->search->textLength ||
b331163b 3836 strsrch->pattern.ces == NULL) {
46f4442e
A
3837 *status = U_ILLEGAL_ARGUMENT_ERROR;
3838 return FALSE;
3839 }
3840
b331163b 3841 if (strsrch->pattern.pces == NULL) {
46f4442e
A
3842 initializePatternPCETable(strsrch, status);
3843 }
3844
3845 ucol_setOffset(strsrch->textIter, startIdx, status);
b331163b 3846 CEIBuffer ceb(strsrch, status);
46f4442e 3847
729e4ab9
A
3848
3849 int32_t targetIx = 0;
3850 const CEI *targetCEI = NULL;
46f4442e
A
3851 int32_t patIx;
3852 UBool found;
3853
3854 int32_t mStart = -1;
3855 int32_t mLimit = -1;
3856 int32_t minLimit;
3857 int32_t maxLimit;
729e4ab9
A
3858
3859
3860
46f4442e
A
3861 // Outer loop moves over match starting positions in the
3862 // target CE space.
729e4ab9
A
3863 // Here we see the target as a sequence of collation elements, resulting from the following:
3864 // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
3865 // (for example, digraphs such as IJ may be broken into two characters).
3866 // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
3867 // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
3868 // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
3869 // CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
3870 // then the CE is deleted, so the following code sees only CEs that are relevant.
3871 // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
3872 // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
3873 // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
3874 //
46f4442e
A
3875 for(targetIx=0; ; targetIx++)
3876 {
3877 found = TRUE;
3878 // Inner loop checks for a match beginning at each
3879 // position from the outer loop.
729e4ab9
A
3880 int32_t targetIxOffset = 0;
3881 int64_t patCE = 0;
4388f060
A
3882 // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
3883 // (compared to the last CE fetched for the previous targetIx value) as we need to go
3884 // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
3885 const CEI *firstCEI = ceb.get(targetIx);
3886 if (firstCEI == NULL) {
3887 *status = U_INTERNAL_PROGRAM_ERROR;
3888 found = FALSE;
3889 break;
3890 }
3891
b331163b
A
3892 for (patIx=0; patIx<strsrch->pattern.pcesLength; patIx++) {
3893 patCE = strsrch->pattern.pces[patIx];
729e4ab9 3894 targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
46f4442e 3895 // Compare CE from target string with CE from the pattern.
729e4ab9 3896 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
46f4442e 3897 // which will fail the compare, below.
729e4ab9
A
3898 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
3899 if ( ceMatch == U_CE_NO_MATCH ) {
46f4442e
A
3900 found = FALSE;
3901 break;
729e4ab9
A
3902 } else if ( ceMatch > U_CE_NO_MATCH ) {
3903 if ( ceMatch == U_CE_SKIP_TARG ) {
3904 // redo with same patCE, next targCE
3905 patIx--;
3906 targetIxOffset++;
3907 } else { // ceMatch == U_CE_SKIP_PATN
3908 // redo with same targCE, next patCE
3909 targetIxOffset--;
3910 }
46f4442e
A
3911 }
3912 }
b331163b 3913 targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far
46f4442e 3914
729e4ab9 3915 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
46f4442e
A
3916 // No match at this targetIx. Try again at the next.
3917 continue;
3918 }
3919
3920 if (!found) {
3921 // No match at all, we have run off the end of the target text.
3922 break;
3923 }
3924
3925
3926 // We have found a match in CE space.
3927 // Now determine the bounds in string index space.
3928 // There still is a chance of match failure if the CE range not correspond to
3929 // an acceptable character range.
3930 //
729e4ab9 3931 const CEI *lastCEI = ceb.get(targetIx + targetIxOffset - 1);
46f4442e 3932
46f4442e
A
3933 mStart = firstCEI->lowIndex;
3934 minLimit = lastCEI->lowIndex;
46f4442e
A
3935
3936 // Look at the CE following the match. If it is UCOL_NULLORDER the match
3937 // extended to the end of input, and the match is good.
3938
3939 // Look at the high and low indices of the CE following the match. If
3940 // they are the same it means one of two things:
3941 // 1. The match extended to the last CE from the target text, which is OK, or
3942 // 2. The last CE that was part of the match is in an expansion that extends
3943 // to the first CE after the match. In this case, we reject the match.
4388f060 3944 const CEI *nextCEI = 0;
729e4ab9 3945 if (strsrch->search->elementComparisonType == 0) {
4388f060 3946 nextCEI = ceb.get(targetIx + targetIxOffset);
729e4ab9
A
3947 maxLimit = nextCEI->lowIndex;
3948 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
3949 found = FALSE;
3950 }
3951 } else {
729e4ab9
A
3952 for ( ; ; ++targetIxOffset ) {
3953 nextCEI = ceb.get(targetIx + targetIxOffset);
3954 maxLimit = nextCEI->lowIndex;
4388f060 3955 // If we are at the end of the target too, match succeeds
729e4ab9
A
3956 if ( nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
3957 break;
3958 }
3959 // As long as the next CE has primary weight of 0,
3960 // it is part of the last target element matched by the pattern;
3961 // make sure it can be part of a match with the last patCE
3962 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
4388f060
A
3963 UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
3964 if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {
3965 found = FALSE;
3966 break;
3967 }
729e4ab9
A
3968 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
3969 // target element, but it has non-zero primary weight => match fails
3970 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
4388f060
A
3971 found = false;
3972 break;
729e4ab9
A
3973 // Else the target CE is not part of an expansion of the last matched element, match succeeds
3974 } else {
4388f060 3975 break;
729e4ab9
A
3976 }
3977 }
46f4442e 3978 }
729e4ab9 3979
46f4442e
A
3980
3981 // Check for the start of the match being within a combining sequence.
3982 // This can happen if the pattern itself begins with a combining char, and
3983 // the match found combining marks in the target text that were attached
3984 // to something else.
3985 // This type of match should be rejected for not completely consuming a
3986 // combining sequence.
4388f060 3987 if (!isBreakBoundary(strsrch, mStart)) {
46f4442e
A
3988 found = FALSE;
3989 }
3990
3991 // Check for the start of the match being within an Collation Element Expansion,
3992 // meaning that the first char of the match is only partially matched.
729e4ab9 3993 // With exapnsions, the first CE will report the index of the source
46f4442e 3994 // character, and all subsequent (expansions) CEs will report the source index of the
729e4ab9 3995 // _following_ character.
46f4442e
A
3996 int32_t secondIx = firstCEI->highIndex;
3997 if (mStart == secondIx) {
3998 found = FALSE;
3999 }
729e4ab9 4000
46f4442e
A
4001 // Advance the match end position to the first acceptable match boundary.
4002 // This advances the index over any combining charcters.
4003 mLimit = maxLimit;
4004 if (minLimit < maxLimit) {
4388f060
A
4005 // When the last CE's low index is same with its high index, the CE is likely
4006 // a part of expansion. In this case, the index is located just after the
4007 // character corresponding to the CEs compared above. If the index is right
4008 // at the break boundary, move the position to the next boundary will result
4009 // incorrect match length when there are ignorable characters exist between
4010 // the position and the next character produces CE(s). See ticket#8482.
4011 if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {
4012 mLimit = minLimit;
4013 } else {
4014 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4015 if (nba >= lastCEI->highIndex) {
4016 mLimit = nba;
4017 }
46f4442e
A
4018 }
4019 }
729e4ab9 4020
46f4442e
A
4021 #ifdef USEARCH_DEBUG
4022 if (getenv("USEARCH_DEBUG") != NULL) {
4023 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4024 }
4025 #endif
729e4ab9 4026
b331163b
A
4027 // If default breakIter is being used, and next collation element belonging to this
4028 // combining sequence has non-zero primary weight and corresponds to a separate
4029 // character following the one at end of the current match, then do NOT require
4030 // that match end position be on a breakIter boundary, or that end of the
4031 // combining sequence not extend beyond the match in CE space. Only do those
4032 // tests if the conditions above are not met. Added this to make prefix search
4033 // work in Indic scripts per <rdar://problem/18063262>.
4034 UBool doLimitTests = !(strsrch->search->breakIter == NULL &&
4035 nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4036 nextCEI->lowIndex >= lastCEI->highIndex && nextCEI->highIndex > nextCEI->lowIndex);
4037
4038 if (doLimitTests) { // <rdar://problem/18063262>
4039 // If advancing to the end of a combining sequence in character indexing space
4040 // advanced us beyond the end of the match in CE space, reject this match.
4041 if (mLimit > maxLimit) {
4042 found = FALSE;
4043 }
46f4442e 4044
b331163b
A
4045 if (!isBreakBoundary(strsrch, mLimit)) {
4046 found = FALSE;
4047 }
46f4442e
A
4048 }
4049
729e4ab9
A
4050 if (! checkIdentical(strsrch, mStart, mLimit)) {
4051 found = FALSE;
4052 }
4053
46f4442e
A
4054 if (found) {
4055 break;
4056 }
4057 }
4058
4059 #ifdef USEARCH_DEBUG
4060 if (getenv("USEARCH_DEBUG") != NULL) {
4061 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4062 int32_t lastToPrint = ceb.limitIx+2;
4063 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4064 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4065 }
4066 printf("\n%s\n", found? "match found" : "no match");
4067 }
4068 #endif
4069
4070 // All Done. Store back the match bounds to the caller.
4071 //
4072 if (found==FALSE) {
4073 mLimit = -1;
4074 mStart = -1;
4075 }
4076
4077 if (matchStart != NULL) {
4078 *matchStart= mStart;
4079 }
4080
4081 if (matchLimit != NULL) {
4082 *matchLimit = mLimit;
4083 }
4084
4085 return found;
4086}
4087
46f4442e
A
4088U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
4089 int32_t startIdx,
4090 int32_t *matchStart,
4091 int32_t *matchLimit,
729e4ab9 4092 UErrorCode *status)
46f4442e
A
4093{
4094 if (U_FAILURE(*status)) {
4095 return FALSE;
4096 }
4097
4098 // TODO: reject search patterns beginning with a combining char.
4099
4100#ifdef USEARCH_DEBUG
4101 if (getenv("USEARCH_DEBUG") != NULL) {
4102 printf("Pattern CEs\n");
b331163b
A
4103 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
4104 printf(" %8x", strsrch->pattern.ces[ii]);
46f4442e
A
4105 }
4106 printf("\n");
4107 }
729e4ab9 4108
46f4442e
A
4109#endif
4110 // Input parameter sanity check.
4111 // TODO: should input indicies clip to the text length
4112 // in the same way that UText does.
b331163b 4113 if(strsrch->pattern.cesLength == 0 ||
46f4442e
A
4114 startIdx < 0 ||
4115 startIdx > strsrch->search->textLength ||
b331163b 4116 strsrch->pattern.ces == NULL) {
46f4442e
A
4117 *status = U_ILLEGAL_ARGUMENT_ERROR;
4118 return FALSE;
4119 }
4120
b331163b 4121 if (strsrch->pattern.pces == NULL) {
46f4442e
A
4122 initializePatternPCETable(strsrch, status);
4123 }
4124
b331163b 4125 CEIBuffer ceb(strsrch, status);
729e4ab9 4126 int32_t targetIx = 0;
46f4442e
A
4127
4128 /*
4129 * Pre-load the buffer with the CE's for the grapheme
4130 * after our starting position so that we're sure that
4131 * we can look at the CE following the match when we
4132 * check the match boundaries.
4133 *
4134 * This will also pre-fetch the first CE that we'll
4135 * consider for the match.
4136 */
4137 if (startIdx < strsrch->search->textLength) {
4138 UBreakIterator *bi = strsrch->search->internalBreakIter;
4139 int32_t next = ubrk_following(bi, startIdx);
4140
4141 ucol_setOffset(strsrch->textIter, next, status);
4142
4143 for (targetIx = 0; ; targetIx += 1) {
4144 if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
4145 break;
4146 }
4147 }
4148 } else {
4149 ucol_setOffset(strsrch->textIter, startIdx, status);
4150 }
46f4442e 4151
729e4ab9
A
4152
4153 const CEI *targetCEI = NULL;
46f4442e
A
4154 int32_t patIx;
4155 UBool found;
4156
4157 int32_t limitIx = targetIx;
4158 int32_t mStart = -1;
4159 int32_t mLimit = -1;
4160 int32_t minLimit;
4161 int32_t maxLimit;
729e4ab9
A
4162
4163
4164
46f4442e
A
4165 // Outer loop moves over match starting positions in the
4166 // target CE space.
729e4ab9
A
4167 // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
4168 // But patIx is 0 at the beginning of the pattern and increases toward the end.
4169 // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
4170 // and the beginning of the base text.
46f4442e
A
4171 for(targetIx = limitIx; ; targetIx += 1)
4172 {
4173 found = TRUE;
4388f060
A
4174 // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
4175 // (compared to the last CE fetched for the previous targetIx value) as we need to go
4176 // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
4177 const CEI *lastCEI = ceb.getPrevious(targetIx);
4178 if (lastCEI == NULL) {
4179 *status = U_INTERNAL_PROGRAM_ERROR;
4180 found = FALSE;
4181 break;
4182 }
46f4442e
A
4183 // Inner loop checks for a match beginning at each
4184 // position from the outer loop.
729e4ab9 4185 int32_t targetIxOffset = 0;
b331163b
A
4186 for (patIx = strsrch->pattern.pcesLength - 1; patIx >= 0; patIx -= 1) {
4187 int64_t patCE = strsrch->pattern.pces[patIx];
46f4442e 4188
b331163b 4189 targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 - patIx + targetIxOffset);
46f4442e
A
4190 // Compare CE from target string with CE from the pattern.
4191 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
4192 // which will fail the compare, below.
729e4ab9
A
4193 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
4194 if ( ceMatch == U_CE_NO_MATCH ) {
46f4442e
A
4195 found = FALSE;
4196 break;
729e4ab9
A
4197 } else if ( ceMatch > U_CE_NO_MATCH ) {
4198 if ( ceMatch == U_CE_SKIP_TARG ) {
4199 // redo with same patCE, next targCE
4200 patIx++;
4201 targetIxOffset++;
4202 } else { // ceMatch == U_CE_SKIP_PATN
4203 // redo with same targCE, next patCE
4204 targetIxOffset--;
4205 }
46f4442e
A
4206 }
4207 }
4208
729e4ab9 4209 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
46f4442e
A
4210 // No match at this targetIx. Try again at the next.
4211 continue;
4212 }
4213
4214 if (!found) {
4215 // No match at all, we have run off the end of the target text.
4216 break;
4217 }
4218
4219
4220 // We have found a match in CE space.
4221 // Now determine the bounds in string index space.
4222 // There still is a chance of match failure if the CE range not correspond to
4223 // an acceptable character range.
4224 //
b331163b 4225 const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 + targetIxOffset);
46f4442e 4226 mStart = firstCEI->lowIndex;
46f4442e
A
4227
4228 // Check for the start of the match being within a combining sequence.
4229 // This can happen if the pattern itself begins with a combining char, and
4230 // the match found combining marks in the target text that were attached
4231 // to something else.
4232 // This type of match should be rejected for not completely consuming a
4233 // combining sequence.
4388f060 4234 if (!isBreakBoundary(strsrch, mStart)) {
46f4442e
A
4235 found = FALSE;
4236 }
4237
4238 // Look at the high index of the first CE in the match. If it's the same as the
4239 // low index, the first CE in the match is in the middle of an expansion.
4240 if (mStart == firstCEI->highIndex) {
4241 found = FALSE;
4242 }
729e4ab9 4243
46f4442e 4244
4388f060
A
4245 minLimit = lastCEI->lowIndex;
4246
4247 if (targetIx > 0) {
4248 // Look at the CE following the match. If it is UCOL_NULLORDER the match
4249 // extended to the end of input, and the match is good.
4250
4251 // Look at the high and low indices of the CE following the match. If
4252 // they are the same it means one of two things:
4253 // 1. The match extended to the last CE from the target text, which is OK, or
4254 // 2. The last CE that was part of the match is in an expansion that extends
4255 // to the first CE after the match. In this case, we reject the match.
4256 const CEI *nextCEI = ceb.getPrevious(targetIx - 1);
4257
4258 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
4259 found = FALSE;
4260 }
4261
4262 mLimit = maxLimit = nextCEI->lowIndex;
4263
4264 // Advance the match end position to the first acceptable match boundary.
b331163b 4265 // This advances the index over any combining characters.
4388f060
A
4266 if (minLimit < maxLimit) {
4267 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4268
4269 if (nba >= lastCEI->highIndex) {
4270 mLimit = nba;
4271 }
46f4442e 4272 }
4388f060 4273
b331163b
A
4274 // If default breakIter is being used, and next collation element belonging to this
4275 // combining sequence has non-zero primary weight and corresponds to a separate
4276 // character following the one at end of the current match, then do NOT require
4277 // that match end position be on a breakIter boundary, or that end of the
4278 // combining sequence not extend beyond the match in CE space. Only do those
4279 // tests if the conditions above are not met. Added this to make prefix search
4280 // work in Indic scripts per <rdar://problem/18063262>.
4281 UBool doLimitTests = !(strsrch->search->breakIter == NULL &&
4282 nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4283 nextCEI->lowIndex >= lastCEI->highIndex && nextCEI->highIndex > nextCEI->lowIndex);
4284
4285 if (doLimitTests) { // <rdar://problem/18063262>
4286 // If advancing to the end of a combining sequence in character indexing space
4287 // advanced us beyond the end of the match in CE space, reject this match.
4288 if (mLimit > maxLimit) {
4289 found = FALSE;
4290 }
4388f060 4291
b331163b
A
4292 // Make sure the end of the match is on a break boundary
4293 if (!isBreakBoundary(strsrch, mLimit)) {
4294 found = FALSE;
4295 }
4388f060
A
4296 }
4297
4298 } else {
4299 // No non-ignorable CEs after this point.
4300 // The maximum position is detected by boundary after
4301 // the last non-ignorable CE. Combining sequence
4302 // across the start index will be truncated.
4303 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4304 mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
46f4442e 4305 }
729e4ab9 4306
46f4442e
A
4307 #ifdef USEARCH_DEBUG
4308 if (getenv("USEARCH_DEBUG") != NULL) {
4309 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4310 }
4311 #endif
729e4ab9 4312
46f4442e 4313
729e4ab9
A
4314 if (! checkIdentical(strsrch, mStart, mLimit)) {
4315 found = FALSE;
4316 }
4317
46f4442e
A
4318 if (found) {
4319 break;
4320 }
4321 }
4322
4323 #ifdef USEARCH_DEBUG
4324 if (getenv("USEARCH_DEBUG") != NULL) {
4325 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4326 int32_t lastToPrint = ceb.limitIx+2;
4327 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4328 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4329 }
4330 printf("\n%s\n", found? "match found" : "no match");
4331 }
4332 #endif
4333
4334 // All Done. Store back the match bounds to the caller.
4335 //
4336 if (found==FALSE) {
4337 mLimit = -1;
4338 mStart = -1;
4339 }
4340
4341 if (matchStart != NULL) {
4342 *matchStart= mStart;
4343 }
4344
4345 if (matchLimit != NULL) {
4346 *matchLimit = mLimit;
4347 }
4348
4349 return found;
4350}
4351
b75a7d8f
A
4352// internal use methods declared in usrchimp.h -----------------------------
4353
4354UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
4355{
4356 if (U_FAILURE(*status)) {
4357 setMatchNotFound(strsrch);
4358 return FALSE;
4359 }
4360
46f4442e 4361#if BOYER_MOORE
374ca955 4362 UCollationElements *coleiter = strsrch->textIter;
b75a7d8f 4363 int32_t textlength = strsrch->search->textLength;
b331163b
A
4364 int32_t *patternce = strsrch->pattern.ces;
4365 int32_t patterncelength = strsrch->pattern.cesLength;
b75a7d8f
A
4366 int32_t textoffset = ucol_getOffset(coleiter);
4367
374ca955 4368 // status used in setting coleiter offset, since offset is checked in
729e4ab9 4369 // shiftForward before setting the coleiter offset, status never
374ca955 4370 // a failure
729e4ab9 4371 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
b75a7d8f
A
4372 patterncelength);
4373 while (textoffset <= textlength)
4374 {
4375 uint32_t patternceindex = patterncelength - 1;
374ca955 4376 int32_t targetce;
b75a7d8f 4377 UBool found = FALSE;
374ca955
A
4378 int32_t lastce = UCOL_NULLORDER;
4379
4380 setColEIterOffset(coleiter, textoffset);
4381
46f4442e 4382 for (;;) {
b75a7d8f
A
4383 // finding the last pattern ce match, imagine composite characters
4384 // for example: search for pattern A in text \u00C0
4385 // we'll have to skip \u0300 the grave first before we get to A
4386 targetce = ucol_previous(coleiter, status);
4387 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4388 found = FALSE;
4389 break;
4390 }
4391 targetce = getCE(strsrch, targetce);
729e4ab9
A
4392 if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
4393 // this is for the text \u0315\u0300 that requires
b75a7d8f
A
4394 // normalization and pattern \u0300, where \u0315 is ignorable
4395 continue;
4396 }
4397 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4398 lastce = targetce;
4399 }
729e4ab9 4400 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
b75a7d8f
A
4401 if (targetce == patternce[patternceindex]) {
4402 // the first ce can be a contraction
4403 found = TRUE;
4404 break;
4405 }
4406 if (!hasExpansion(coleiter)) {
4407 found = FALSE;
4408 break;
4409 }
4410 }
4411
46f4442e 4412 //targetce = lastce;
729e4ab9 4413
b75a7d8f 4414 while (found && patternceindex > 0) {
4388f060 4415 lastce = targetce;
b75a7d8f
A
4416 targetce = ucol_previous(coleiter, status);
4417 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4418 found = FALSE;
4419 break;
4420 }
4421 targetce = getCE(strsrch, targetce);
4422 if (targetce == UCOL_IGNORABLE) {
4423 continue;
4424 }
4425
4426 patternceindex --;
729e4ab9
A
4427 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4428 found = found && targetce == patternce[patternceindex];
b75a7d8f 4429 }
729e4ab9 4430
46f4442e 4431 targetce = lastce;
b75a7d8f
A
4432
4433 if (!found) {
374ca955
A
4434 if (U_FAILURE(*status)) {
4435 break;
4436 }
729e4ab9 4437 textoffset = shiftForward(strsrch, textoffset, lastce,
b75a7d8f
A
4438 patternceindex);
4439 // status checked at loop.
4440 patternceindex = patterncelength;
4441 continue;
4442 }
374ca955
A
4443
4444 if (checkNextExactMatch(strsrch, &textoffset, status)) {
b75a7d8f 4445 // status checked in ucol_setOffset
374ca955
A
4446 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4447 return TRUE;
b75a7d8f
A
4448 }
4449 }
4450 setMatchNotFound(strsrch);
374ca955 4451 return FALSE;
46f4442e
A
4452#else
4453 int32_t textOffset = ucol_getOffset(strsrch->textIter);
4454 int32_t start = -1;
4455 int32_t end = -1;
4456
4457 if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4458 strsrch->search->matchedIndex = start;
4459 strsrch->search->matchedLength = end - start;
4460 return TRUE;
4461 } else {
4462 setMatchNotFound(strsrch);
4463 return FALSE;
4464 }
4465#endif
b75a7d8f
A
4466}
4467
4468UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
4469{
4470 if (U_FAILURE(*status)) {
4471 setMatchNotFound(strsrch);
4472 return FALSE;
4473 }
4474
46f4442e 4475#if BOYER_MOORE
b75a7d8f
A
4476 UCollationElements *coleiter = strsrch->textIter;
4477 int32_t textlength = strsrch->search->textLength;
b331163b
A
4478 int32_t *patternce = strsrch->pattern.ces;
4479 int32_t patterncelength = strsrch->pattern.cesLength;
374ca955 4480 int32_t textoffset = ucol_getOffset(coleiter);
729e4ab9 4481 UBool hasPatternAccents =
b75a7d8f 4482 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
729e4ab9
A
4483
4484 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
b75a7d8f
A
4485 patterncelength);
4486 strsrch->canonicalPrefixAccents[0] = 0;
4487 strsrch->canonicalSuffixAccents[0] = 0;
729e4ab9 4488
b75a7d8f
A
4489 while (textoffset <= textlength)
4490 {
4491 int32_t patternceindex = patterncelength - 1;
374ca955 4492 int32_t targetce;
b75a7d8f 4493 UBool found = FALSE;
374ca955
A
4494 int32_t lastce = UCOL_NULLORDER;
4495
4496 setColEIterOffset(coleiter, textoffset);
4497
4498 for (;;) {
b75a7d8f
A
4499 // finding the last pattern ce match, imagine composite characters
4500 // for example: search for pattern A in text \u00C0
4501 // we'll have to skip \u0300 the grave first before we get to A
4502 targetce = ucol_previous(coleiter, status);
4503 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4504 found = FALSE;
4505 break;
4506 }
4507 targetce = getCE(strsrch, targetce);
4508 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4509 lastce = targetce;
4510 }
729e4ab9 4511 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
b75a7d8f
A
4512 if (targetce == patternce[patternceindex]) {
4513 // the first ce can be a contraction
4514 found = TRUE;
4515 break;
4516 }
4517 if (!hasExpansion(coleiter)) {
4518 found = FALSE;
4519 break;
4520 }
4521 }
729e4ab9 4522
b75a7d8f
A
4523 while (found && patternceindex > 0) {
4524 targetce = ucol_previous(coleiter, status);
4525 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4526 found = FALSE;
4527 break;
4528 }
4529 targetce = getCE(strsrch, targetce);
4530 if (targetce == UCOL_IGNORABLE) {
4531 continue;
4532 }
4533
4534 patternceindex --;
729e4ab9
A
4535 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4536 found = found && targetce == patternce[patternceindex];
b75a7d8f
A
4537 }
4538
4539 // initializing the rearranged accent array
4540 if (hasPatternAccents && !found) {
4541 strsrch->canonicalPrefixAccents[0] = 0;
4542 strsrch->canonicalSuffixAccents[0] = 0;
374ca955
A
4543 if (U_FAILURE(*status)) {
4544 break;
4545 }
b75a7d8f
A
4546 found = doNextCanonicalMatch(strsrch, textoffset, status);
4547 }
4548
4549 if (!found) {
374ca955
A
4550 if (U_FAILURE(*status)) {
4551 break;
4552 }
729e4ab9 4553 textoffset = shiftForward(strsrch, textoffset, lastce,
b75a7d8f
A
4554 patternceindex);
4555 // status checked at loop
4556 patternceindex = patterncelength;
4557 continue;
4558 }
729e4ab9 4559
b75a7d8f
A
4560 if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
4561 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4562 return TRUE;
4563 }
4564 }
4565 setMatchNotFound(strsrch);
4566 return FALSE;
46f4442e
A
4567#else
4568 int32_t textOffset = ucol_getOffset(strsrch->textIter);
4569 int32_t start = -1;
4570 int32_t end = -1;
4571
4572 if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4573 strsrch->search->matchedIndex = start;
4574 strsrch->search->matchedLength = end - start;
4575 return TRUE;
4576 } else {
4577 setMatchNotFound(strsrch);
4578 return FALSE;
4579 }
4580#endif
b75a7d8f
A
4581}
4582
4583UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
4584{
4585 if (U_FAILURE(*status)) {
4586 setMatchNotFound(strsrch);
4587 return FALSE;
4588 }
4589
46f4442e 4590#if BOYER_MOORE
b75a7d8f 4591 UCollationElements *coleiter = strsrch->textIter;
b331163b
A
4592 int32_t *patternce = strsrch->pattern.ces;
4593 int32_t patterncelength = strsrch->pattern.cesLength;
374ca955 4594 int32_t textoffset = ucol_getOffset(coleiter);
b75a7d8f
A
4595
4596 // shifting it check for setting offset
4597 // if setOffset is called previously or there was no previous match, we
4598 // leave the offset as it is.
4599 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4600 textoffset = strsrch->search->matchedIndex;
4601 }
729e4ab9
A
4602
4603 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
b75a7d8f 4604 patterncelength);
729e4ab9 4605
b75a7d8f
A
4606 while (textoffset >= 0)
4607 {
4608 int32_t patternceindex = 1;
374ca955 4609 int32_t targetce;
b75a7d8f 4610 UBool found = FALSE;
374ca955 4611 int32_t firstce = UCOL_NULLORDER;
b75a7d8f 4612
374ca955 4613 // if status is a failure, ucol_setOffset does nothing
b75a7d8f 4614 setColEIterOffset(coleiter, textoffset);
374ca955
A
4615
4616 for (;;) {
729e4ab9
A
4617 // finding the first pattern ce match, imagine composite
4618 // characters. for example: search for pattern \u0300 in text
4619 // \u00C0, we'll have to skip A first before we get to
b75a7d8f
A
4620 // \u0300 the grave accent
4621 targetce = ucol_next(coleiter, status);
4622 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4623 found = FALSE;
4624 break;
4625 }
4626 targetce = getCE(strsrch, targetce);
4627 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4628 firstce = targetce;
4629 }
46f4442e 4630 if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
b75a7d8f 4631 continue;
729e4ab9
A
4632 }
4633 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
b75a7d8f
A
4634 if (targetce == patternce[0]) {
4635 found = TRUE;
4636 break;
4637 }
4638 if (!hasExpansion(coleiter)) {
4639 // checking for accents in composite character
4640 found = FALSE;
4641 break;
4642 }
4643 }
4644
46f4442e 4645 //targetce = firstce;
729e4ab9 4646
b75a7d8f 4647 while (found && (patternceindex < patterncelength)) {
4388f060 4648 firstce = targetce;
b75a7d8f
A
4649 targetce = ucol_next(coleiter, status);
4650 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4651 found = FALSE;
4652 break;
4653 }
4654 targetce = getCE(strsrch, targetce);
4655 if (targetce == UCOL_IGNORABLE) {
4656 continue;
4657 }
4658
729e4ab9
A
4659 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4660 found = found && targetce == patternce[patternceindex];
b75a7d8f
A
4661 patternceindex ++;
4662 }
729e4ab9 4663
46f4442e 4664 targetce = firstce;
b75a7d8f
A
4665
4666 if (!found) {
374ca955
A
4667 if (U_FAILURE(*status)) {
4668 break;
4669 }
729e4ab9
A
4670
4671 textoffset = reverseShift(strsrch, textoffset, targetce,
b75a7d8f
A
4672 patternceindex);
4673 patternceindex = 0;
4674 continue;
4675 }
729e4ab9 4676
b75a7d8f
A
4677 if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
4678 setColEIterOffset(coleiter, textoffset);
4679 return TRUE;
4680 }
4681 }
4682 setMatchNotFound(strsrch);
4683 return FALSE;
46f4442e 4684#else
4388f060
A
4685 int32_t textOffset;
4686
4687 if (strsrch->search->isOverlap) {
4688 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4689 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4690 } else {
4691 // move the start position at the end of possible match
4692 initializePatternPCETable(strsrch, status);
57a6839d
A
4693 if (!initTextProcessedIter(strsrch, status)) {
4694 setMatchNotFound(strsrch);
4695 return FALSE;
4696 }
b331163b 4697 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
57a6839d 4698 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4388f060
A
4699 if (pce == UCOL_PROCESSED_NULLORDER) {
4700 // at the end of the text
4701 break;
4702 }
4703 }
4704 if (U_FAILURE(*status)) {
4705 setMatchNotFound(strsrch);
4706 return FALSE;
4707 }
4708 textOffset = ucol_getOffset(strsrch->textIter);
4709 }
4710 } else {
4711 textOffset = ucol_getOffset(strsrch->textIter);
4712 }
4713
46f4442e
A
4714 int32_t start = -1;
4715 int32_t end = -1;
4716
4717 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4718 strsrch->search->matchedIndex = start;
4719 strsrch->search->matchedLength = end - start;
4720 return TRUE;
4721 } else {
4722 setMatchNotFound(strsrch);
4723 return FALSE;
4724 }
4725#endif
b75a7d8f
A
4726}
4727
729e4ab9 4728UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
b75a7d8f
A
4729 UErrorCode *status)
4730{
4731 if (U_FAILURE(*status)) {
4732 setMatchNotFound(strsrch);
4733 return FALSE;
4734 }
4735
46f4442e 4736#if BOYER_MOORE
b75a7d8f 4737 UCollationElements *coleiter = strsrch->textIter;
b331163b
A
4738 int32_t *patternce = strsrch->pattern.ces;
4739 int32_t patterncelength = strsrch->pattern.cesLength;
374ca955 4740 int32_t textoffset = ucol_getOffset(coleiter);
729e4ab9 4741 UBool hasPatternAccents =
b75a7d8f 4742 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
729e4ab9 4743
b75a7d8f
A
4744 // shifting it check for setting offset
4745 // if setOffset is called previously or there was no previous match, we
4746 // leave the offset as it is.
4747 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4748 textoffset = strsrch->search->matchedIndex;
4749 }
729e4ab9
A
4750
4751 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
b75a7d8f
A
4752 patterncelength);
4753 strsrch->canonicalPrefixAccents[0] = 0;
4754 strsrch->canonicalSuffixAccents[0] = 0;
729e4ab9 4755
b75a7d8f
A
4756 while (textoffset >= 0)
4757 {
4758 int32_t patternceindex = 1;
374ca955 4759 int32_t targetce;
b75a7d8f 4760 UBool found = FALSE;
374ca955 4761 int32_t firstce = UCOL_NULLORDER;
b75a7d8f
A
4762
4763 setColEIterOffset(coleiter, textoffset);
46f4442e 4764 for (;;) {
729e4ab9
A
4765 // finding the first pattern ce match, imagine composite
4766 // characters. for example: search for pattern \u0300 in text
4767 // \u00C0, we'll have to skip A first before we get to
b75a7d8f
A
4768 // \u0300 the grave accent
4769 targetce = ucol_next(coleiter, status);
4770 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4771 found = FALSE;
4772 break;
4773 }
4774 targetce = getCE(strsrch, targetce);
4775 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4776 firstce = targetce;
4777 }
729e4ab9
A
4778
4779 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
b75a7d8f
A
4780 if (targetce == patternce[0]) {
4781 // the first ce can be a contraction
4782 found = TRUE;
4783 break;
4784 }
4785 if (!hasExpansion(coleiter)) {
4786 // checking for accents in composite character
4787 found = FALSE;
4788 break;
4789 }
4790 }
4791
4792 targetce = firstce;
729e4ab9 4793
b75a7d8f
A
4794 while (found && patternceindex < patterncelength) {
4795 targetce = ucol_next(coleiter, status);
4796 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4797 found = FALSE;
4798 break;
4799 }
4800 targetce = getCE(strsrch, targetce);
4801 if (targetce == UCOL_IGNORABLE) {
4802 continue;
4803 }
4804
729e4ab9
A
4805 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4806 found = found && targetce == patternce[patternceindex];
b75a7d8f
A
4807 patternceindex ++;
4808 }
4809
4810 // initializing the rearranged accent array
4811 if (hasPatternAccents && !found) {
4812 strsrch->canonicalPrefixAccents[0] = 0;
4813 strsrch->canonicalSuffixAccents[0] = 0;
374ca955 4814 if (U_FAILURE(*status)) {
b75a7d8f
A
4815 break;
4816 }
4817 found = doPreviousCanonicalMatch(strsrch, textoffset, status);
4818 }
4819
4820 if (!found) {
374ca955 4821 if (U_FAILURE(*status)) {
b75a7d8f
A
4822 break;
4823 }
729e4ab9 4824 textoffset = reverseShift(strsrch, textoffset, targetce,
b75a7d8f
A
4825 patternceindex);
4826 patternceindex = 0;
4827 continue;
4828 }
4829
4830 if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
4831 setColEIterOffset(coleiter, textoffset);
4832 return TRUE;
4833 }
4834 }
4835 setMatchNotFound(strsrch);
4836 return FALSE;
46f4442e 4837#else
4388f060
A
4838 int32_t textOffset;
4839
4840 if (strsrch->search->isOverlap) {
4841 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4842 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4843 } else {
4844 // move the start position at the end of possible match
4845 initializePatternPCETable(strsrch, status);
57a6839d
A
4846 if (!initTextProcessedIter(strsrch, status)) {
4847 setMatchNotFound(strsrch);
4848 return FALSE;
4849 }
b331163b 4850 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
57a6839d 4851 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4388f060
A
4852 if (pce == UCOL_PROCESSED_NULLORDER) {
4853 // at the end of the text
4854 break;
4855 }
4856 }
4857 if (U_FAILURE(*status)) {
4858 setMatchNotFound(strsrch);
4859 return FALSE;
4860 }
4861 textOffset = ucol_getOffset(strsrch->textIter);
4862 }
4863 } else {
4864 textOffset = ucol_getOffset(strsrch->textIter);
4865 }
4866
46f4442e
A
4867 int32_t start = -1;
4868 int32_t end = -1;
4869
4870 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4871 strsrch->search->matchedIndex = start;
4872 strsrch->search->matchedLength = end - start;
4873 return TRUE;
4874 } else {
4875 setMatchNotFound(strsrch);
4876 return FALSE;
4877 }
4878#endif
b75a7d8f
A
4879}
4880
4881#endif /* #if !UCONFIG_NO_COLLATION */