]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/usearch.cpp
ICU-531.31.tar.gz
[apple/icu.git] / icuSources / i18n / usearch.cpp
CommitLineData
b75a7d8f
A
1/*
2**********************************************************************
57a6839d 3* Copyright (C) 2001-2014 IBM and others. All rights reserved.
b75a7d8f
A
4**********************************************************************
5* Date Name Description
6* 07/02/2001 synwee Creation.
7**********************************************************************
8*/
9
10#include "unicode/utypes.h"
11
46f4442e 12#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
b75a7d8f
A
13
14#include "unicode/usearch.h"
15#include "unicode/ustring.h"
16#include "unicode/uchar.h"
4388f060 17#include "unicode/utf16.h"
729e4ab9 18#include "normalizer2impl.h"
b75a7d8f
A
19#include "usrchimp.h"
20#include "cmemory.h"
374ca955 21#include "ucln_in.h"
46f4442e 22#include "uassert.h"
729e4ab9 23#include "ustr_imp.h"
46f4442e
A
24
25U_NAMESPACE_USE
26
27// don't use Boyer-Moore
729e4ab9 28// (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
46f4442e 29#define BOYER_MOORE 0
b75a7d8f 30
73c04bcf
A
31#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
32
b75a7d8f
A
33// internal definition ---------------------------------------------------
34
35#define LAST_BYTE_MASK_ 0xFF
36#define SECOND_LAST_BYTE_SHIFT_ 8
37#define SUPPLEMENTARY_MIN_VALUE_ 0x10000
38
4388f060 39static const Normalizer2Impl *g_nfcImpl = NULL;
b75a7d8f
A
40
41// internal methods -------------------------------------------------
42
43/**
44* Fast collation element iterator setOffset.
45* This function does not check for bounds.
46* @param coleiter collation element iterator
47* @param offset to set
48*/
729e4ab9 49static
b75a7d8f
A
50inline void setColEIterOffset(UCollationElements *elems,
51 int32_t offset)
52{
57a6839d
A
53 // Note: Not "fast" any more after the 2013 collation rewrite.
54 // We do not want to expose more internals than necessary.
55 UErrorCode status = U_ZERO_ERROR;
56 ucol_setOffset(elems, offset, &status);
b75a7d8f
A
57}
58
59/**
60* Getting the mask for collation strength
61* @param strength collation strength
62* @return collation element mask
63*/
64static
729e4ab9 65inline uint32_t getMask(UCollationStrength strength)
b75a7d8f 66{
729e4ab9 67 switch (strength)
b75a7d8f
A
68 {
69 case UCOL_PRIMARY:
70 return UCOL_PRIMARYORDERMASK;
71 case UCOL_SECONDARY:
72 return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
73 default:
729e4ab9 74 return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
b75a7d8f
A
75 UCOL_PRIMARYORDERMASK;
76 }
77}
78
79/**
80* This is to squeeze the 21bit ces into a 256 table
81* @param ce collation element
82* @return collapsed version of the collation element
83*/
84static
729e4ab9 85inline int hash(uint32_t ce)
b75a7d8f
A
86{
87 // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
88 // well with the new collation where most of the latin 1 characters
89 // are of the value xx000xxx. their hashes will most of the time be 0
90 // to be discussed on the hash algo.
91 return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_;
92}
93
374ca955
A
94U_CDECL_BEGIN
95static UBool U_CALLCONV
96usearch_cleanup(void) {
4388f060 97 g_nfcImpl = NULL;
374ca955
A
98 return TRUE;
99}
100U_CDECL_END
101
b75a7d8f
A
102/**
103* Initializing the fcd tables.
104* Internal method, status assumed to be a success.
729e4ab9 105* @param status output error if any, caller to check status before calling
b75a7d8f
A
106* method, status assumed to be success when passed in.
107*/
108static
729e4ab9 109inline void initializeFCD(UErrorCode *status)
b75a7d8f 110{
4388f060
A
111 if (g_nfcImpl == NULL) {
112 g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
374ca955 113 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
b75a7d8f
A
114 }
115}
116
117/**
118* Gets the fcd value for a character at the argument index.
119* This method takes into accounts of the supplementary characters.
120* @param str UTF16 string where character for fcd retrieval resides
729e4ab9
A
121* @param offset position of the character whose fcd is to be retrieved, to be
122* overwritten with the next character position, taking
b75a7d8f
A
123* surrogate characters into consideration.
124* @param strlength length of the argument string
125* @return fcd value
126*/
127static
729e4ab9 128uint16_t getFCD(const UChar *str, int32_t *offset,
b75a7d8f
A
129 int32_t strlength)
130{
729e4ab9 131 const UChar *temp = str + *offset;
4388f060 132 uint16_t result = g_nfcImpl->nextFCD16(temp, str + strlength);
729e4ab9 133 *offset = (int32_t)(temp - str);
b75a7d8f
A
134 return result;
135}
136
137/**
729e4ab9 138* Getting the modified collation elements taking into account the collation
b75a7d8f
A
139* attributes
140* @param strsrch string search data
729e4ab9 141* @param sourcece
b75a7d8f
A
142* @return the modified collation element
143*/
144static
374ca955 145inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
b75a7d8f
A
146{
147 // note for tertiary we can't use the collator->tertiaryMask, that
148 // is a preprocessed mask that takes into account case options. since
149 // we are only concerned with exact matches, we don't need that.
150 sourcece &= strsrch->ceMask;
729e4ab9 151
b75a7d8f
A
152 if (strsrch->toShift) {
153 // alternate handling here, since only the 16 most significant digits
154 // is only used, we can safely do a compare without masking
155 // if the ce is a variable, we mask and get only the primary values
156 // no shifting to quartenary is required since all primary values
157 // less than variabletop will need to be masked off anyway.
158 if (strsrch->variableTop > sourcece) {
729e4ab9 159 if (strsrch->strength >= UCOL_QUATERNARY) {
b75a7d8f
A
160 sourcece &= UCOL_PRIMARYORDERMASK;
161 }
729e4ab9 162 else {
b75a7d8f
A
163 sourcece = UCOL_IGNORABLE;
164 }
165 }
729e4ab9
A
166 } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
167 sourcece = 0xFFFF;
b75a7d8f
A
168 }
169
170 return sourcece;
171}
172
729e4ab9 173/**
b75a7d8f
A
174* Allocate a memory and returns NULL if it failed.
175* Internal method, status assumed to be a success.
176* @param size to allocate
729e4ab9 177* @param status output error if any, caller to check status before calling
b75a7d8f
A
178* method, status assumed to be success when passed in.
179* @return newly allocated array, NULL otherwise
180*/
181static
729e4ab9 182inline void * allocateMemory(uint32_t size, UErrorCode *status)
b75a7d8f
A
183{
184 uint32_t *result = (uint32_t *)uprv_malloc(size);
185 if (result == NULL) {
186 *status = U_MEMORY_ALLOCATION_ERROR;
187 }
188 return result;
189}
190
191/**
192* Adds a uint32_t value to a destination array.
729e4ab9 193* Creates a new array if we run out of space. The caller will have to
b75a7d8f 194* manually deallocate the newly allocated array.
729e4ab9
A
195* Internal method, status assumed to be success, caller has to check status
196* before calling this method. destination not to be NULL and has at least
b75a7d8f
A
197* size destinationlength.
198* @param destination target array
199* @param offset destination offset to add value
200* @param destinationlength target array size, return value for the new size
201* @param value to be added
202* @param increments incremental size expected
729e4ab9 203* @param status output error if any, caller to check status before calling
b75a7d8f
A
204* method, status assumed to be success when passed in.
205* @return new destination array, destination if there was no new allocation
206*/
207static
729e4ab9
A
208inline int32_t * addTouint32_tArray(int32_t *destination,
209 uint32_t offset,
210 uint32_t *destinationlength,
374ca955 211 uint32_t value,
729e4ab9
A
212 uint32_t increments,
213 UErrorCode *status)
b75a7d8f
A
214{
215 uint32_t newlength = *destinationlength;
216 if (offset + 1 == newlength) {
217 newlength += increments;
374ca955
A
218 int32_t *temp = (int32_t *)allocateMemory(
219 sizeof(int32_t) * newlength, status);
b75a7d8f
A
220 if (U_FAILURE(*status)) {
221 return NULL;
222 }
374ca955 223 uprv_memcpy(temp, destination, sizeof(int32_t) * offset);
b75a7d8f
A
224 *destinationlength = newlength;
225 destination = temp;
226 }
227 destination[offset] = value;
228 return destination;
229}
230
46f4442e
A
231/**
232* Adds a uint64_t value to a destination array.
729e4ab9 233* Creates a new array if we run out of space. The caller will have to
46f4442e 234* manually deallocate the newly allocated array.
729e4ab9
A
235* Internal method, status assumed to be success, caller has to check status
236* before calling this method. destination not to be NULL and has at least
46f4442e
A
237* size destinationlength.
238* @param destination target array
239* @param offset destination offset to add value
240* @param destinationlength target array size, return value for the new size
241* @param value to be added
242* @param increments incremental size expected
729e4ab9 243* @param status output error if any, caller to check status before calling
46f4442e
A
244* method, status assumed to be success when passed in.
245* @return new destination array, destination if there was no new allocation
246*/
247static
729e4ab9
A
248inline int64_t * addTouint64_tArray(int64_t *destination,
249 uint32_t offset,
250 uint32_t *destinationlength,
46f4442e 251 uint64_t value,
729e4ab9
A
252 uint32_t increments,
253 UErrorCode *status)
46f4442e
A
254{
255 uint32_t newlength = *destinationlength;
256 if (offset + 1 == newlength) {
257 newlength += increments;
258 int64_t *temp = (int64_t *)allocateMemory(
259 sizeof(int64_t) * newlength, status);
729e4ab9 260
46f4442e
A
261 if (U_FAILURE(*status)) {
262 return NULL;
263 }
264
265 uprv_memcpy(temp, destination, sizeof(int64_t) * offset);
266 *destinationlength = newlength;
267 destination = temp;
268 }
269
270 destination[offset] = value;
271
272 return destination;
273}
274
b75a7d8f
A
275/**
276* Initializing the ce table for a pattern.
277* Stores non-ignorable collation keys.
729e4ab9
A
278* Table size will be estimated by the size of the pattern text. Table
279* expansion will be perform as we go along. Adding 1 to ensure that the table
b75a7d8f
A
280* size definitely increases.
281* Internal method, status assumed to be a success.
282* @param strsrch string search data
729e4ab9 283* @param status output error if any, caller to check status before calling
b75a7d8f 284* method, status assumed to be success when passed in.
729e4ab9 285* @return total number of expansions
b75a7d8f
A
286*/
287static
729e4ab9 288inline uint16_t initializePatternCETable(UStringSearch *strsrch,
b75a7d8f
A
289 UErrorCode *status)
290{
291 UPattern *pattern = &(strsrch->pattern);
292 uint32_t cetablesize = INITIAL_ARRAY_SIZE_;
374ca955 293 int32_t *cetable = pattern->CEBuffer;
b75a7d8f
A
294 uint32_t patternlength = pattern->textLength;
295 UCollationElements *coleiter = strsrch->utilIter;
729e4ab9 296
b75a7d8f 297 if (coleiter == NULL) {
729e4ab9 298 coleiter = ucol_openElements(strsrch->collator, pattern->text,
b75a7d8f 299 patternlength, status);
729e4ab9
A
300 // status will be checked in ucol_next(..) later and if it is an
301 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
b75a7d8f
A
302 // returned.
303 strsrch->utilIter = coleiter;
304 }
305 else {
57a6839d 306 ucol_setText(coleiter, pattern->text, pattern->textLength, status);
729e4ab9
A
307 }
308 if(U_FAILURE(*status)) {
309 return 0;
b75a7d8f 310 }
729e4ab9 311
b75a7d8f
A
312 if (pattern->CE != cetable && pattern->CE) {
313 uprv_free(pattern->CE);
314 }
729e4ab9 315
b75a7d8f
A
316 uint16_t offset = 0;
317 uint16_t result = 0;
374ca955 318 int32_t ce;
b75a7d8f
A
319
320 while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
321 U_SUCCESS(*status)) {
322 uint32_t newce = getCE(strsrch, ce);
323 if (newce) {
729e4ab9 324 int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
b75a7d8f 325 newce,
729e4ab9 326 patternlength - ucol_getOffset(coleiter) + 1,
b75a7d8f
A
327 status);
328 if (U_FAILURE(*status)) {
329 return 0;
330 }
331 offset ++;
332 if (cetable != temp && cetable != pattern->CEBuffer) {
333 uprv_free(cetable);
334 }
335 cetable = temp;
336 }
337 result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
338 }
339
340 cetable[offset] = 0;
341 pattern->CE = cetable;
342 pattern->CELength = offset;
343
344 return result;
345}
346
46f4442e
A
347/**
348* Initializing the pce table for a pattern.
349* Stores non-ignorable collation keys.
729e4ab9
A
350* Table size will be estimated by the size of the pattern text. Table
351* expansion will be perform as we go along. Adding 1 to ensure that the table
46f4442e
A
352* size definitely increases.
353* Internal method, status assumed to be a success.
354* @param strsrch string search data
729e4ab9 355* @param status output error if any, caller to check status before calling
46f4442e 356* method, status assumed to be success when passed in.
729e4ab9 357* @return total number of expansions
46f4442e
A
358*/
359static
729e4ab9 360inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
46f4442e
A
361 UErrorCode *status)
362{
363 UPattern *pattern = &(strsrch->pattern);
364 uint32_t pcetablesize = INITIAL_ARRAY_SIZE_;
365 int64_t *pcetable = pattern->PCEBuffer;
366 uint32_t patternlength = pattern->textLength;
367 UCollationElements *coleiter = strsrch->utilIter;
729e4ab9 368
46f4442e 369 if (coleiter == NULL) {
729e4ab9 370 coleiter = ucol_openElements(strsrch->collator, pattern->text,
46f4442e 371 patternlength, status);
729e4ab9
A
372 // status will be checked in ucol_next(..) later and if it is an
373 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
46f4442e
A
374 // returned.
375 strsrch->utilIter = coleiter;
376 } else {
57a6839d 377 ucol_setText(coleiter, pattern->text, pattern->textLength, status);
729e4ab9
A
378 }
379 if(U_FAILURE(*status)) {
380 return 0;
46f4442e 381 }
729e4ab9 382
46f4442e
A
383 if (pattern->PCE != pcetable && pattern->PCE != NULL) {
384 uprv_free(pattern->PCE);
385 }
729e4ab9 386
46f4442e
A
387 uint16_t offset = 0;
388 uint16_t result = 0;
389 int64_t pce;
390
57a6839d 391 icu::UCollationPCE iter(coleiter);
46f4442e
A
392
393 // ** Should processed CEs be signed or unsigned?
729e4ab9 394 // ** (the rest of the code in this file seems to play fast-and-loose with
46f4442e 395 // ** whether a CE is signed or unsigned. For example, look at routine above this one.)
57a6839d 396 while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
46f4442e 397 U_SUCCESS(*status)) {
729e4ab9 398 int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
46f4442e 399 pce,
729e4ab9 400 patternlength - ucol_getOffset(coleiter) + 1,
46f4442e
A
401 status);
402
403 if (U_FAILURE(*status)) {
404 return 0;
405 }
406
407 offset += 1;
408
409 if (pcetable != temp && pcetable != pattern->PCEBuffer) {
410 uprv_free(pcetable);
411 }
412
413 pcetable = temp;
414 //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
415 }
416
417 pcetable[offset] = 0;
418 pattern->PCE = pcetable;
419 pattern->PCELength = offset;
420
421 return result;
422}
423
b75a7d8f
A
424/**
425* Initializes the pattern struct.
426* Internal method, status assumed to be success.
427* @param strsrch UStringSearch data storage
729e4ab9 428* @param status output error if any, caller to check status before calling
b75a7d8f
A
429* method, status assumed to be success when passed in.
430* @return expansionsize the total expansion size of the pattern
729e4ab9 431*/
b75a7d8f 432static
729e4ab9 433inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
b75a7d8f 434{
57a6839d 435 if (U_FAILURE(*status)) { return 0; }
b75a7d8f
A
436 UPattern *pattern = &(strsrch->pattern);
437 const UChar *patterntext = pattern->text;
438 int32_t length = pattern->textLength;
439 int32_t index = 0;
729e4ab9 440
46f4442e
A
441 // Since the strength is primary, accents are ignored in the pattern.
442 if (strsrch->strength == UCOL_PRIMARY) {
4388f060
A
443 pattern->hasPrefixAccents = 0;
444 pattern->hasSuffixAccents = 0;
46f4442e 445 } else {
4388f060
A
446 pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
447 SECOND_LAST_BYTE_SHIFT_;
448 index = length;
449 U16_BACK_1(patterntext, 0, index);
450 pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
451 LAST_BYTE_MASK_;
46f4442e
A
452 }
453
454 // ** HACK **
455 if (strsrch->pattern.PCE != NULL) {
456 if (strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) {
457 uprv_free(strsrch->pattern.PCE);
458 }
459
460 strsrch->pattern.PCE = NULL;
461 }
b75a7d8f 462
b75a7d8f 463 // since intializePattern is an internal method status is a success.
729e4ab9 464 return initializePatternCETable(strsrch, status);
b75a7d8f
A
465}
466
467/**
468* Initializing shift tables, with the default values.
469* If a corresponding default value is 0, the shift table is not set.
729e4ab9 470* @param shift table for forwards shift
b75a7d8f
A
471* @param backshift table for backwards shift
472* @param cetable table containing pattern ce
473* @param cesize size of the pattern ces
474* @param expansionsize total size of the expansions
475* @param defaultforward the default forward value
476* @param defaultbackward the default backward value
477*/
478static
729e4ab9
A
479inline void setShiftTable(int16_t shift[], int16_t backshift[],
480 int32_t *cetable, int32_t cesize,
b75a7d8f
A
481 int16_t expansionsize,
482 int16_t defaultforward,
483 int16_t defaultbackward)
484{
729e4ab9 485 // estimate the value to shift. to do that we estimate the smallest
b75a7d8f 486 // number of characters to give the relevant ces, ie approximately
729e4ab9 487 // the number of ces minus their expansion, since expansions can come
b75a7d8f
A
488 // from a character.
489 int32_t count;
490 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
491 shift[count] = defaultforward;
492 }
493 cesize --; // down to the last index
494 for (count = 0; count < cesize; count ++) {
495 // number of ces from right of array to the count
496 int temp = defaultforward - count - 1;
497 shift[hash(cetable[count])] = temp > 1 ? temp : 1;
498 }
499 shift[hash(cetable[cesize])] = 1;
500 // for ignorables we just shift by one. see test examples.
501 shift[hash(0)] = 1;
729e4ab9 502
b75a7d8f
A
503 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
504 backshift[count] = defaultbackward;
505 }
506 for (count = cesize; count > 0; count --) {
507 // the original value count does not seem to work
729e4ab9 508 backshift[hash(cetable[count])] = count > expansionsize ?
b75a7d8f
A
509 (int16_t)(count - expansionsize) : 1;
510 }
511 backshift[hash(cetable[0])] = 1;
512 backshift[hash(0)] = 1;
513}
514
515/**
516* Building of the pattern collation element list and the boyer moore strsrch
517* table.
518* The canonical match will only be performed after the default match fails.
519* For both cases we need to remember the size of the composed and decomposed
520* versions of the string. Since the Boyer-Moore shift calculations shifts by
729e4ab9
A
521* a number of characters in the text and tries to match the pattern from that
522* offset, the shift value can not be too large in case we miss some
523* characters. To choose a right shift size, we estimate the NFC form of the
524* and use its size as a shift guide. The NFC form should be the small
b75a7d8f
A
525* possible representation of the pattern. Anyways, we'll err on the smaller
526* shift size. Hence the calculation for minlength.
729e4ab9
A
527* Canonical match will be performed slightly differently. We'll split the
528* pattern into 3 parts, the prefix accents (PA), the middle string bounded by
529* the first and last base character (MS), the ending accents (EA). Matches
b75a7d8f
A
530* will be done on MS first, and only when we match MS then some processing
531* will be required for the prefix and end accents in order to determine if
729e4ab9
A
532* they match PA and EA. Hence the default shift values
533* for the canonical match will take the size of either end's accent into
b75a7d8f
A
534* consideration. Forwards search will take the end accents into consideration
535* for the default shift values and the backwards search will take the prefix
536* accents into consideration.
537* If pattern has no non-ignorable ce, we return a illegal argument error.
538* Internal method, status assumed to be success.
539* @param strsrch UStringSearch data storage
540* @param status for output errors if it occurs, status is assumed to be a
541* success when it is passed in.
729e4ab9 542*/
b75a7d8f 543static
729e4ab9 544inline void initialize(UStringSearch *strsrch, UErrorCode *status)
b75a7d8f 545{
729e4ab9 546 int16_t expandlength = initializePattern(strsrch, status);
b75a7d8f
A
547 if (U_SUCCESS(*status) && strsrch->pattern.CELength > 0) {
548 UPattern *pattern = &strsrch->pattern;
549 int32_t cesize = pattern->CELength;
550
729e4ab9 551 int16_t minlength = cesize > expandlength
374ca955 552 ? (int16_t)cesize - expandlength : 1;
b75a7d8f
A
553 pattern->defaultShiftSize = minlength;
554 setShiftTable(pattern->shift, pattern->backShift, pattern->CE,
555 cesize, expandlength, minlength, minlength);
556 return;
557 }
558 strsrch->pattern.defaultShiftSize = 0;
559}
560
46f4442e
A
561#if BOYER_MOORE
562/**
729e4ab9 563* Check to make sure that the match length is at the end of the character by
46f4442e 564* using the breakiterator.
729e4ab9 565* @param strsrch string search data
46f4442e
A
566* @param start target text start offset
567* @param end target text end offset
568*/
569static
729e4ab9 570void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
46f4442e
A
571 int32_t *end)
572{
573#if !UCONFIG_NO_BREAK_ITERATION
574 UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
575 if (breakiterator) {
4388f060
A
576 int32_t matchend = *end;
577 //int32_t matchstart = *start;
729e4ab9 578
4388f060
A
579 if (!ubrk_isBoundary(breakiterator, matchend)) {
580 *end = ubrk_following(breakiterator, matchend);
46f4442e 581 }
729e4ab9 582
4388f060
A
583 /* Check the start of the matched text to make sure it doesn't have any accents
584 * before it. This code may not be necessary and so it is commented out */
585 /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
586 *start = ubrk_preceding(breakiterator, matchstart);
587 }*/
46f4442e
A
588 }
589#endif
590}
591
b75a7d8f 592/**
729e4ab9
A
593* Determine whether the target text in UStringSearch bounded by the offset
594* start and end is one or more whole units of text as
b75a7d8f 595* determined by the breakiterator in UStringSearch.
729e4ab9 596* @param strsrch string search data
b75a7d8f
A
597* @param start target text start offset
598* @param end target text end offset
599*/
600static
729e4ab9 601UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
b75a7d8f
A
602 int32_t end)
603{
604#if !UCONFIG_NO_BREAK_ITERATION
605 UBreakIterator *breakiterator = strsrch->search->breakIter;
46f4442e 606 //TODO: Add here.
b75a7d8f
A
607 if (breakiterator) {
608 int32_t startindex = ubrk_first(breakiterator);
609 int32_t endindex = ubrk_last(breakiterator);
729e4ab9 610
b75a7d8f
A
611 // out-of-range indexes are never boundary positions
612 if (start < startindex || start > endindex ||
613 end < startindex || end > endindex) {
614 return FALSE;
615 }
729e4ab9
A
616 // otherwise, we can use following() on the position before the
617 // specified one and return true of the position we get back is the
b75a7d8f 618 // one the user specified
729e4ab9
A
619 UBool result = (start == startindex ||
620 ubrk_following(breakiterator, start - 1) == start) &&
621 (end == endindex ||
b75a7d8f
A
622 ubrk_following(breakiterator, end - 1) == end);
623 if (result) {
624 // iterates the individual ces
625 UCollationElements *coleiter = strsrch->utilIter;
729e4ab9 626 const UChar *text = strsrch->search->text +
b75a7d8f
A
627 start;
628 UErrorCode status = U_ZERO_ERROR;
629 ucol_setText(coleiter, text, end - start, &status);
630 for (int32_t count = 0; count < strsrch->pattern.CELength;
631 count ++) {
374ca955 632 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
b75a7d8f
A
633 if (ce == UCOL_IGNORABLE) {
634 count --;
635 continue;
636 }
637 if (U_FAILURE(status) || ce != strsrch->pattern.CE[count]) {
638 return FALSE;
639 }
640 }
374ca955 641 int32_t nextce = ucol_next(coleiter, &status);
b75a7d8f
A
642 while (ucol_getOffset(coleiter) == (end - start)
643 && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
644 nextce = ucol_next(coleiter, &status);
645 }
646 if (ucol_getOffset(coleiter) == (end - start)
647 && nextce != UCOL_NULLORDER) {
648 // extra collation elements at the end of the match
649 return FALSE;
650 }
651 }
652 return result;
653 }
654#endif
655 return TRUE;
656}
657
658/**
729e4ab9
A
659* Getting the next base character offset if current offset is an accent,
660* or the current offset if the current character contains a base character.
b75a7d8f
A
661* accents the following base character will be returned
662* @param text string
663* @param textoffset current offset
664* @param textlength length of text string
665* @return the next base character or the current offset
666* if the current character is contains a base character.
667*/
668static
729e4ab9 669inline int32_t getNextBaseOffset(const UChar *text,
b75a7d8f
A
670 int32_t textoffset,
671 int32_t textlength)
672{
673 if (textoffset < textlength) {
674 int32_t temp = textoffset;
675 if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
729e4ab9 676 while (temp < textlength) {
b75a7d8f 677 int32_t result = temp;
729e4ab9 678 if ((getFCD(text, &temp, textlength) >>
b75a7d8f
A
679 SECOND_LAST_BYTE_SHIFT_) == 0) {
680 return result;
681 }
682 }
683 return textlength;
684 }
685 }
686 return textoffset;
687}
688
689/**
690* Gets the next base character offset depending on the string search pattern
691* data
692* @param strsrch string search data
693* @param textoffset current offset, one offset away from the last character
694* to search for.
695* @return start index of the next base character or the current offset
696* if the current character is contains a base character.
697*/
698static
729e4ab9 699inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
b75a7d8f
A
700 int32_t textoffset)
701{
374ca955 702 int32_t textlength = strsrch->search->textLength;
729e4ab9 703 if (strsrch->pattern.hasSuffixAccents &&
b75a7d8f
A
704 textoffset < textlength) {
705 int32_t temp = textoffset;
706 const UChar *text = strsrch->search->text;
4388f060 707 U16_BACK_1(text, 0, temp);
b75a7d8f
A
708 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
709 return getNextBaseOffset(text, textoffset, textlength);
710 }
711 }
712 return textoffset;
713}
714
715/**
716* Shifting the collation element iterator position forward to prepare for
717* a following match. If the last character is a unsafe character, we'll only
718* shift by 1 to capture contractions, normalization etc.
719* Internal method, status assumed to be success.
720* @param text strsrch string search data
721* @param textoffset start text position to do search
722* @param ce the text ce which failed the match.
723* @param patternceindex index of the ce within the pattern ce buffer which
724* failed the match
725* @return final offset
726*/
727static
728inline int32_t shiftForward(UStringSearch *strsrch,
729 int32_t textoffset,
374ca955 730 int32_t ce,
b75a7d8f
A
731 int32_t patternceindex)
732{
374ca955 733 UPattern *pattern = &(strsrch->pattern);
b75a7d8f
A
734 if (ce != UCOL_NULLORDER) {
735 int32_t shift = pattern->shift[hash(ce)];
729e4ab9 736 // this is to adjust for characters in the middle of the
b75a7d8f
A
737 // substring for matching that failed.
738 int32_t adjust = pattern->CELength - patternceindex;
739 if (adjust > 1 && shift >= adjust) {
740 shift -= adjust - 1;
741 }
742 textoffset += shift;
743 }
744 else {
745 textoffset += pattern->defaultShiftSize;
746 }
729e4ab9 747
b75a7d8f
A
748 textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
749 // check for unsafe characters
729e4ab9 750 // * if it is the start or middle of a contraction: to be done after
b75a7d8f
A
751 // a initial match is found
752 // * thai or lao base consonant character: similar to contraction
753 // * high surrogate character: similar to contraction
754 // * next character is a accent: shift to the next base character
755 return textoffset;
756}
46f4442e 757#endif // #if BOYER_MOORE
b75a7d8f
A
758
759/**
729e4ab9 760* sets match not found
b75a7d8f
A
761* @param strsrch string search data
762*/
763static
729e4ab9 764inline void setMatchNotFound(UStringSearch *strsrch)
b75a7d8f
A
765{
766 // this method resets the match result regardless of the error status.
767 strsrch->search->matchedIndex = USEARCH_DONE;
768 strsrch->search->matchedLength = 0;
769 if (strsrch->search->isForwardSearching) {
770 setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
771 }
772 else {
773 setColEIterOffset(strsrch->textIter, 0);
774 }
775}
776
46f4442e 777#if BOYER_MOORE
b75a7d8f
A
778/**
779* Gets the offset to the next safe point in text.
780* ie. not the middle of a contraction, swappable characters or supplementary
781* characters.
782* @param collator collation sata
783* @param text string to work with
784* @param textoffset offset in string
785* @param textlength length of text string
786* @return offset to the next safe character
787*/
788static
729e4ab9 789inline int32_t getNextSafeOffset(const UCollator *collator,
b75a7d8f
A
790 const UChar *text,
791 int32_t textoffset,
792 int32_t textlength)
793{
794 int32_t result = textoffset; // first contraction character
795 while (result != textlength && ucol_unsafeCP(text[result], collator)) {
796 result ++;
797 }
729e4ab9 798 return result;
b75a7d8f
A
799}
800
729e4ab9 801/**
b75a7d8f
A
802* This checks for accents in the potential match started with a .
803* composite character.
729e4ab9
A
804* This is really painful... we have to check that composite character do not
805* have any extra accents. We have to normalize the potential match and find
b75a7d8f 806* the immediate decomposed character before the match.
729e4ab9 807* The first composite character would have been taken care of by the fcd
b75a7d8f 808* checks in checkForwardExactMatch.
729e4ab9
A
809* This is the slow path after the fcd of the first character and
810* the last character has been checked by checkForwardExactMatch and we
b75a7d8f
A
811* determine that the potential match has extra non-ignorable preceding
812* ces.
729e4ab9 813* E.g. looking for \u0301 acute in \u01FA A ring above and acute,
b75a7d8f
A
814* checkExtraMatchAccent should fail since there is a middle ring in \u01FA
815* Note here that accents checking are slow and cautioned in the API docs.
816* Internal method, status assumed to be a success, caller should check status
817* before calling this method
818* @param strsrch string search data
819* @param start index of the potential unfriendly composite character
820* @param end index of the potential unfriendly composite character
821* @param status output error status if any.
822* @return TRUE if there is non-ignorable accents before at the beginning
823* of the match, FALSE otherwise.
824*/
825
826static
827UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
729e4ab9 828 int32_t end,
b75a7d8f
A
829 UErrorCode *status)
830{
831 UBool result = FALSE;
832 if (strsrch->pattern.hasPrefixAccents) {
833 int32_t length = end - start;
834 int32_t offset = 0;
835 const UChar *text = strsrch->search->text + start;
729e4ab9 836
4388f060 837 U16_FWD_1(text, offset, length);
b75a7d8f
A
838 // we are only concerned with the first composite character
839 if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
729e4ab9 840 int32_t safeoffset = getNextSafeOffset(strsrch->collator,
b75a7d8f
A
841 text, 0, length);
842 if (safeoffset != length) {
843 safeoffset ++;
844 }
845 UChar *norm = NULL;
846 UChar buffer[INITIAL_ARRAY_SIZE_];
729e4ab9
A
847 int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
848 buffer, INITIAL_ARRAY_SIZE_,
849 status);
b75a7d8f
A
850 if (U_FAILURE(*status)) {
851 return FALSE;
852 }
853 if (size >= INITIAL_ARRAY_SIZE_) {
854 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
855 status);
729e4ab9 856 // if allocation failed, status will be set to
b75a7d8f
A
857 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
858 // checks for it.
729e4ab9 859 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
b75a7d8f
A
860 size, status);
861 if (U_FAILURE(*status) && norm != NULL) {
862 uprv_free(norm);
863 return FALSE;
864 }
865 }
866 else {
867 norm = buffer;
868 }
869
870 UCollationElements *coleiter = strsrch->utilIter;
871 ucol_setText(coleiter, norm, size, status);
872 uint32_t firstce = strsrch->pattern.CE[0];
873 UBool ignorable = TRUE;
874 uint32_t ce = UCOL_IGNORABLE;
46f4442e 875 while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
b75a7d8f
A
876 offset = ucol_getOffset(coleiter);
877 if (ce != firstce && ce != UCOL_IGNORABLE) {
878 ignorable = FALSE;
879 }
880 ce = ucol_next(coleiter, status);
881 }
882 UChar32 codepoint;
4388f060 883 U16_PREV(norm, 0, offset, codepoint);
b75a7d8f
A
884 result = !ignorable && (u_getCombiningClass(codepoint) != 0);
885
886 if (norm != buffer) {
887 uprv_free(norm);
888 }
889 }
890 }
891
892 return result;
893}
894
895/**
729e4ab9 896* Used by exact matches, checks if there are accents before the match.
b75a7d8f 897* This is really painful... we have to check that composite characters at
729e4ab9
A
898* the start of the matches have to not have any extra accents.
899* We check the FCD of the character first, if it starts with an accent and
b75a7d8f 900* the first pattern ce does not match the first ce of the character, we bail.
729e4ab9
A
901* Otherwise we try normalizing the first composite
902* character and find the immediate decomposed character before the match to
b75a7d8f 903* see if it is an non-ignorable accent.
729e4ab9
A
904* Now normalizing the first composite character is enough because we ensure
905* that when the match is passed in here with extra beginning ces, the
b75a7d8f 906* first or last ce that match has to occur within the first character.
729e4ab9 907* E.g. looking for \u0301 acute in \u01FA A ring above and acute,
b75a7d8f
A
908* checkExtraMatchAccent should fail since there is a middle ring in \u01FA
909* Note here that accents checking are slow and cautioned in the API docs.
910* @param strsrch string search data
729e4ab9 911* @param start offset
b75a7d8f 912* @param end offset
729e4ab9 913* @return TRUE if there are accents on either side of the match,
b75a7d8f
A
914* FALSE otherwise
915*/
916static
917UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
729e4ab9 918 int32_t end)
b75a7d8f
A
919{
920 if (strsrch->pattern.hasPrefixAccents) {
921 UCollationElements *coleiter = strsrch->textIter;
922 UErrorCode status = U_ZERO_ERROR;
923 // we have been iterating forwards previously
924 uint32_t ignorable = TRUE;
374ca955 925 int32_t firstce = strsrch->pattern.CE[0];
b75a7d8f 926
374ca955
A
927 setColEIterOffset(coleiter, start);
928 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
929 if (U_FAILURE(status)) {
b75a7d8f
A
930 return TRUE;
931 }
932 while (ce != firstce) {
933 if (ce != UCOL_IGNORABLE) {
934 ignorable = FALSE;
935 }
936 ce = getCE(strsrch, ucol_next(coleiter, &status));
46f4442e 937 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
b75a7d8f
A
938 return TRUE;
939 }
940 }
374ca955 941 if (!ignorable && inNormBuf(coleiter)) {
b75a7d8f 942 // within normalization buffer, discontiguous handled here
374ca955 943 return TRUE;
b75a7d8f
A
944 }
945
374ca955 946 // within text
b75a7d8f 947 int32_t temp = start;
374ca955 948 // original code
729e4ab9
A
949 // accent = (getFCD(strsrch->search->text, &temp,
950 // strsrch->search->textLength)
951 // >> SECOND_LAST_BYTE_SHIFT_);
374ca955 952 // however this code does not work well with VC7 .net in release mode.
729e4ab9 953 // maybe the inlines for getFCD combined with shifting has bugs in
374ca955 954 // VC7. anyways this is a work around.
729e4ab9 955 UBool accent = getFCD(strsrch->search->text, &temp,
b75a7d8f
A
956 strsrch->search->textLength) > 0xFF;
957 if (!accent) {
374ca955 958 return checkExtraMatchAccents(strsrch, start, end, &status);
b75a7d8f 959 }
374ca955 960 if (!ignorable) {
b75a7d8f
A
961 return TRUE;
962 }
963 if (start > 0) {
964 temp = start;
4388f060 965 U16_BACK_1(strsrch->search->text, 0, temp);
729e4ab9 966 if (getFCD(strsrch->search->text, &temp,
b75a7d8f
A
967 strsrch->search->textLength) & LAST_BYTE_MASK_) {
968 setColEIterOffset(coleiter, start);
969 ce = ucol_previous(coleiter, &status);
729e4ab9 970 if (U_FAILURE(status) ||
b75a7d8f
A
971 (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
972 return TRUE;
973 }
974 }
975 }
976 }
729e4ab9 977
b75a7d8f
A
978 return FALSE;
979}
980
981/**
982* Used by exact matches, checks if there are accents bounding the match.
983* Note this is the initial boundary check. If the potential match
984* starts or ends with composite characters, the accents in those
985* characters will be determined later.
729e4ab9 986* Not doing backwards iteration here, since discontiguos contraction for
b75a7d8f 987* backwards collation element iterator, use up too many characters.
729e4ab9 988* E.g. looking for \u030A ring in \u01FA A ring above and acute,
b75a7d8f
A
989* should fail since there is a acute at the end of \u01FA
990* Note here that accents checking are slow and cautioned in the API docs.
991* @param strsrch string search data
992* @param start offset of match
993* @param end end offset of the match
729e4ab9 994* @return TRUE if there are accents on either side of the match,
b75a7d8f
A
995* FALSE otherwise
996*/
997static
729e4ab9
A
998UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
999 int32_t end)
b75a7d8f
A
1000{
1001 if (strsrch->pattern.hasSuffixAccents) {
1002 const UChar *text = strsrch->search->text;
1003 int32_t temp = end;
1004 int32_t textlength = strsrch->search->textLength;
4388f060 1005 U16_BACK_1(text, 0, temp);
b75a7d8f 1006 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
374ca955 1007 int32_t firstce = strsrch->pattern.CE[0];
b75a7d8f
A
1008 UCollationElements *coleiter = strsrch->textIter;
1009 UErrorCode status = U_ZERO_ERROR;
4388f060 1010 int32_t ce;
b75a7d8f 1011 setColEIterOffset(coleiter, start);
46f4442e
A
1012 while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
1013 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
b75a7d8f
A
1014 return TRUE;
1015 }
1016 }
1017 int32_t count = 1;
1018 while (count < strsrch->pattern.CELength) {
729e4ab9 1019 if (getCE(strsrch, ucol_next(coleiter, &status))
b75a7d8f
A
1020 == UCOL_IGNORABLE) {
1021 // Thai can give an ignorable here.
1022 count --;
1023 }
1024 if (U_FAILURE(status)) {
1025 return TRUE;
1026 }
1027 count ++;
1028 }
729e4ab9 1029
4388f060 1030 ce = ucol_next(coleiter, &status);
b75a7d8f
A
1031 if (U_FAILURE(status)) {
1032 return TRUE;
1033 }
46f4442e 1034 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
4388f060 1035 ce = getCE(strsrch, ce);
46f4442e 1036 }
b75a7d8f
A
1037 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1038 if (ucol_getOffset(coleiter) <= end) {
1039 return TRUE;
1040 }
1041 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
1042 return TRUE;
1043 }
1044 }
1045 }
1046 }
1047 return FALSE;
1048}
46f4442e 1049#endif // #if BOYER_MOORE
b75a7d8f
A
1050
1051/**
1052* Checks if the offset runs out of the text string
729e4ab9 1053* @param offset
b75a7d8f
A
1054* @param textlength of the text string
1055* @return TRUE if offset is out of bounds, FALSE otherwise
1056*/
1057static
1058inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
1059{
1060 return offset < 0 || offset > textlength;
1061}
1062
1063/**
1064* Checks for identical match
1065* @param strsrch string search data
1066* @param start offset of possible match
1067* @param end offset of possible match
1068* @return TRUE if identical match is found
1069*/
1070static
729e4ab9
A
1071inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
1072 int32_t end)
b75a7d8f 1073{
b75a7d8f
A
1074 if (strsrch->strength != UCOL_IDENTICAL) {
1075 return TRUE;
1076 }
1077
729e4ab9
A
1078 // Note: We could use Normalizer::compare() or similar, but for short strings
1079 // which may not be in FCD it might be faster to just NFD them.
1080 UErrorCode status = U_ZERO_ERROR;
1081 UnicodeString t2, p2;
1082 strsrch->nfd->normalize(
1083 UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
1084 strsrch->nfd->normalize(
1085 UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
73c04bcf 1086 // return FALSE if NFD failed
729e4ab9 1087 return U_SUCCESS(status) && t2 == p2;
b75a7d8f
A
1088}
1089
729e4ab9 1090#if BOYER_MOORE
b75a7d8f
A
1091/**
1092* Checks to see if the match is repeated
1093* @param strsrch string search data
1094* @param start new match start index
1095* @param end new match end index
1096* @return TRUE if the the match is repeated, FALSE otherwise
1097*/
1098static
1099inline UBool checkRepeatedMatch(UStringSearch *strsrch,
1100 int32_t start,
1101 int32_t end)
1102{
1103 int32_t lastmatchindex = strsrch->search->matchedIndex;
1104 UBool result;
1105 if (lastmatchindex == USEARCH_DONE) {
1106 return FALSE;
1107 }
1108 if (strsrch->search->isForwardSearching) {
1109 result = start <= lastmatchindex;
1110 }
1111 else {
1112 result = start >= lastmatchindex;
1113 }
374ca955 1114 if (!result && !strsrch->search->isOverlap) {
b75a7d8f
A
1115 if (strsrch->search->isForwardSearching) {
1116 result = start < lastmatchindex + strsrch->search->matchedLength;
1117 }
1118 else {
1119 result = end > lastmatchindex;
1120 }
1121 }
1122 return result;
1123}
1124
1125/**
1126* Gets the collation element iterator's current offset.
1127* @param coleiter collation element iterator
1128* @param forwards flag TRUE if we are moving in th forwards direction
729e4ab9 1129* @return current offset
b75a7d8f
A
1130*/
1131static
1132inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
1133 UBool forwards)
1134{
1135 int32_t result = ucol_getOffset(coleiter);
1136 // intricacies of the the backwards collation element iterator
46f4442e 1137 if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
b75a7d8f
A
1138 result ++;
1139 }
1140 return result;
1141}
1142
1143/**
729e4ab9 1144* Checks match for contraction.
b75a7d8f
A
1145* If the match ends with a partial contraction we fail.
1146* If the match starts too far off (because of backwards iteration) we try to
1147* chip off the extra characters depending on whether a breakiterator has
1148* been used.
729e4ab9 1149* Internal method, error assumed to be success, caller has to check status
b75a7d8f
A
1150* before calling this method.
1151* @param strsrch string search data
1152* @param start offset of potential match, to be modified if necessary
1153* @param end offset of potential match, to be modified if necessary
1154* @param status output error status if any
1155* @return TRUE if match passes the contraction test, FALSE otherwise
1156*/
1157
1158static
729e4ab9
A
1159UBool checkNextExactContractionMatch(UStringSearch *strsrch,
1160 int32_t *start,
1161 int32_t *end, UErrorCode *status)
b75a7d8f
A
1162{
1163 UCollationElements *coleiter = strsrch->textIter;
1164 int32_t textlength = strsrch->search->textLength;
46f4442e 1165 int32_t temp = *start;
b75a7d8f
A
1166 const UCollator *collator = strsrch->collator;
1167 const UChar *text = strsrch->search->text;
729e4ab9 1168 // This part checks if either ends of the match contains potential
b75a7d8f 1169 // contraction. If so we'll have to iterate through them
374ca955
A
1170 // The start contraction needs to be checked since ucol_previous dumps
1171 // all characters till the first safe character into the buffer.
729e4ab9
A
1172 // *start + 1 is used to test for the unsafe characters instead of *start
1173 // because ucol_prev takes all unsafe characters till the first safe
1174 // character ie *start. so by testing *start + 1, we can estimate if
1175 // excess prefix characters has been included in the potential search
374ca955 1176 // results.
729e4ab9
A
1177 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1178 (*start + 1 < textlength
b75a7d8f
A
1179 && ucol_unsafeCP(text[*start + 1], collator))) {
1180 int32_t expansion = getExpansionPrefix(coleiter);
1181 UBool expandflag = expansion > 0;
1182 setColEIterOffset(coleiter, *start);
1183 while (expansion > 0) {
1184 // getting rid of the redundant ce, caused by setOffset.
729e4ab9
A
1185 // since backward contraction/expansion may have extra ces if we
1186 // are in the normalization buffer, hasAccentsBeforeMatch would
b75a7d8f
A
1187 // have taken care of it.
1188 // E.g. the character \u01FA will have an expansion of 3, but if
1189 // we are only looking for acute and ring \u030A and \u0301, we'll
1190 // have to skip the first ce in the expansion buffer.
1191 ucol_next(coleiter, status);
374ca955
A
1192 if (U_FAILURE(*status)) {
1193 return FALSE;
1194 }
b75a7d8f
A
1195 if (ucol_getOffset(coleiter) != temp) {
1196 *start = temp;
1197 temp = ucol_getOffset(coleiter);
1198 }
1199 expansion --;
1200 }
1201
374ca955 1202 int32_t *patternce = strsrch->pattern.CE;
b75a7d8f
A
1203 int32_t patterncelength = strsrch->pattern.CELength;
1204 int32_t count = 0;
1205 while (count < patterncelength) {
374ca955 1206 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
b75a7d8f
A
1207 if (ce == UCOL_IGNORABLE) {
1208 continue;
1209 }
1210 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1211 *start = temp;
1212 temp = ucol_getOffset(coleiter);
1213 }
1214 if (U_FAILURE(*status) || ce != patternce[count]) {
1215 (*end) ++;
729e4ab9 1216 *end = getNextUStringSearchBaseOffset(strsrch, *end);
b75a7d8f
A
1217 return FALSE;
1218 }
1219 count ++;
1220 }
729e4ab9 1221 }
b75a7d8f
A
1222 return TRUE;
1223}
1224
1225/**
1226* Checks and sets the match information if found.
729e4ab9 1227* Checks
b75a7d8f
A
1228* <ul>
1229* <li> the potential match does not repeat the previous match
1230* <li> boundaries are correct
1231* <li> exact matches has no extra accents
1232* <li> identical matchesb
1233* <li> potential match does not end in the middle of a contraction
1234* <\ul>
1235* Otherwise the offset will be shifted to the next character.
729e4ab9 1236* Internal method, status assumed to be success, caller has to check status
b75a7d8f
A
1237* before calling this method.
1238* @param strsrch string search data
1239* @param textoffset offset in the collation element text. the returned value
729e4ab9 1240* will be the truncated end offset of the match or the new start
b75a7d8f
A
1241* search offset.
1242* @param status output error status if any
1243* @return TRUE if the match is valid, FALSE otherwise
1244*/
1245static
729e4ab9 1246inline UBool checkNextExactMatch(UStringSearch *strsrch,
b75a7d8f
A
1247 int32_t *textoffset, UErrorCode *status)
1248{
1249 UCollationElements *coleiter = strsrch->textIter;
729e4ab9
A
1250 int32_t start = getColElemIterOffset(coleiter, FALSE);
1251
374ca955
A
1252 if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
1253 return FALSE;
b75a7d8f
A
1254 }
1255
1256 // this totally matches, however we need to check if it is repeating
1257 if (!isBreakUnit(strsrch, start, *textoffset) ||
729e4ab9
A
1258 checkRepeatedMatch(strsrch, start, *textoffset) ||
1259 hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
b75a7d8f
A
1260 !checkIdentical(strsrch, start, *textoffset) ||
1261 hasAccentsAfterMatch(strsrch, start, *textoffset)) {
374ca955
A
1262
1263 (*textoffset) ++;
729e4ab9 1264 *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
374ca955 1265 return FALSE;
b75a7d8f 1266 }
46f4442e
A
1267
1268 //Add breakiterator boundary check for primary strength search.
1269 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
4388f060 1270 checkBreakBoundary(strsrch, &start, textoffset);
46f4442e 1271 }
729e4ab9 1272
b75a7d8f
A
1273 // totally match, we will get rid of the ending ignorables.
1274 strsrch->search->matchedIndex = start;
1275 strsrch->search->matchedLength = *textoffset - start;
374ca955 1276 return TRUE;
b75a7d8f
A
1277}
1278
1279/**
729e4ab9 1280* Getting the previous base character offset, or the current offset if the
b75a7d8f
A
1281* current character is a base character
1282* @param text string
1283* @param textoffset one offset after the current character
729e4ab9 1284* @return the offset of the next character after the base character or the first
b75a7d8f
A
1285* composed character with accents
1286*/
1287static
729e4ab9 1288inline int32_t getPreviousBaseOffset(const UChar *text,
b75a7d8f
A
1289 int32_t textoffset)
1290{
1291 if (textoffset > 0) {
46f4442e 1292 for (;;) {
b75a7d8f 1293 int32_t result = textoffset;
4388f060 1294 U16_BACK_1(text, 0, textoffset);
b75a7d8f
A
1295 int32_t temp = textoffset;
1296 uint16_t fcd = getFCD(text, &temp, result);
1297 if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1298 if (fcd & LAST_BYTE_MASK_) {
1299 return textoffset;
1300 }
1301 return result;
1302 }
1303 if (textoffset == 0) {
1304 return 0;
1305 }
1306 }
1307 }
1308 return textoffset;
1309}
1310
1311/**
1312* Getting the indexes of the accents that are not blocked in the argument
1313* accent array
1314* @param accents array of accents in nfd terminated by a 0.
1315* @param accentsindex array of indexes of the accents that are not blocked
1316*/
1317static
1318inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
1319{
1320 int32_t index = 0;
1321 int32_t length = u_strlen(accents);
1322 UChar32 codepoint = 0;
1323 int cclass = 0;
1324 int result = 0;
1325 int32_t temp;
1326 while (index < length) {
1327 temp = index;
4388f060 1328 U16_NEXT(accents, index, length, codepoint);
b75a7d8f
A
1329 if (u_getCombiningClass(codepoint) != cclass) {
1330 cclass = u_getCombiningClass(codepoint);
1331 accentsindex[result] = temp;
1332 result ++;
1333 }
1334 }
1335 accentsindex[result] = length;
1336 return result;
1337}
1338
1339/**
1340* Appends 3 UChar arrays to a destination array.
729e4ab9 1341* Creates a new array if we run out of space. The caller will have to
b75a7d8f 1342* manually deallocate the newly allocated array.
729e4ab9
A
1343* Internal method, status assumed to be success, caller has to check status
1344* before calling this method. destination not to be NULL and has at least
b75a7d8f
A
1345* size destinationlength.
1346* @param destination target array
1347* @param destinationlength target array size, returning the appended length
1348* @param source1 null-terminated first array
1349* @param source2 second array
1350* @param source2length length of seond array
1351* @param source3 null-terminated third array
1352* @param status error status if any
1353* @return new destination array, destination if there was no new allocation
1354*/
1355static
729e4ab9
A
1356inline UChar * addToUCharArray( UChar *destination,
1357 int32_t *destinationlength,
1358 const UChar *source1,
b75a7d8f 1359 const UChar *source2,
729e4ab9
A
1360 int32_t source2length,
1361 const UChar *source3,
1362 UErrorCode *status)
b75a7d8f
A
1363{
1364 int32_t source1length = source1 ? u_strlen(source1) : 0;
729e4ab9
A
1365 int32_t source3length = source3 ? u_strlen(source3) : 0;
1366 if (*destinationlength < source1length + source2length + source3length +
1367 1)
b75a7d8f
A
1368 {
1369 destination = (UChar *)allocateMemory(
1370 (source1length + source2length + source3length + 1) * sizeof(UChar),
1371 status);
729e4ab9 1372 // if error allocating memory, status will be
b75a7d8f
A
1373 // U_MEMORY_ALLOCATION_ERROR
1374 if (U_FAILURE(*status)) {
1375 *destinationlength = 0;
1376 return NULL;
1377 }
1378 }
1379 if (source1length != 0) {
1380 uprv_memcpy(destination, source1, sizeof(UChar) * source1length);
1381 }
1382 if (source2length != 0) {
729e4ab9 1383 uprv_memcpy(destination + source1length, source2,
b75a7d8f
A
1384 sizeof(UChar) * source2length);
1385 }
1386 if (source3length != 0) {
729e4ab9 1387 uprv_memcpy(destination + source1length + source2length, source3,
b75a7d8f
A
1388 sizeof(UChar) * source3length);
1389 }
1390 *destinationlength = source1length + source2length + source3length;
1391 return destination;
1392}
1393
1394/**
1395* Running through a collation element iterator to see if the contents matches
1396* pattern in string search data
1397* @param strsrch string search data
1398* @param coleiter collation element iterator
1399* @return TRUE if a match if found, FALSE otherwise
1400*/
1401static
729e4ab9 1402inline UBool checkCollationMatch(const UStringSearch *strsrch,
b75a7d8f
A
1403 UCollationElements *coleiter)
1404{
1405 int patternceindex = strsrch->pattern.CELength;
374ca955 1406 int32_t *patternce = strsrch->pattern.CE;
b75a7d8f
A
1407 UErrorCode status = U_ZERO_ERROR;
1408 while (patternceindex > 0) {
374ca955 1409 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
b75a7d8f
A
1410 if (ce == UCOL_IGNORABLE) {
1411 continue;
1412 }
1413 if (U_FAILURE(status) || ce != *patternce) {
1414 return FALSE;
1415 }
1416 patternce ++;
1417 patternceindex --;
1418 }
1419 return TRUE;
1420}
1421
1422/**
1423* Rearranges the front accents to try matching.
729e4ab9
A
1424* Prefix accents in the text will be grouped according to their combining
1425* class and the groups will be mixed and matched to try find the perfect
b75a7d8f
A
1426* match with the pattern.
1427* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1428* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
729e4ab9 1429* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
b75a7d8f
A
1430* "\u0301\u0325".
1431* step 2: check if any of the generated substrings matches the pattern.
1432* Internal method, status is assumed to be success, caller has to check status
1433* before calling this method.
1434* @param strsrch string search match
1435* @param start first offset of the accents to start searching
1436* @param end start of the last accent set
1437* @param status output error status if any
1438* @return USEARCH_DONE if a match is not found, otherwise return the starting
1439* offset of the match. Note this start includes all preceding accents.
1440*/
1441static
729e4ab9 1442int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
b75a7d8f 1443 int32_t start,
729e4ab9 1444 int32_t end,
b75a7d8f
A
1445 UErrorCode *status)
1446{
1447 const UChar *text = strsrch->search->text;
1448 int32_t textlength = strsrch->search->textLength;
1449 int32_t tempstart = start;
1450
1451 if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
1452 // die... failed at a base character
1453 return USEARCH_DONE;
1454 }
1455
1456 int32_t offset = getNextBaseOffset(text, tempstart, textlength);
1457 start = getPreviousBaseOffset(text, tempstart);
1458
1459 UChar accents[INITIAL_ARRAY_SIZE_];
1460 // normalizing the offensive string
729e4ab9
A
1461 unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
1462 INITIAL_ARRAY_SIZE_, status);
b75a7d8f
A
1463 if (U_FAILURE(*status)) {
1464 return USEARCH_DONE;
1465 }
729e4ab9
A
1466
1467 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1468 int32_t accentsize = getUnblockedAccentIndex(accents,
b75a7d8f 1469 accentsindex);
729e4ab9 1470 int32_t count = (2 << (accentsize - 1)) - 1;
b75a7d8f
A
1471 UChar buffer[INITIAL_ARRAY_SIZE_];
1472 UCollationElements *coleiter = strsrch->utilIter;
1473 while (U_SUCCESS(*status) && count > 0) {
1474 UChar *rearrange = strsrch->canonicalPrefixAccents;
1475 // copy the base characters
1476 for (int k = 0; k < accentsindex[0]; k ++) {
1477 *rearrange ++ = accents[k];
1478 }
1479 // forming all possible canonical rearrangement by dropping
1480 // sets of accents
1481 for (int i = 0; i <= accentsize - 1; i ++) {
1482 int32_t mask = 1 << (accentsize - i - 1);
1483 if (count & mask) {
1484 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1485 *rearrange ++ = accents[j];
1486 }
1487 }
1488 }
1489 *rearrange = 0;
1490 int32_t matchsize = INITIAL_ARRAY_SIZE_;
1491 UChar *match = addToUCharArray(buffer, &matchsize,
1492 strsrch->canonicalPrefixAccents,
1493 strsrch->search->text + offset,
1494 end - offset,
1495 strsrch->canonicalSuffixAccents,
1496 status);
729e4ab9 1497
b75a7d8f
A
1498 // if status is a failure, ucol_setText does nothing.
1499 // run the collator iterator through this match
1500 ucol_setText(coleiter, match, matchsize, status);
1501 if (U_SUCCESS(*status)) {
1502 if (checkCollationMatch(strsrch, coleiter)) {
1503 if (match != buffer) {
1504 uprv_free(match);
1505 }
1506 return start;
1507 }
1508 }
1509 count --;
1510 }
1511 return USEARCH_DONE;
1512}
1513
1514/**
1515* Gets the offset to the safe point in text before textoffset.
1516* ie. not the middle of a contraction, swappable characters or supplementary
1517* characters.
1518* @param collator collation sata
1519* @param text string to work with
1520* @param textoffset offset in string
1521* @param textlength length of text string
1522* @return offset to the previous safe character
1523*/
1524static
729e4ab9 1525inline uint32_t getPreviousSafeOffset(const UCollator *collator,
b75a7d8f
A
1526 const UChar *text,
1527 int32_t textoffset)
1528{
1529 int32_t result = textoffset; // first contraction character
1530 while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
1531 result --;
1532 }
1533 if (result != 0) {
1534 // the first contraction character is consider unsafe here
1535 result --;
1536 }
729e4ab9 1537 return result;
b75a7d8f
A
1538}
1539
1540/**
1541* Cleaning up after we passed the safe zone
1542* @param strsrch string search data
1543* @param safetext safe text array
1544* @param safebuffer safe text buffer
1545* @param coleiter collation element iterator for safe text
1546*/
1547static
1548inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
1549 UChar *safebuffer)
1550{
729e4ab9 1551 if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
b75a7d8f
A
1552 {
1553 uprv_free(safetext);
1554 }
1555}
1556
1557/**
1558* Take the rearranged end accents and tries matching. If match failed at
1559* a seperate preceding set of accents (seperated from the rearranged on by
729e4ab9 1560* at least a base character) then we rearrange the preceding accents and
b75a7d8f 1561* tries matching again.
729e4ab9 1562* We allow skipping of the ends of the accent set if the ces do not match.
b75a7d8f
A
1563* However if the failure is found before the accent set, it fails.
1564* Internal method, status assumed to be success, caller has to check status
1565* before calling this method.
1566* @param strsrch string search data
1567* @param textoffset of the start of the rearranged accent
1568* @param status output error status if any
1569* @return USEARCH_DONE if a match is not found, otherwise return the starting
1570* offset of the match. Note this start includes all preceding accents.
1571*/
1572static
729e4ab9 1573int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
b75a7d8f
A
1574 int32_t textoffset,
1575 UErrorCode *status)
1576{
1577 const UChar *text = strsrch->search->text;
1578 const UCollator *collator = strsrch->collator;
1579 int32_t safelength = 0;
1580 UChar *safetext;
1581 int32_t safetextlength;
1582 UChar safebuffer[INITIAL_ARRAY_SIZE_];
1583 UCollationElements *coleiter = strsrch->utilIter;
1584 int32_t safeoffset = textoffset;
1585
729e4ab9 1586 if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
b75a7d8f
A
1587 collator)) {
1588 safeoffset = getPreviousSafeOffset(collator, text, textoffset);
1589 safelength = textoffset - safeoffset;
1590 safetextlength = INITIAL_ARRAY_SIZE_;
729e4ab9
A
1591 safetext = addToUCharArray(safebuffer, &safetextlength, NULL,
1592 text + safeoffset, safelength,
1593 strsrch->canonicalSuffixAccents,
b75a7d8f
A
1594 status);
1595 }
1596 else {
1597 safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
1598 safetext = strsrch->canonicalSuffixAccents;
1599 }
1600
1601 // if status is a failure, ucol_setText does nothing
1602 ucol_setText(coleiter, safetext, safetextlength, status);
1603 // status checked in loop below
1604
374ca955
A
1605 int32_t *ce = strsrch->pattern.CE;
1606 int32_t celength = strsrch->pattern.CELength;
b75a7d8f
A
1607 int ceindex = celength - 1;
1608 UBool isSafe = TRUE; // indication flag for position in safe zone
729e4ab9 1609
b75a7d8f 1610 while (ceindex >= 0) {
374ca955 1611 int32_t textce = ucol_previous(coleiter, status);
b75a7d8f
A
1612 if (U_FAILURE(*status)) {
1613 if (isSafe) {
1614 cleanUpSafeText(strsrch, safetext, safebuffer);
1615 }
1616 return USEARCH_DONE;
1617 }
1618 if (textce == UCOL_NULLORDER) {
1619 // check if we have passed the safe buffer
1620 if (coleiter == strsrch->textIter) {
1621 cleanUpSafeText(strsrch, safetext, safebuffer);
1622 return USEARCH_DONE;
1623 }
1624 cleanUpSafeText(strsrch, safetext, safebuffer);
1625 safetext = safebuffer;
1626 coleiter = strsrch->textIter;
1627 setColEIterOffset(coleiter, safeoffset);
1628 // status checked at the start of the loop
1629 isSafe = FALSE;
1630 continue;
1631 }
1632 textce = getCE(strsrch, textce);
1633 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
1634 // do the beginning stuff
1635 int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
1636 if (isSafe && failedoffset >= safelength) {
1637 // alas... no hope. failed at rearranged accent set
1638 cleanUpSafeText(strsrch, safetext, safebuffer);
1639 return USEARCH_DONE;
1640 }
1641 else {
1642 if (isSafe) {
1643 failedoffset += safeoffset;
1644 cleanUpSafeText(strsrch, safetext, safebuffer);
1645 }
729e4ab9 1646
b75a7d8f 1647 // try rearranging the front accents
729e4ab9 1648 int32_t result = doNextCanonicalPrefixMatch(strsrch,
b75a7d8f
A
1649 failedoffset, textoffset, status);
1650 if (result != USEARCH_DONE) {
1651 // if status is a failure, ucol_setOffset does nothing
1652 setColEIterOffset(strsrch->textIter, result);
1653 }
1654 if (U_FAILURE(*status)) {
1655 return USEARCH_DONE;
1656 }
1657 return result;
1658 }
1659 }
1660 if (textce == ce[ceindex]) {
1661 ceindex --;
1662 }
1663 }
1664 // set offset here
1665 if (isSafe) {
1666 int32_t result = getColElemIterOffset(coleiter, FALSE);
1667 // sets the text iterator here with the correct expansion and offset
1668 int32_t leftoverces = getExpansionPrefix(coleiter);
1669 cleanUpSafeText(strsrch, safetext, safebuffer);
729e4ab9 1670 if (result >= safelength) {
b75a7d8f
A
1671 result = textoffset;
1672 }
1673 else {
1674 result += safeoffset;
1675 }
1676 setColEIterOffset(strsrch->textIter, result);
729e4ab9 1677 strsrch->textIter->iteratordata_.toReturn =
b75a7d8f
A
1678 setExpansionPrefix(strsrch->textIter, leftoverces);
1679 return result;
1680 }
729e4ab9
A
1681
1682 return ucol_getOffset(coleiter);
b75a7d8f
A
1683}
1684
1685/**
1686* Trying out the substring and sees if it can be a canonical match.
1687* This will try normalizing the end accents and arranging them into canonical
1688* equivalents and check their corresponding ces with the pattern ce.
729e4ab9
A
1689* Suffix accents in the text will be grouped according to their combining
1690* class and the groups will be mixed and matched to try find the perfect
b75a7d8f
A
1691* match with the pattern.
1692* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1693* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
729e4ab9 1694* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
b75a7d8f
A
1695* "\u0301\u0325".
1696* step 2: check if any of the generated substrings matches the pattern.
729e4ab9 1697* Internal method, status assumed to be success, caller has to check status
b75a7d8f
A
1698* before calling this method.
1699* @param strsrch string search data
729e4ab9 1700* @param textoffset end offset in the collation element text that ends with
b75a7d8f
A
1701* the accents to be rearranged
1702* @param status error status if any
1703* @return TRUE if the match is valid, FALSE otherwise
1704*/
1705static
729e4ab9
A
1706UBool doNextCanonicalMatch(UStringSearch *strsrch,
1707 int32_t textoffset,
b75a7d8f
A
1708 UErrorCode *status)
1709{
1710 const UChar *text = strsrch->search->text;
1711 int32_t temp = textoffset;
4388f060 1712 U16_BACK_1(text, 0, temp);
b75a7d8f
A
1713 if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
1714 UCollationElements *coleiter = strsrch->textIter;
1715 int32_t offset = getColElemIterOffset(coleiter, FALSE);
1716 if (strsrch->pattern.hasPrefixAccents) {
729e4ab9 1717 offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
b75a7d8f
A
1718 status);
1719 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
1720 setColEIterOffset(coleiter, offset);
1721 return TRUE;
1722 }
1723 }
1724 return FALSE;
1725 }
1726
1727 if (!strsrch->pattern.hasSuffixAccents) {
1728 return FALSE;
1729 }
1730
1731 UChar accents[INITIAL_ARRAY_SIZE_];
1732 // offset to the last base character in substring to search
1733 int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
1734 // normalizing the offensive string
729e4ab9
A
1735 unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
1736 0, accents, INITIAL_ARRAY_SIZE_, status);
b75a7d8f 1737 // status checked in loop below
729e4ab9 1738
b75a7d8f
A
1739 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1740 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
1741
374ca955
A
1742 // 2 power n - 1 plus the full set of accents
1743 int32_t count = (2 << (size - 1)) - 1;
b75a7d8f
A
1744 while (U_SUCCESS(*status) && count > 0) {
1745 UChar *rearrange = strsrch->canonicalSuffixAccents;
1746 // copy the base characters
1747 for (int k = 0; k < accentsindex[0]; k ++) {
1748 *rearrange ++ = accents[k];
1749 }
1750 // forming all possible canonical rearrangement by dropping
1751 // sets of accents
1752 for (int i = 0; i <= size - 1; i ++) {
1753 int32_t mask = 1 << (size - i - 1);
1754 if (count & mask) {
1755 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1756 *rearrange ++ = accents[j];
1757 }
1758 }
1759 }
1760 *rearrange = 0;
729e4ab9 1761 int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
b75a7d8f
A
1762 status);
1763 if (offset != USEARCH_DONE) {
1764 return TRUE; // match found
1765 }
1766 count --;
1767 }
1768 return FALSE;
1769}
1770
1771/**
729e4ab9 1772* Gets the previous base character offset depending on the string search
b75a7d8f
A
1773* pattern data
1774* @param strsrch string search data
1775* @param textoffset current offset, current character
1776* @return the offset of the next character after this base character or itself
1777* if it is a composed character with accents
1778*/
1779static
729e4ab9 1780inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
b75a7d8f
A
1781 int32_t textoffset)
1782{
1783 if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
1784 const UChar *text = strsrch->search->text;
1785 int32_t offset = textoffset;
729e4ab9 1786 if (getFCD(text, &offset, strsrch->search->textLength) >>
b75a7d8f
A
1787 SECOND_LAST_BYTE_SHIFT_) {
1788 return getPreviousBaseOffset(text, textoffset);
1789 }
1790 }
1791 return textoffset;
1792}
1793
1794/**
729e4ab9 1795* Checks match for contraction.
b75a7d8f
A
1796* If the match ends with a partial contraction we fail.
1797* If the match starts too far off (because of backwards iteration) we try to
1798* chip off the extra characters
729e4ab9 1799* Internal method, status assumed to be success, caller has to check status
b75a7d8f
A
1800* before calling this method.
1801* @param strsrch string search data
1802* @param start offset of potential match, to be modified if necessary
1803* @param end offset of potential match, to be modified if necessary
1804* @param status output error status if any
1805* @return TRUE if match passes the contraction test, FALSE otherwise
1806*/
1807static
729e4ab9
A
1808UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
1809 int32_t *start,
1810 int32_t *end,
1811 UErrorCode *status)
b75a7d8f
A
1812{
1813 UCollationElements *coleiter = strsrch->textIter;
1814 int32_t textlength = strsrch->search->textLength;
1815 int32_t temp = *start;
1816 const UCollator *collator = strsrch->collator;
1817 const UChar *text = strsrch->search->text;
729e4ab9 1818 // This part checks if either ends of the match contains potential
b75a7d8f 1819 // contraction. If so we'll have to iterate through them
729e4ab9
A
1820 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1821 (*start + 1 < textlength
b75a7d8f
A
1822 && ucol_unsafeCP(text[*start + 1], collator))) {
1823 int32_t expansion = getExpansionPrefix(coleiter);
1824 UBool expandflag = expansion > 0;
1825 setColEIterOffset(coleiter, *start);
1826 while (expansion > 0) {
1827 // getting rid of the redundant ce, caused by setOffset.
729e4ab9
A
1828 // since backward contraction/expansion may have extra ces if we
1829 // are in the normalization buffer, hasAccentsBeforeMatch would
b75a7d8f
A
1830 // have taken care of it.
1831 // E.g. the character \u01FA will have an expansion of 3, but if
1832 // we are only looking for acute and ring \u030A and \u0301, we'll
1833 // have to skip the first ce in the expansion buffer.
1834 ucol_next(coleiter, status);
374ca955
A
1835 if (U_FAILURE(*status)) {
1836 return FALSE;
1837 }
b75a7d8f
A
1838 if (ucol_getOffset(coleiter) != temp) {
1839 *start = temp;
1840 temp = ucol_getOffset(coleiter);
1841 }
1842 expansion --;
1843 }
1844
374ca955 1845 int32_t *patternce = strsrch->pattern.CE;
b75a7d8f
A
1846 int32_t patterncelength = strsrch->pattern.CELength;
1847 int32_t count = 0;
1848 int32_t textlength = strsrch->search->textLength;
1849 while (count < patterncelength) {
374ca955 1850 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
b75a7d8f
A
1851 // status checked below, note that if status is a failure
1852 // ucol_next returns UCOL_NULLORDER
1853 if (ce == UCOL_IGNORABLE) {
1854 continue;
1855 }
1856 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1857 *start = temp;
1858 temp = ucol_getOffset(coleiter);
1859 }
1860
1861 if (count == 0 && ce != patternce[0]) {
729e4ab9 1862 // accents may have extra starting ces, this occurs when a
b75a7d8f
A
1863 // pure accent pattern is matched without rearrangement
1864 // text \u0325\u0300 and looking for \u0300
729e4ab9 1865 int32_t expected = patternce[0];
b75a7d8f
A
1866 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
1867 ce = getCE(strsrch, ucol_next(coleiter, status));
729e4ab9 1868 while (U_SUCCESS(*status) && ce != expected &&
b75a7d8f
A
1869 ce != UCOL_NULLORDER &&
1870 ucol_getOffset(coleiter) <= *end) {
1871 ce = getCE(strsrch, ucol_next(coleiter, status));
1872 }
1873 }
1874 }
1875 if (U_FAILURE(*status) || ce != patternce[count]) {
1876 (*end) ++;
729e4ab9 1877 *end = getNextUStringSearchBaseOffset(strsrch, *end);
b75a7d8f
A
1878 return FALSE;
1879 }
1880 count ++;
1881 }
729e4ab9 1882 }
b75a7d8f
A
1883 return TRUE;
1884}
1885
1886/**
1887* Checks and sets the match information if found.
729e4ab9 1888* Checks
b75a7d8f
A
1889* <ul>
1890* <li> the potential match does not repeat the previous match
1891* <li> boundaries are correct
1892* <li> potential match does not end in the middle of a contraction
1893* <li> identical matches
1894* <\ul>
1895* Otherwise the offset will be shifted to the next character.
729e4ab9 1896* Internal method, status assumed to be success, caller has to check the
b75a7d8f
A
1897* status before calling this method.
1898* @param strsrch string search data
1899* @param textoffset offset in the collation element text. the returned value
729e4ab9 1900* will be the truncated end offset of the match or the new start
b75a7d8f
A
1901* search offset.
1902* @param status output error status if any
1903* @return TRUE if the match is valid, FALSE otherwise
1904*/
1905static
729e4ab9
A
1906inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
1907 int32_t *textoffset,
b75a7d8f
A
1908 UErrorCode *status)
1909{
1910 // to ensure that the start and ends are not composite characters
1911 UCollationElements *coleiter = strsrch->textIter;
1912 // if we have a canonical accent match
729e4ab9
A
1913 if ((strsrch->pattern.hasSuffixAccents &&
1914 strsrch->canonicalSuffixAccents[0]) ||
1915 (strsrch->pattern.hasPrefixAccents &&
b75a7d8f
A
1916 strsrch->canonicalPrefixAccents[0])) {
1917 strsrch->search->matchedIndex = getPreviousUStringSearchBaseOffset(
1918 strsrch,
1919 ucol_getOffset(coleiter));
729e4ab9 1920 strsrch->search->matchedLength = *textoffset -
b75a7d8f
A
1921 strsrch->search->matchedIndex;
1922 return TRUE;
1923 }
1924
1925 int32_t start = getColElemIterOffset(coleiter, FALSE);
729e4ab9 1926 if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
b75a7d8f
A
1927 status) || U_FAILURE(*status)) {
1928 return FALSE;
1929 }
729e4ab9 1930
b75a7d8f
A
1931 start = getPreviousUStringSearchBaseOffset(strsrch, start);
1932 // this totally matches, however we need to check if it is repeating
729e4ab9
A
1933 if (checkRepeatedMatch(strsrch, start, *textoffset) ||
1934 !isBreakUnit(strsrch, start, *textoffset) ||
b75a7d8f
A
1935 !checkIdentical(strsrch, start, *textoffset)) {
1936 (*textoffset) ++;
729e4ab9 1937 *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
b75a7d8f
A
1938 strsrch->search->textLength);
1939 return FALSE;
1940 }
729e4ab9 1941
b75a7d8f
A
1942 strsrch->search->matchedIndex = start;
1943 strsrch->search->matchedLength = *textoffset - start;
1944 return TRUE;
1945}
1946
1947/**
1948* Shifting the collation element iterator position forward to prepare for
1949* a preceding match. If the first character is a unsafe character, we'll only
1950* shift by 1 to capture contractions, normalization etc.
729e4ab9 1951* Internal method, status assumed to be success, caller has to check status
b75a7d8f
A
1952* before calling this method.
1953* @param text strsrch string search data
1954* @param textoffset start text position to do search
1955* @param ce the text ce which failed the match.
1956* @param patternceindex index of the ce within the pattern ce buffer which
1957* failed the match
1958* @return final offset
1959*/
1960static
1961inline int32_t reverseShift(UStringSearch *strsrch,
1962 int32_t textoffset,
374ca955 1963 int32_t ce,
b75a7d8f 1964 int32_t patternceindex)
729e4ab9 1965{
b75a7d8f
A
1966 if (strsrch->search->isOverlap) {
1967 if (textoffset != strsrch->search->textLength) {
1968 textoffset --;
1969 }
1970 else {
1971 textoffset -= strsrch->pattern.defaultShiftSize;
1972 }
1973 }
1974 else {
1975 if (ce != UCOL_NULLORDER) {
1976 int32_t shift = strsrch->pattern.backShift[hash(ce)];
729e4ab9
A
1977
1978 // this is to adjust for characters in the middle of the substring
b75a7d8f
A
1979 // for matching that failed.
1980 int32_t adjust = patternceindex;
1981 if (adjust > 1 && shift > adjust) {
1982 shift -= adjust - 1;
1983 }
1984 textoffset -= shift;
1985 }
1986 else {
1987 textoffset -= strsrch->pattern.defaultShiftSize;
1988 }
729e4ab9 1989 }
b75a7d8f
A
1990 textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
1991 return textoffset;
1992}
1993
1994/**
729e4ab9 1995* Checks match for contraction.
b75a7d8f 1996* If the match starts with a partial contraction we fail.
729e4ab9 1997* Internal method, status assumed to be success, caller has to check status
b75a7d8f
A
1998* before calling this method.
1999* @param strsrch string search data
2000* @param start offset of potential match, to be modified if necessary
2001* @param end offset of potential match, to be modified if necessary
2002* @param status output error status if any
2003* @return TRUE if match passes the contraction test, FALSE otherwise
2004*/
2005static
729e4ab9
A
2006UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
2007 int32_t *start,
2008 int32_t *end, UErrorCode *status)
b75a7d8f
A
2009{
2010 UCollationElements *coleiter = strsrch->textIter;
2011 int32_t textlength = strsrch->search->textLength;
2012 int32_t temp = *end;
2013 const UCollator *collator = strsrch->collator;
2014 const UChar *text = strsrch->search->text;
729e4ab9 2015 // This part checks if either if the start of the match contains potential
b75a7d8f 2016 // contraction. If so we'll have to iterate through them
729e4ab9 2017 // Since we used ucol_next while previously looking for the potential
374ca955
A
2018 // match, this guarantees that our end will not be a partial contraction,
2019 // or a partial supplementary character.
b75a7d8f
A
2020 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2021 int32_t expansion = getExpansionSuffix(coleiter);
2022 UBool expandflag = expansion > 0;
2023 setColEIterOffset(coleiter, *end);
2024 while (U_SUCCESS(*status) && expansion > 0) {
2025 // getting rid of the redundant ce
2026 // since forward contraction/expansion may have extra ces
2027 // if we are in the normalization buffer, hasAccentsBeforeMatch
2028 // would have taken care of it.
2029 // E.g. the character \u01FA will have an expansion of 3, but if
729e4ab9 2030 // we are only looking for A ring A\u030A, we'll have to skip the
b75a7d8f
A
2031 // last ce in the expansion buffer
2032 ucol_previous(coleiter, status);
374ca955
A
2033 if (U_FAILURE(*status)) {
2034 return FALSE;
2035 }
b75a7d8f
A
2036 if (ucol_getOffset(coleiter) != temp) {
2037 *end = temp;
2038 temp = ucol_getOffset(coleiter);
2039 }
2040 expansion --;
2041 }
2042
374ca955 2043 int32_t *patternce = strsrch->pattern.CE;
b75a7d8f
A
2044 int32_t patterncelength = strsrch->pattern.CELength;
2045 int32_t count = patterncelength;
2046 while (count > 0) {
374ca955 2047 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
b75a7d8f
A
2048 // status checked below, note that if status is a failure
2049 // ucol_previous returns UCOL_NULLORDER
2050 if (ce == UCOL_IGNORABLE) {
2051 continue;
2052 }
729e4ab9 2053 if (expandflag && count == 0 &&
b75a7d8f
A
2054 getColElemIterOffset(coleiter, FALSE) != temp) {
2055 *end = temp;
2056 temp = ucol_getOffset(coleiter);
2057 }
2058 if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2059 (*start) --;
2060 *start = getPreviousBaseOffset(text, *start);
2061 return FALSE;
2062 }
2063 count --;
2064 }
729e4ab9 2065 }
b75a7d8f
A
2066 return TRUE;
2067}
2068
2069/**
2070* Checks and sets the match information if found.
729e4ab9 2071* Checks
b75a7d8f
A
2072* <ul>
2073* <li> the current match does not repeat the last match
2074* <li> boundaries are correct
2075* <li> exact matches has no extra accents
2076* <li> identical matches
2077* <\ul>
2078* Otherwise the offset will be shifted to the preceding character.
2079* Internal method, status assumed to be success, caller has to check status
2080* before calling this method.
2081* @param strsrch string search data
729e4ab9 2082* @param collator
b75a7d8f
A
2083* @param coleiter collation element iterator
2084* @param text string
2085* @param textoffset offset in the collation element text. the returned value
729e4ab9 2086* will be the truncated start offset of the match or the new start
b75a7d8f
A
2087* search offset.
2088* @param status output error status if any
2089* @return TRUE if the match is valid, FALSE otherwise
2090*/
2091static
729e4ab9
A
2092inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
2093 int32_t *textoffset,
b75a7d8f
A
2094 UErrorCode *status)
2095{
2096 // to ensure that the start and ends are not composite characters
729e4ab9 2097 int32_t end = ucol_getOffset(strsrch->textIter);
b75a7d8f
A
2098 if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
2099 || U_FAILURE(*status)) {
2100 return FALSE;
2101 }
729e4ab9 2102
b75a7d8f
A
2103 // this totally matches, however we need to check if it is repeating
2104 // the old match
729e4ab9 2105 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
b75a7d8f
A
2106 !isBreakUnit(strsrch, *textoffset, end) ||
2107 hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
729e4ab9 2108 !checkIdentical(strsrch, *textoffset, end) ||
b75a7d8f
A
2109 hasAccentsAfterMatch(strsrch, *textoffset, end)) {
2110 (*textoffset) --;
729e4ab9 2111 *textoffset = getPreviousBaseOffset(strsrch->search->text,
b75a7d8f
A
2112 *textoffset);
2113 return FALSE;
2114 }
729e4ab9 2115
46f4442e
A
2116 //Add breakiterator boundary check for primary strength search.
2117 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
4388f060 2118 checkBreakBoundary(strsrch, textoffset, &end);
46f4442e 2119 }
729e4ab9 2120
b75a7d8f
A
2121 strsrch->search->matchedIndex = *textoffset;
2122 strsrch->search->matchedLength = end - *textoffset;
2123 return TRUE;
2124}
2125
2126/**
2127* Rearranges the end accents to try matching.
729e4ab9
A
2128* Suffix accents in the text will be grouped according to their combining
2129* class and the groups will be mixed and matched to try find the perfect
b75a7d8f
A
2130* match with the pattern.
2131* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2132* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
729e4ab9 2133* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
b75a7d8f
A
2134* "\u0301\u0325".
2135* step 2: check if any of the generated substrings matches the pattern.
729e4ab9 2136* Internal method, status assumed to be success, user has to check status
b75a7d8f
A
2137* before calling this method.
2138* @param strsrch string search match
2139* @param start offset of the first base character
2140* @param end start of the last accent set
2141* @param status only error status if any
2142* @return USEARCH_DONE if a match is not found, otherwise return the ending
2143* offset of the match. Note this start includes all following accents.
2144*/
2145static
729e4ab9 2146int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
b75a7d8f 2147 int32_t start,
729e4ab9 2148 int32_t end,
b75a7d8f
A
2149 UErrorCode *status)
2150{
2151 const UChar *text = strsrch->search->text;
2152 int32_t tempend = end;
2153
4388f060 2154 U16_BACK_1(text, 0, tempend);
729e4ab9 2155 if (!(getFCD(text, &tempend, strsrch->search->textLength) &
b75a7d8f
A
2156 LAST_BYTE_MASK_)) {
2157 // die... failed at a base character
2158 return USEARCH_DONE;
2159 }
2160 end = getNextBaseOffset(text, end, strsrch->search->textLength);
2161
2162 if (U_SUCCESS(*status)) {
2163 UChar accents[INITIAL_ARRAY_SIZE_];
2164 int32_t offset = getPreviousBaseOffset(text, end);
2165 // normalizing the offensive string
729e4ab9
A
2166 unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
2167 INITIAL_ARRAY_SIZE_, status);
2168
2169 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2170 int32_t accentsize = getUnblockedAccentIndex(accents,
b75a7d8f 2171 accentsindex);
729e4ab9 2172 int32_t count = (2 << (accentsize - 1)) - 1;
b75a7d8f
A
2173 UChar buffer[INITIAL_ARRAY_SIZE_];
2174 UCollationElements *coleiter = strsrch->utilIter;
2175 while (U_SUCCESS(*status) && count > 0) {
2176 UChar *rearrange = strsrch->canonicalSuffixAccents;
2177 // copy the base characters
2178 for (int k = 0; k < accentsindex[0]; k ++) {
2179 *rearrange ++ = accents[k];
2180 }
2181 // forming all possible canonical rearrangement by dropping
2182 // sets of accents
2183 for (int i = 0; i <= accentsize - 1; i ++) {
2184 int32_t mask = 1 << (accentsize - i - 1);
2185 if (count & mask) {
2186 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2187 *rearrange ++ = accents[j];
2188 }
2189 }
2190 }
2191 *rearrange = 0;
2192 int32_t matchsize = INITIAL_ARRAY_SIZE_;
2193 UChar *match = addToUCharArray(buffer, &matchsize,
2194 strsrch->canonicalPrefixAccents,
2195 strsrch->search->text + start,
2196 offset - start,
2197 strsrch->canonicalSuffixAccents,
2198 status);
729e4ab9 2199
b75a7d8f
A
2200 // run the collator iterator through this match
2201 // if status is a failure ucol_setText does nothing
2202 ucol_setText(coleiter, match, matchsize, status);
2203 if (U_SUCCESS(*status)) {
2204 if (checkCollationMatch(strsrch, coleiter)) {
2205 if (match != buffer) {
2206 uprv_free(match);
2207 }
2208 return end;
2209 }
2210 }
2211 count --;
2212 }
2213 }
2214 return USEARCH_DONE;
2215}
2216
2217/**
2218* Take the rearranged start accents and tries matching. If match failed at
2219* a seperate following set of accents (seperated from the rearranged on by
729e4ab9 2220* at least a base character) then we rearrange the preceding accents and
b75a7d8f 2221* tries matching again.
729e4ab9 2222* We allow skipping of the ends of the accent set if the ces do not match.
b75a7d8f 2223* However if the failure is found before the accent set, it fails.
729e4ab9 2224* Internal method, status assumed to be success, caller has to check status
b75a7d8f
A
2225* before calling this method.
2226* @param strsrch string search data
2227* @param textoffset of the ends of the rearranged accent
2228* @param status output error status if any
2229* @return USEARCH_DONE if a match is not found, otherwise return the ending
2230* offset of the match. Note this start includes all following accents.
2231*/
2232static
729e4ab9 2233int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
b75a7d8f
A
2234 int32_t textoffset,
2235 UErrorCode *status)
2236{
2237 const UChar *text = strsrch->search->text;
2238 const UCollator *collator = strsrch->collator;
2239 int32_t safelength = 0;
2240 UChar *safetext;
2241 int32_t safetextlength;
2242 UChar safebuffer[INITIAL_ARRAY_SIZE_];
2243 int32_t safeoffset = textoffset;
2244
729e4ab9 2245 if (textoffset &&
b75a7d8f
A
2246 ucol_unsafeCP(strsrch->canonicalPrefixAccents[
2247 u_strlen(strsrch->canonicalPrefixAccents) - 1
2248 ], collator)) {
729e4ab9 2249 safeoffset = getNextSafeOffset(collator, text, textoffset,
b75a7d8f
A
2250 strsrch->search->textLength);
2251 safelength = safeoffset - textoffset;
2252 safetextlength = INITIAL_ARRAY_SIZE_;
729e4ab9
A
2253 safetext = addToUCharArray(safebuffer, &safetextlength,
2254 strsrch->canonicalPrefixAccents,
2255 text + textoffset, safelength,
b75a7d8f
A
2256 NULL, status);
2257 }
2258 else {
2259 safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
2260 safetext = strsrch->canonicalPrefixAccents;
2261 }
2262
2263 UCollationElements *coleiter = strsrch->utilIter;
2264 // if status is a failure, ucol_setText does nothing
2265 ucol_setText(coleiter, safetext, safetextlength, status);
2266 // status checked in loop below
729e4ab9 2267
374ca955 2268 int32_t *ce = strsrch->pattern.CE;
b75a7d8f
A
2269 int32_t celength = strsrch->pattern.CELength;
2270 int ceindex = 0;
2271 UBool isSafe = TRUE; // safe zone indication flag for position
2272 int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
729e4ab9 2273
b75a7d8f 2274 while (ceindex < celength) {
374ca955 2275 int32_t textce = ucol_next(coleiter, status);
b75a7d8f
A
2276 if (U_FAILURE(*status)) {
2277 if (isSafe) {
2278 cleanUpSafeText(strsrch, safetext, safebuffer);
2279 }
2280 return USEARCH_DONE;
2281 }
2282 if (textce == UCOL_NULLORDER) {
2283 // check if we have passed the safe buffer
2284 if (coleiter == strsrch->textIter) {
2285 cleanUpSafeText(strsrch, safetext, safebuffer);
2286 return USEARCH_DONE;
2287 }
2288 cleanUpSafeText(strsrch, safetext, safebuffer);
2289 safetext = safebuffer;
2290 coleiter = strsrch->textIter;
2291 setColEIterOffset(coleiter, safeoffset);
2292 // status checked at the start of the loop
2293 isSafe = FALSE;
2294 continue;
2295 }
2296 textce = getCE(strsrch, textce);
2297 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
2298 // do the beginning stuff
2299 int32_t failedoffset = ucol_getOffset(coleiter);
2300 if (isSafe && failedoffset <= prefixlength) {
2301 // alas... no hope. failed at rearranged accent set
2302 cleanUpSafeText(strsrch, safetext, safebuffer);
2303 return USEARCH_DONE;
2304 }
2305 else {
2306 if (isSafe) {
2307 failedoffset = safeoffset - failedoffset;
2308 cleanUpSafeText(strsrch, safetext, safebuffer);
2309 }
729e4ab9 2310
b75a7d8f 2311 // try rearranging the end accents
729e4ab9 2312 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
b75a7d8f
A
2313 textoffset, failedoffset, status);
2314 if (result != USEARCH_DONE) {
2315 // if status is a failure, ucol_setOffset does nothing
2316 setColEIterOffset(strsrch->textIter, result);
2317 }
2318 if (U_FAILURE(*status)) {
2319 return USEARCH_DONE;
2320 }
2321 return result;
2322 }
2323 }
2324 if (textce == ce[ceindex]) {
2325 ceindex ++;
2326 }
2327 }
2328 // set offset here
2329 if (isSafe) {
2330 int32_t result = ucol_getOffset(coleiter);
2331 // sets the text iterator here with the correct expansion and offset
2332 int32_t leftoverces = getExpansionSuffix(coleiter);
2333 cleanUpSafeText(strsrch, safetext, safebuffer);
729e4ab9 2334 if (result <= prefixlength) {
b75a7d8f
A
2335 result = textoffset;
2336 }
2337 else {
2338 result = textoffset + (safeoffset - result);
2339 }
2340 setColEIterOffset(strsrch->textIter, result);
2341 setExpansionSuffix(strsrch->textIter, leftoverces);
2342 return result;
2343 }
729e4ab9
A
2344
2345 return ucol_getOffset(coleiter);
b75a7d8f
A
2346}
2347
2348/**
2349* Trying out the substring and sees if it can be a canonical match.
729e4ab9 2350* This will try normalizing the starting accents and arranging them into
b75a7d8f 2351* canonical equivalents and check their corresponding ces with the pattern ce.
729e4ab9
A
2352* Prefix accents in the text will be grouped according to their combining
2353* class and the groups will be mixed and matched to try find the perfect
b75a7d8f
A
2354* match with the pattern.
2355* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2356* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
729e4ab9 2357* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
b75a7d8f
A
2358* "\u0301\u0325".
2359* step 2: check if any of the generated substrings matches the pattern.
2360* Internal method, status assumed to be success, caller has to check status
2361* before calling this method.
2362* @param strsrch string search data
729e4ab9 2363* @param textoffset start offset in the collation element text that starts
b75a7d8f
A
2364* with the accents to be rearranged
2365* @param status output error status if any
2366* @return TRUE if the match is valid, FALSE otherwise
2367*/
2368static
729e4ab9
A
2369UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
2370 int32_t textoffset,
b75a7d8f
A
2371 UErrorCode *status)
2372{
2373 const UChar *text = strsrch->search->text;
2374 int32_t temp = textoffset;
2375 int32_t textlength = strsrch->search->textLength;
2376 if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
2377 UCollationElements *coleiter = strsrch->textIter;
2378 int32_t offset = ucol_getOffset(coleiter);
2379 if (strsrch->pattern.hasSuffixAccents) {
729e4ab9 2380 offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
b75a7d8f
A
2381 offset, status);
2382 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
2383 setColEIterOffset(coleiter, offset);
2384 return TRUE;
2385 }
2386 }
2387 return FALSE;
2388 }
2389
2390 if (!strsrch->pattern.hasPrefixAccents) {
2391 return FALSE;
2392 }
2393
2394 UChar accents[INITIAL_ARRAY_SIZE_];
2395 // offset to the last base character in substring to search
2396 int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
2397 // normalizing the offensive string
729e4ab9
A
2398 unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
2399 0, accents, INITIAL_ARRAY_SIZE_, status);
b75a7d8f 2400 // status checked in loop
729e4ab9 2401
b75a7d8f
A
2402 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2403 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
2404
374ca955 2405 // 2 power n - 1 plus the full set of accents
729e4ab9 2406 int32_t count = (2 << (size - 1)) - 1;
b75a7d8f
A
2407 while (U_SUCCESS(*status) && count > 0) {
2408 UChar *rearrange = strsrch->canonicalPrefixAccents;
2409 // copy the base characters
2410 for (int k = 0; k < accentsindex[0]; k ++) {
2411 *rearrange ++ = accents[k];
2412 }
2413 // forming all possible canonical rearrangement by dropping
2414 // sets of accents
2415 for (int i = 0; i <= size - 1; i ++) {
2416 int32_t mask = 1 << (size - i - 1);
2417 if (count & mask) {
2418 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2419 *rearrange ++ = accents[j];
2420 }
2421 }
2422 }
2423 *rearrange = 0;
729e4ab9 2424 int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
b75a7d8f
A
2425 baseoffset, status);
2426 if (offset != USEARCH_DONE) {
2427 return TRUE; // match found
2428 }
2429 count --;
2430 }
2431 return FALSE;
2432}
2433
2434/**
729e4ab9 2435* Checks match for contraction.
b75a7d8f
A
2436* If the match starts with a partial contraction we fail.
2437* Internal method, status assumed to be success, caller has to check status
2438* before calling this method.
2439* @param strsrch string search data
2440* @param start offset of potential match, to be modified if necessary
2441* @param end offset of potential match, to be modified if necessary
2442* @param status only error status if any
2443* @return TRUE if match passes the contraction test, FALSE otherwise
2444*/
2445static
729e4ab9
A
2446UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
2447 int32_t *start,
2448 int32_t *end, UErrorCode *status)
b75a7d8f
A
2449{
2450 UCollationElements *coleiter = strsrch->textIter;
2451 int32_t textlength = strsrch->search->textLength;
2452 int32_t temp = *end;
2453 const UCollator *collator = strsrch->collator;
2454 const UChar *text = strsrch->search->text;
729e4ab9 2455 // This part checks if either if the start of the match contains potential
b75a7d8f 2456 // contraction. If so we'll have to iterate through them
729e4ab9 2457 // Since we used ucol_next while previously looking for the potential
374ca955
A
2458 // match, this guarantees that our end will not be a partial contraction,
2459 // or a partial supplementary character.
b75a7d8f
A
2460 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2461 int32_t expansion = getExpansionSuffix(coleiter);
2462 UBool expandflag = expansion > 0;
2463 setColEIterOffset(coleiter, *end);
2464 while (expansion > 0) {
2465 // getting rid of the redundant ce
2466 // since forward contraction/expansion may have extra ces
2467 // if we are in the normalization buffer, hasAccentsBeforeMatch
2468 // would have taken care of it.
2469 // E.g. the character \u01FA will have an expansion of 3, but if
729e4ab9 2470 // we are only looking for A ring A\u030A, we'll have to skip the
b75a7d8f
A
2471 // last ce in the expansion buffer
2472 ucol_previous(coleiter, status);
374ca955
A
2473 if (U_FAILURE(*status)) {
2474 return FALSE;
2475 }
b75a7d8f
A
2476 if (ucol_getOffset(coleiter) != temp) {
2477 *end = temp;
2478 temp = ucol_getOffset(coleiter);
2479 }
2480 expansion --;
2481 }
2482
374ca955 2483 int32_t *patternce = strsrch->pattern.CE;
b75a7d8f
A
2484 int32_t patterncelength = strsrch->pattern.CELength;
2485 int32_t count = patterncelength;
2486 while (count > 0) {
374ca955 2487 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
b75a7d8f
A
2488 // status checked below, note that if status is a failure
2489 // ucol_previous returns UCOL_NULLORDER
2490 if (ce == UCOL_IGNORABLE) {
2491 continue;
2492 }
729e4ab9 2493 if (expandflag && count == 0 &&
b75a7d8f
A
2494 getColElemIterOffset(coleiter, FALSE) != temp) {
2495 *end = temp;
2496 temp = ucol_getOffset(coleiter);
2497 }
729e4ab9 2498 if (count == patterncelength &&
b75a7d8f 2499 ce != patternce[patterncelength - 1]) {
729e4ab9 2500 // accents may have extra starting ces, this occurs when a
b75a7d8f 2501 // pure accent pattern is matched without rearrangement
374ca955 2502 int32_t expected = patternce[patterncelength - 1];
4388f060 2503 U16_BACK_1(text, 0, *end);
b75a7d8f
A
2504 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
2505 ce = getCE(strsrch, ucol_previous(coleiter, status));
729e4ab9 2506 while (U_SUCCESS(*status) && ce != expected &&
b75a7d8f
A
2507 ce != UCOL_NULLORDER &&
2508 ucol_getOffset(coleiter) <= *start) {
2509 ce = getCE(strsrch, ucol_previous(coleiter, status));
2510 }
2511 }
2512 }
2513 if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2514 (*start) --;
2515 *start = getPreviousBaseOffset(text, *start);
2516 return FALSE;
2517 }
2518 count --;
2519 }
729e4ab9 2520 }
b75a7d8f
A
2521 return TRUE;
2522}
2523
2524/**
2525* Checks and sets the match information if found.
729e4ab9 2526* Checks
b75a7d8f
A
2527* <ul>
2528* <li> the potential match does not repeat the previous match
2529* <li> boundaries are correct
2530* <li> potential match does not end in the middle of a contraction
2531* <li> identical matches
2532* <\ul>
2533* Otherwise the offset will be shifted to the next character.
2534* Internal method, status assumed to be success, caller has to check status
2535* before calling this method.
2536* @param strsrch string search data
2537* @param textoffset offset in the collation element text. the returned value
729e4ab9 2538* will be the truncated start offset of the match or the new start
b75a7d8f
A
2539* search offset.
2540* @param status only error status if any
2541* @return TRUE if the match is valid, FALSE otherwise
2542*/
2543static
729e4ab9
A
2544inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
2545 int32_t *textoffset,
b75a7d8f
A
2546 UErrorCode *status)
2547{
2548 // to ensure that the start and ends are not composite characters
2549 UCollationElements *coleiter = strsrch->textIter;
2550 // if we have a canonical accent match
729e4ab9
A
2551 if ((strsrch->pattern.hasSuffixAccents &&
2552 strsrch->canonicalSuffixAccents[0]) ||
2553 (strsrch->pattern.hasPrefixAccents &&
b75a7d8f
A
2554 strsrch->canonicalPrefixAccents[0])) {
2555 strsrch->search->matchedIndex = *textoffset;
729e4ab9
A
2556 strsrch->search->matchedLength =
2557 getNextUStringSearchBaseOffset(strsrch,
b75a7d8f
A
2558 getColElemIterOffset(coleiter, FALSE))
2559 - *textoffset;
2560 return TRUE;
2561 }
2562
2563 int32_t end = ucol_getOffset(coleiter);
2564 if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
729e4ab9 2565 status) ||
b75a7d8f
A
2566 U_FAILURE(*status)) {
2567 return FALSE;
2568 }
2569
2570 end = getNextUStringSearchBaseOffset(strsrch, end);
2571 // this totally matches, however we need to check if it is repeating
729e4ab9
A
2572 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2573 !isBreakUnit(strsrch, *textoffset, end) ||
b75a7d8f
A
2574 !checkIdentical(strsrch, *textoffset, end)) {
2575 (*textoffset) --;
729e4ab9 2576 *textoffset = getPreviousBaseOffset(strsrch->search->text,
b75a7d8f
A
2577 *textoffset);
2578 return FALSE;
2579 }
729e4ab9 2580
b75a7d8f
A
2581 strsrch->search->matchedIndex = *textoffset;
2582 strsrch->search->matchedLength = end - *textoffset;
2583 return TRUE;
2584}
46f4442e 2585#endif // #if BOYER_MOORE
b75a7d8f
A
2586
2587// constructors and destructor -------------------------------------------
2588
729e4ab9
A
2589U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
2590 int32_t patternlength,
2591 const UChar *text,
b75a7d8f
A
2592 int32_t textlength,
2593 const char *locale,
2594 UBreakIterator *breakiter,
729e4ab9 2595 UErrorCode *status)
b75a7d8f
A
2596{
2597 if (U_FAILURE(*status)) {
2598 return NULL;
2599 }
2600#if UCONFIG_NO_BREAK_ITERATION
2601 if (breakiter != NULL) {
2602 *status = U_UNSUPPORTED_ERROR;
2603 return NULL;
2604 }
2605#endif
2606 if (locale) {
2607 // ucol_open internally checks for status
2608 UCollator *collator = ucol_open(locale, status);
2609 // pattern, text checks are done in usearch_openFromCollator
729e4ab9
A
2610 UStringSearch *result = usearch_openFromCollator(pattern,
2611 patternlength, text, textlength,
b75a7d8f
A
2612 collator, breakiter, status);
2613
2614 if (result == NULL || U_FAILURE(*status)) {
2615 if (collator) {
2616 ucol_close(collator);
2617 }
2618 return NULL;
2619 }
2620 else {
2621 result->ownCollator = TRUE;
2622 }
2623 return result;
2624 }
2625 *status = U_ILLEGAL_ARGUMENT_ERROR;
2626 return NULL;
2627}
2628
2629U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
729e4ab9 2630 const UChar *pattern,
b75a7d8f 2631 int32_t patternlength,
729e4ab9 2632 const UChar *text,
b75a7d8f
A
2633 int32_t textlength,
2634 const UCollator *collator,
2635 UBreakIterator *breakiter,
729e4ab9 2636 UErrorCode *status)
b75a7d8f
A
2637{
2638 if (U_FAILURE(*status)) {
2639 return NULL;
2640 }
2641#if UCONFIG_NO_BREAK_ITERATION
2642 if (breakiter != NULL) {
2643 *status = U_UNSUPPORTED_ERROR;
2644 return NULL;
2645 }
2646#endif
2647 if (pattern == NULL || text == NULL || collator == NULL) {
2648 *status = U_ILLEGAL_ARGUMENT_ERROR;
73c04bcf 2649 return NULL;
b75a7d8f
A
2650 }
2651
374ca955
A
2652 // string search does not really work when numeric collation is turned on
2653 if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
2654 *status = U_UNSUPPORTED_ERROR;
73c04bcf 2655 return NULL;
374ca955
A
2656 }
2657
b75a7d8f
A
2658 if (U_SUCCESS(*status)) {
2659 initializeFCD(status);
2660 if (U_FAILURE(*status)) {
2661 return NULL;
2662 }
2663
2664 UStringSearch *result;
2665 if (textlength == -1) {
2666 textlength = u_strlen(text);
2667 }
2668 if (patternlength == -1) {
2669 patternlength = u_strlen(pattern);
2670 }
2671 if (textlength <= 0 || patternlength <= 0) {
2672 *status = U_ILLEGAL_ARGUMENT_ERROR;
2673 return NULL;
2674 }
729e4ab9 2675
b75a7d8f
A
2676 result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
2677 if (result == NULL) {
2678 *status = U_MEMORY_ALLOCATION_ERROR;
2679 return NULL;
2680 }
2681
2682 result->collator = collator;
2683 result->strength = ucol_getStrength(collator);
2684 result->ceMask = getMask(result->strength);
729e4ab9
A
2685 result->toShift =
2686 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
b75a7d8f
A
2687 UCOL_SHIFTED;
2688 result->variableTop = ucol_getVariableTop(collator, status);
2689
729e4ab9
A
2690 result->nfd = Normalizer2Factory::getNFDInstance(*status);
2691
b75a7d8f
A
2692 if (U_FAILURE(*status)) {
2693 uprv_free(result);
2694 return NULL;
2695 }
2696
2697 result->search = (USearch *)uprv_malloc(sizeof(USearch));
2698 if (result->search == NULL) {
2699 *status = U_MEMORY_ALLOCATION_ERROR;
2700 uprv_free(result);
2701 return NULL;
2702 }
2703
2704 result->search->text = text;
2705 result->search->textLength = textlength;
2706
2707 result->pattern.text = pattern;
2708 result->pattern.textLength = patternlength;
2709 result->pattern.CE = NULL;
46f4442e 2710 result->pattern.PCE = NULL;
729e4ab9 2711
b75a7d8f
A
2712 result->search->breakIter = breakiter;
2713#if !UCONFIG_NO_BREAK_ITERATION
729e4ab9 2714 result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
b75a7d8f 2715 if (breakiter) {
4388f060 2716 ubrk_setText(breakiter, text, textlength, status);
b75a7d8f
A
2717 }
2718#endif
2719
2720 result->ownCollator = FALSE;
2721 result->search->matchedLength = 0;
2722 result->search->matchedIndex = USEARCH_DONE;
46f4442e 2723 result->utilIter = NULL;
729e4ab9 2724 result->textIter = ucol_openElements(collator, text,
b75a7d8f 2725 textlength, status);
57a6839d 2726 result->textProcessedIter = NULL;
b75a7d8f
A
2727 if (U_FAILURE(*status)) {
2728 usearch_close(result);
2729 return NULL;
2730 }
2731
b75a7d8f
A
2732 result->search->isOverlap = FALSE;
2733 result->search->isCanonicalMatch = FALSE;
729e4ab9 2734 result->search->elementComparisonType = 0;
b75a7d8f
A
2735 result->search->isForwardSearching = TRUE;
2736 result->search->reset = TRUE;
729e4ab9 2737
b75a7d8f
A
2738 initialize(result, status);
2739
2740 if (U_FAILURE(*status)) {
2741 usearch_close(result);
2742 return NULL;
2743 }
2744
2745 return result;
2746 }
2747 return NULL;
2748}
2749
2750U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
2751{
2752 if (strsrch) {
2753 if (strsrch->pattern.CE != strsrch->pattern.CEBuffer &&
2754 strsrch->pattern.CE) {
2755 uprv_free(strsrch->pattern.CE);
2756 }
46f4442e
A
2757
2758 if (strsrch->pattern.PCE != NULL &&
2759 strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) {
2760 uprv_free(strsrch->pattern.PCE);
2761 }
2762
57a6839d 2763 delete strsrch->textProcessedIter;
b75a7d8f
A
2764 ucol_closeElements(strsrch->textIter);
2765 ucol_closeElements(strsrch->utilIter);
46f4442e 2766
b75a7d8f
A
2767 if (strsrch->ownCollator && strsrch->collator) {
2768 ucol_close((UCollator *)strsrch->collator);
2769 }
46f4442e
A
2770
2771#if !UCONFIG_NO_BREAK_ITERATION
2772 if (strsrch->search->internalBreakIter) {
4388f060 2773 ubrk_close(strsrch->search->internalBreakIter);
46f4442e
A
2774 }
2775#endif
2776
b75a7d8f
A
2777 uprv_free(strsrch->search);
2778 uprv_free(strsrch);
2779 }
2780}
2781
57a6839d
A
2782namespace {
2783
2784UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) {
2785 if (U_FAILURE(*status)) { return FALSE; }
2786 if (strsrch->textProcessedIter == NULL) {
2787 strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter);
2788 if (strsrch->textProcessedIter == NULL) {
2789 *status = U_MEMORY_ALLOCATION_ERROR;
2790 return FALSE;
2791 }
2792 } else {
2793 strsrch->textProcessedIter->init(strsrch->textIter);
2794 }
2795 return TRUE;
2796}
2797
2798}
2799
b75a7d8f
A
2800// set and get methods --------------------------------------------------
2801
729e4ab9 2802U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
b75a7d8f
A
2803 int32_t position,
2804 UErrorCode *status)
2805{
2806 if (U_SUCCESS(*status) && strsrch) {
2807 if (isOutOfBounds(strsrch->search->textLength, position)) {
2808 *status = U_INDEX_OUTOFBOUNDS_ERROR;
2809 }
2810 else {
2811 setColEIterOffset(strsrch->textIter, position);
2812 }
2813 strsrch->search->matchedIndex = USEARCH_DONE;
2814 strsrch->search->matchedLength = 0;
729e4ab9 2815 strsrch->search->reset = FALSE;
b75a7d8f
A
2816 }
2817}
2818
2819U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
2820{
2821 if (strsrch) {
2822 int32_t result = ucol_getOffset(strsrch->textIter);
2823 if (isOutOfBounds(strsrch->search->textLength, result)) {
2824 return USEARCH_DONE;
2825 }
2826 return result;
2827 }
2828 return USEARCH_DONE;
2829}
729e4ab9
A
2830
2831U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
b75a7d8f
A
2832 USearchAttribute attribute,
2833 USearchAttributeValue value,
2834 UErrorCode *status)
2835{
2836 if (U_SUCCESS(*status) && strsrch) {
2837 switch (attribute)
2838 {
2839 case USEARCH_OVERLAP :
2840 strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
2841 break;
2842 case USEARCH_CANONICAL_MATCH :
729e4ab9 2843 strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
b75a7d8f
A
2844 FALSE);
2845 break;
729e4ab9
A
2846 case USEARCH_ELEMENT_COMPARISON :
2847 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2848 strsrch->search->elementComparisonType = (int16_t)value;
2849 } else {
2850 strsrch->search->elementComparisonType = 0;
2851 }
2852 break;
b75a7d8f
A
2853 case USEARCH_ATTRIBUTE_COUNT :
2854 default:
2855 *status = U_ILLEGAL_ARGUMENT_ERROR;
2856 }
2857 }
2858 if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
2859 *status = U_ILLEGAL_ARGUMENT_ERROR;
2860 }
2861}
729e4ab9 2862
b75a7d8f
A
2863U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
2864 const UStringSearch *strsrch,
2865 USearchAttribute attribute)
2866{
2867 if (strsrch) {
2868 switch (attribute) {
2869 case USEARCH_OVERLAP :
729e4ab9 2870 return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
b75a7d8f
A
2871 USEARCH_OFF);
2872 case USEARCH_CANONICAL_MATCH :
729e4ab9 2873 return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
b75a7d8f 2874 USEARCH_OFF);
729e4ab9
A
2875 case USEARCH_ELEMENT_COMPARISON :
2876 {
2877 int16_t value = strsrch->search->elementComparisonType;
2878 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2879 return (USearchAttributeValue)value;
2880 } else {
2881 return USEARCH_STANDARD_ELEMENT_COMPARISON;
2882 }
2883 }
b75a7d8f
A
2884 case USEARCH_ATTRIBUTE_COUNT :
2885 return USEARCH_DEFAULT;
2886 }
2887 }
2888 return USEARCH_DEFAULT;
2889}
2890
2891U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
2892 const UStringSearch *strsrch)
2893{
2894 if (strsrch == NULL) {
2895 return USEARCH_DONE;
2896 }
2897 return strsrch->search->matchedIndex;
2898}
2899
2900
729e4ab9
A
2901U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
2902 UChar *result,
2903 int32_t resultCapacity,
b75a7d8f
A
2904 UErrorCode *status)
2905{
2906 if (U_FAILURE(*status)) {
2907 return USEARCH_DONE;
2908 }
729e4ab9 2909 if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
b75a7d8f
A
2910 result == NULL)) {
2911 *status = U_ILLEGAL_ARGUMENT_ERROR;
2912 return USEARCH_DONE;
2913 }
2914
2915 int32_t copylength = strsrch->search->matchedLength;
2916 int32_t copyindex = strsrch->search->matchedIndex;
2917 if (copyindex == USEARCH_DONE) {
2918 u_terminateUChars(result, resultCapacity, 0, status);
2919 return USEARCH_DONE;
2920 }
2921
2922 if (resultCapacity < copylength) {
2923 copylength = resultCapacity;
2924 }
2925 if (copylength > 0) {
729e4ab9 2926 uprv_memcpy(result, strsrch->search->text + copyindex,
b75a7d8f
A
2927 copylength * sizeof(UChar));
2928 }
729e4ab9 2929 return u_terminateUChars(result, resultCapacity,
b75a7d8f
A
2930 strsrch->search->matchedLength, status);
2931}
729e4ab9 2932
b75a7d8f
A
2933U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
2934 const UStringSearch *strsrch)
2935{
2936 if (strsrch) {
2937 return strsrch->search->matchedLength;
2938 }
2939 return USEARCH_DONE;
2940}
2941
2942#if !UCONFIG_NO_BREAK_ITERATION
2943
729e4ab9 2944U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch,
b75a7d8f
A
2945 UBreakIterator *breakiter,
2946 UErrorCode *status)
2947{
2948 if (U_SUCCESS(*status) && strsrch) {
4388f060 2949 strsrch->search->breakIter = breakiter;
b75a7d8f 2950 if (breakiter) {
729e4ab9 2951 ubrk_setText(breakiter, strsrch->search->text,
b75a7d8f
A
2952 strsrch->search->textLength, status);
2953 }
2954 }
2955}
2956
729e4ab9 2957U_CAPI const UBreakIterator* U_EXPORT2
b75a7d8f
A
2958usearch_getBreakIterator(const UStringSearch *strsrch)
2959{
2960 if (strsrch) {
2961 return strsrch->search->breakIter;
2962 }
2963 return NULL;
2964}
729e4ab9 2965
b75a7d8f 2966#endif
729e4ab9
A
2967
2968U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch,
b75a7d8f
A
2969 const UChar *text,
2970 int32_t textlength,
2971 UErrorCode *status)
2972{
2973 if (U_SUCCESS(*status)) {
729e4ab9 2974 if (strsrch == NULL || text == NULL || textlength < -1 ||
b75a7d8f
A
2975 textlength == 0) {
2976 *status = U_ILLEGAL_ARGUMENT_ERROR;
2977 }
2978 else {
2979 if (textlength == -1) {
2980 textlength = u_strlen(text);
2981 }
2982 strsrch->search->text = text;
2983 strsrch->search->textLength = textlength;
2984 ucol_setText(strsrch->textIter, text, textlength, status);
2985 strsrch->search->matchedIndex = USEARCH_DONE;
2986 strsrch->search->matchedLength = 0;
2987 strsrch->search->reset = TRUE;
2988#if !UCONFIG_NO_BREAK_ITERATION
374ca955 2989 if (strsrch->search->breakIter != NULL) {
729e4ab9 2990 ubrk_setText(strsrch->search->breakIter, text,
374ca955
A
2991 textlength, status);
2992 }
46f4442e 2993 ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
b75a7d8f
A
2994#endif
2995 }
2996 }
2997}
2998
729e4ab9 2999U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
b75a7d8f
A
3000 int32_t *length)
3001{
3002 if (strsrch) {
3003 *length = strsrch->search->textLength;
3004 return strsrch->search->text;
3005 }
3006 return NULL;
3007}
3008
729e4ab9 3009U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch,
b75a7d8f
A
3010 const UCollator *collator,
3011 UErrorCode *status)
3012{
3013 if (U_SUCCESS(*status)) {
3014 if (collator == NULL) {
3015 *status = U_ILLEGAL_ARGUMENT_ERROR;
3016 return;
3017 }
46f4442e 3018
b75a7d8f 3019 if (strsrch) {
57a6839d
A
3020 delete strsrch->textProcessedIter;
3021 strsrch->textProcessedIter = NULL;
3022 ucol_closeElements(strsrch->textIter);
3023 ucol_closeElements(strsrch->utilIter);
3024 strsrch->textIter = strsrch->utilIter = NULL;
b75a7d8f
A
3025 if (strsrch->ownCollator && (strsrch->collator != collator)) {
3026 ucol_close((UCollator *)strsrch->collator);
3027 strsrch->ownCollator = FALSE;
3028 }
3029 strsrch->collator = collator;
3030 strsrch->strength = ucol_getStrength(collator);
3031 strsrch->ceMask = getMask(strsrch->strength);
46f4442e 3032#if !UCONFIG_NO_BREAK_ITERATION
4388f060
A
3033 ubrk_close(strsrch->search->internalBreakIter);
3034 strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
3035 strsrch->search->text, strsrch->search->textLength, status);
46f4442e 3036#endif
b75a7d8f 3037 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
729e4ab9
A
3038 strsrch->toShift =
3039 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
b75a7d8f
A
3040 UCOL_SHIFTED;
3041 // if status is a failure, ucol_getVariableTop returns 0
3042 strsrch->variableTop = ucol_getVariableTop(collator, status);
57a6839d
A
3043 strsrch->textIter = ucol_openElements(collator,
3044 strsrch->search->text,
3045 strsrch->search->textLength,
3046 status);
3047 strsrch->utilIter = ucol_openElements(
3048 collator, strsrch->pattern.text, strsrch->pattern.textLength, status);
3049 // initialize() _after_ setting the iterators for the new collator.
3050 initialize(strsrch, status);
b75a7d8f 3051 }
46f4442e
A
3052
3053 // **** are these calls needed?
3054 // **** we call uprv_init_pce in initializePatternPCETable
3055 // **** and the CEBuffer constructor...
3056#if 0
3057 uprv_init_pce(strsrch->textIter);
3058 uprv_init_pce(strsrch->utilIter);
3059#endif
b75a7d8f
A
3060 }
3061}
3062
3063U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
3064{
3065 if (strsrch) {
3066 return (UCollator *)strsrch->collator;
3067 }
3068 return NULL;
3069}
3070
729e4ab9 3071U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch,
b75a7d8f
A
3072 const UChar *pattern,
3073 int32_t patternlength,
3074 UErrorCode *status)
3075{
3076 if (U_SUCCESS(*status)) {
3077 if (strsrch == NULL || pattern == NULL) {
3078 *status = U_ILLEGAL_ARGUMENT_ERROR;
3079 }
3080 else {
3081 if (patternlength == -1) {
3082 patternlength = u_strlen(pattern);
3083 }
3084 if (patternlength == 0) {
3085 *status = U_ILLEGAL_ARGUMENT_ERROR;
3086 return;
3087 }
3088 strsrch->pattern.text = pattern;
3089 strsrch->pattern.textLength = patternlength;
3090 initialize(strsrch, status);
3091 }
3092 }
3093}
3094
729e4ab9
A
3095U_CAPI const UChar* U_EXPORT2
3096usearch_getPattern(const UStringSearch *strsrch,
b75a7d8f
A
3097 int32_t *length)
3098{
3099 if (strsrch) {
3100 *length = strsrch->pattern.textLength;
3101 return strsrch->pattern.text;
3102 }
3103 return NULL;
3104}
3105
3106// miscellanous methods --------------------------------------------------
3107
729e4ab9
A
3108U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
3109 UErrorCode *status)
b75a7d8f
A
3110{
3111 if (strsrch && U_SUCCESS(*status)) {
3112 strsrch->search->isForwardSearching = TRUE;
3113 usearch_setOffset(strsrch, 0, status);
3114 if (U_SUCCESS(*status)) {
3115 return usearch_next(strsrch, status);
3116 }
3117 }
3118 return USEARCH_DONE;
3119}
3120
729e4ab9 3121U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
b75a7d8f
A
3122 int32_t position,
3123 UErrorCode *status)
3124{
3125 if (strsrch && U_SUCCESS(*status)) {
3126 strsrch->search->isForwardSearching = TRUE;
3127 // position checked in usearch_setOffset
3128 usearch_setOffset(strsrch, position, status);
3129 if (U_SUCCESS(*status)) {
729e4ab9 3130 return usearch_next(strsrch, status);
b75a7d8f
A
3131 }
3132 }
3133 return USEARCH_DONE;
3134}
729e4ab9
A
3135
3136U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
b75a7d8f
A
3137 UErrorCode *status)
3138{
3139 if (strsrch && U_SUCCESS(*status)) {
3140 strsrch->search->isForwardSearching = FALSE;
3141 usearch_setOffset(strsrch, strsrch->search->textLength, status);
3142 if (U_SUCCESS(*status)) {
3143 return usearch_previous(strsrch, status);
3144 }
3145 }
3146 return USEARCH_DONE;
3147}
3148
729e4ab9 3149U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
b75a7d8f
A
3150 int32_t position,
3151 UErrorCode *status)
3152{
3153 if (strsrch && U_SUCCESS(*status)) {
3154 strsrch->search->isForwardSearching = FALSE;
3155 // position checked in usearch_setOffset
3156 usearch_setOffset(strsrch, position, status);
3157 if (U_SUCCESS(*status)) {
729e4ab9 3158 return usearch_previous(strsrch, status);
b75a7d8f
A
3159 }
3160 }
3161 return USEARCH_DONE;
3162}
729e4ab9 3163
b75a7d8f 3164/**
729e4ab9
A
3165* If a direction switch is required, we'll count the number of ces till the
3166* beginning of the collation element iterator and iterate forwards that
3167* number of times. This is so that we get to the correct point within the
b75a7d8f
A
3168* string to continue the search in. Imagine when we are in the middle of the
3169* normalization buffer when the change in direction is request. arrrgghh....
3170* After searching the offset within the collation element iterator will be
3171* shifted to the start of the match. If a match is not found, the offset would
729e4ab9 3172* have been set to the end of the text string in the collation element
b75a7d8f
A
3173* iterator.
3174* Okay, here's my take on normalization buffer. The only time when there can
3175* be 2 matches within the same normalization is when the pattern is consists
3176* of all accents. But since the offset returned is from the text string, we
729e4ab9 3177* should not confuse the caller by returning the second match within the
b75a7d8f
A
3178* same normalization buffer. If we do, the 2 results will have the same match
3179* offsets, and that'll be confusing. I'll return the next match that doesn't
729e4ab9 3180* fall within the same normalization buffer. Note this does not affect the
b75a7d8f
A
3181* results of matches spanning the text and the normalization buffer.
3182* The position to start searching is taken from the collation element
3183* iterator. Callers of this API would have to set the offset in the collation
3184* element iterator before using this method.
3185*/
3186U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
3187 UErrorCode *status)
729e4ab9 3188{
b75a7d8f 3189 if (U_SUCCESS(*status) && strsrch) {
374ca955
A
3190 // note offset is either equivalent to the start of the previous match
3191 // or is set by the user
3192 int32_t offset = usearch_getOffset(strsrch);
3193 USearch *search = strsrch->search;
3194 search->reset = FALSE;
3195 int32_t textlength = search->textLength;
b75a7d8f 3196 if (search->isForwardSearching) {
46f4442e 3197#if BOYER_MOORE
374ca955 3198 if (offset == textlength
729e4ab9 3199 || (!search->isOverlap &&
b75a7d8f 3200 (offset + strsrch->pattern.defaultShiftSize > textlength ||
729e4ab9 3201 (search->matchedIndex != USEARCH_DONE &&
374ca955 3202 offset + search->matchedLength >= textlength)))) {
b75a7d8f
A
3203 // not enough characters to match
3204 setMatchNotFound(strsrch);
729e4ab9 3205 return USEARCH_DONE;
b75a7d8f 3206 }
46f4442e
A
3207#else
3208 if (offset == textlength ||
3209 (! search->isOverlap &&
3210 (search->matchedIndex != USEARCH_DONE &&
3211 offset + search->matchedLength > textlength))) {
3212 // not enough characters to match
3213 setMatchNotFound(strsrch);
3214 return USEARCH_DONE;
3215 }
3216#endif
b75a7d8f
A
3217 }
3218 else {
729e4ab9
A
3219 // switching direction.
3220 // if matchedIndex == USEARCH_DONE, it means that either a
b75a7d8f 3221 // setOffset has been called or that previous ran off the text
729e4ab9 3222 // string. the iterator would have been set to offset 0 if a
b75a7d8f
A
3223 // match is not found.
3224 search->isForwardSearching = TRUE;
374ca955 3225 if (search->matchedIndex != USEARCH_DONE) {
b75a7d8f
A
3226 // there's no need to set the collation element iterator
3227 // the next call to next will set the offset.
374ca955 3228 return search->matchedIndex;
b75a7d8f
A
3229 }
3230 }
3231
3232 if (U_SUCCESS(*status)) {
3233 if (strsrch->pattern.CELength == 0) {
374ca955 3234 if (search->matchedIndex == USEARCH_DONE) {
b75a7d8f
A
3235 search->matchedIndex = offset;
3236 }
3237 else { // moves by codepoints
4388f060 3238 U16_FWD_1(search->text, search->matchedIndex, textlength);
b75a7d8f 3239 }
729e4ab9 3240
b75a7d8f
A
3241 search->matchedLength = 0;
3242 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3243 // status checked below
3244 if (search->matchedIndex == textlength) {
3245 search->matchedIndex = USEARCH_DONE;
3246 }
3247 }
3248 else {
374ca955
A
3249 if (search->matchedLength > 0) {
3250 // if matchlength is 0 we are at the start of the iteration
3251 if (search->isOverlap) {
3252 ucol_setOffset(strsrch->textIter, offset + 1, status);
3253 }
3254 else {
729e4ab9 3255 ucol_setOffset(strsrch->textIter,
374ca955
A
3256 offset + search->matchedLength, status);
3257 }
3258 }
3259 else {
3260 // for boundary check purposes. this will ensure that the
3261 // next match will not preceed the current offset
3262 // note search->matchedIndex will always be set to something
3263 // in the code
3264 search->matchedIndex = offset - 1;
3265 }
3266
3267 if (search->isCanonicalMatch) {
3268 // can't use exact here since extra accents are allowed.
3269 usearch_handleNextCanonical(strsrch, status);
3270 }
3271 else {
3272 usearch_handleNextExact(strsrch, status);
3273 }
3274 }
3275
b75a7d8f
A
3276 if (U_FAILURE(*status)) {
3277 return USEARCH_DONE;
3278 }
374ca955 3279
46f4442e
A
3280#if !BOYER_MOORE
3281 if (search->matchedIndex == USEARCH_DONE) {
3282 ucol_setOffset(strsrch->textIter, search->textLength, status);
3283 } else {
3284 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
3285 }
3286#endif
3287
b75a7d8f
A
3288 return search->matchedIndex;
3289 }
3290 }
3291 return USEARCH_DONE;
3292}
3293
3294U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
3295 UErrorCode *status)
3296{
3297 if (U_SUCCESS(*status) && strsrch) {
3298 int32_t offset;
3299 USearch *search = strsrch->search;
3300 if (search->reset) {
3301 offset = search->textLength;
3302 search->isForwardSearching = FALSE;
3303 search->reset = FALSE;
3304 setColEIterOffset(strsrch->textIter, offset);
3305 }
3306 else {
3307 offset = usearch_getOffset(strsrch);
3308 }
729e4ab9 3309
b75a7d8f
A
3310 int32_t matchedindex = search->matchedIndex;
3311 if (search->isForwardSearching == TRUE) {
729e4ab9
A
3312 // switching direction.
3313 // if matchedIndex == USEARCH_DONE, it means that either a
b75a7d8f 3314 // setOffset has been called or that next ran off the text
729e4ab9 3315 // string. the iterator would have been set to offset textLength if
b75a7d8f
A
3316 // a match is not found.
3317 search->isForwardSearching = FALSE;
3318 if (matchedindex != USEARCH_DONE) {
3319 return matchedindex;
3320 }
3321 }
3322 else {
46f4442e 3323#if BOYER_MOORE
b75a7d8f 3324 if (offset == 0 || matchedindex == 0 ||
729e4ab9 3325 (!search->isOverlap &&
b75a7d8f 3326 (offset < strsrch->pattern.defaultShiftSize ||
729e4ab9 3327 (matchedindex != USEARCH_DONE &&
b75a7d8f
A
3328 matchedindex < strsrch->pattern.defaultShiftSize)))) {
3329 // not enough characters to match
3330 setMatchNotFound(strsrch);
729e4ab9 3331 return USEARCH_DONE;
b75a7d8f 3332 }
46f4442e
A
3333#else
3334 // Could check pattern length, but the
3335 // linear search will do the right thing
3336 if (offset == 0 || matchedindex == 0) {
3337 setMatchNotFound(strsrch);
3338 return USEARCH_DONE;
3339 }
3340#endif
b75a7d8f
A
3341 }
3342
3343 if (U_SUCCESS(*status)) {
3344 if (strsrch->pattern.CELength == 0) {
729e4ab9 3345 search->matchedIndex =
b75a7d8f
A
3346 (matchedindex == USEARCH_DONE ? offset : matchedindex);
3347 if (search->matchedIndex == 0) {
3348 setMatchNotFound(strsrch);
3349 // status checked below
3350 }
3351 else { // move by codepoints
4388f060 3352 U16_BACK_1(search->text, 0, search->matchedIndex);
b75a7d8f
A
3353 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3354 // status checked below
3355 search->matchedLength = 0;
3356 }
3357 }
3358 else {
3359 if (strsrch->search->isCanonicalMatch) {
3360 // can't use exact here since extra accents are allowed.
3361 usearch_handlePreviousCanonical(strsrch, status);
3362 // status checked below
3363 }
3364 else {
3365 usearch_handlePreviousExact(strsrch, status);
3366 // status checked below
3367 }
3368 }
3369
3370 if (U_FAILURE(*status)) {
3371 return USEARCH_DONE;
3372 }
729e4ab9 3373
b75a7d8f
A
3374 return search->matchedIndex;
3375 }
3376 }
3377 return USEARCH_DONE;
3378}
3379
3380
729e4ab9 3381
b75a7d8f
A
3382U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
3383{
729e4ab9
A
3384 /*
3385 reset is setting the attributes that are already in
b75a7d8f
A
3386 string search, hence all attributes in the collator should
3387 be retrieved without any problems
3388 */
3389 if (strsrch) {
3390 UErrorCode status = U_ZERO_ERROR;
3391 UBool sameCollAttribute = TRUE;
3392 uint32_t ceMask;
3393 UBool shift;
3394 uint32_t varTop;
3395
729e4ab9
A
3396 // **** hack to deal w/ how processed CEs encode quaternary ****
3397 UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
3398 if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
3399 (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
3400 sameCollAttribute = FALSE;
3401 }
3402
b75a7d8f
A
3403 strsrch->strength = ucol_getStrength(strsrch->collator);
3404 ceMask = getMask(strsrch->strength);
3405 if (strsrch->ceMask != ceMask) {
3406 strsrch->ceMask = ceMask;
3407 sameCollAttribute = FALSE;
3408 }
729e4ab9 3409
b75a7d8f 3410 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
729e4ab9 3411 shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
b75a7d8f
A
3412 &status) == UCOL_SHIFTED;
3413 if (strsrch->toShift != shift) {
3414 strsrch->toShift = shift;
3415 sameCollAttribute = FALSE;
3416 }
3417
3418 // if status is a failure, ucol_getVariableTop returns 0
3419 varTop = ucol_getVariableTop(strsrch->collator, &status);
3420 if (strsrch->variableTop != varTop) {
3421 strsrch->variableTop = varTop;
3422 sameCollAttribute = FALSE;
3423 }
3424 if (!sameCollAttribute) {
3425 initialize(strsrch, &status);
3426 }
57a6839d 3427 ucol_setText(strsrch->textIter, strsrch->search->text,
729e4ab9 3428 strsrch->search->textLength,
729e4ab9 3429 &status);
b75a7d8f
A
3430 strsrch->search->matchedLength = 0;
3431 strsrch->search->matchedIndex = USEARCH_DONE;
3432 strsrch->search->isOverlap = FALSE;
3433 strsrch->search->isCanonicalMatch = FALSE;
729e4ab9 3434 strsrch->search->elementComparisonType = 0;
b75a7d8f
A
3435 strsrch->search->isForwardSearching = TRUE;
3436 strsrch->search->reset = TRUE;
3437 }
3438}
3439
46f4442e
A
3440//
3441// CEI Collation Element + source text index.
3442// These structs are kept in the circular buffer.
3443//
3444struct CEI {
3445 int64_t ce;
3446 int32_t lowIndex;
3447 int32_t highIndex;
3448};
3449
3450U_NAMESPACE_BEGIN
3451
57a6839d 3452namespace {
46f4442e
A
3453//
3454// CEBuffer A circular buffer of CEs from the text being searched.
3455//
4388f060
A
3456#define DEFAULT_CEBUFFER_SIZE 96
3457#define CEBUFFER_EXTRA 32
3458// Some typical max values to make buffer size more reasonable for asymmetric search.
3459// #8694 is for a better long-term solution to allocation of this buffer.
3460#define MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
3461#define MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
3462#define MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
46f4442e
A
3463struct CEBuffer {
3464 CEI defBuf[DEFAULT_CEBUFFER_SIZE];
3465 CEI *buf;
3466 int32_t bufSize;
3467 int32_t firstIx;
3468 int32_t limitIx;
3469 UCollationElements *ceIter;
3470 UStringSearch *strSearch;
3471
3472
3473
3474 CEBuffer(UStringSearch *ss, UErrorCode *status);
3475 ~CEBuffer();
3476 const CEI *get(int32_t index);
3477 const CEI *getPrevious(int32_t index);
3478};
3479
3480
3481CEBuffer::CEBuffer(UStringSearch *ss, UErrorCode *status) {
3482 buf = defBuf;
3483 strSearch = ss;
4388f060
A
3484 bufSize = ss->pattern.PCELength + CEBUFFER_EXTRA;
3485 if (ss->search->elementComparisonType != 0) {
3486 const UChar * patText = ss->pattern.text;
3487 if (patText) {
3488 const UChar * patTextLimit = patText + ss->pattern.textLength;
3489 while ( patText < patTextLimit ) {
3490 UChar c = *patText++;
3491 if (MIGHT_BE_JAMO_L(c)) {
3492 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
3493 } else {
3494 // No check for surrogates, we might allocate slightly more buffer than necessary.
3495 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
3496 }
3497 }
3498 }
3499 }
46f4442e
A
3500 ceIter = ss->textIter;
3501 firstIx = 0;
3502 limitIx = 0;
3503
57a6839d 3504 if (!initTextProcessedIter(ss, status)) { return; }
46f4442e
A
3505
3506 if (bufSize>DEFAULT_CEBUFFER_SIZE) {
3507 buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
3508 if (buf == NULL) {
3509 *status = U_MEMORY_ALLOCATION_ERROR;
3510 }
3511 }
3512}
3513
3514// TODO: add a reset or init function so that allocated
3515// buffers can be retained & reused.
3516
3517CEBuffer::~CEBuffer() {
3518 if (buf != defBuf) {
3519 uprv_free(buf);
3520 }
3521}
3522
3523
3524// Get the CE with the specified index.
3525// Index must be in the range
3526// n-history_size < index < n+1
3527// where n is the largest index to have been fetched by some previous call to this function.
3528// The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3529//
3530const CEI *CEBuffer::get(int32_t index) {
3531 int i = index % bufSize;
3532
3533 if (index>=firstIx && index<limitIx) {
3534 // The request was for an entry already in our buffer.
3535 // Just return it.
3536 return &buf[i];
3537 }
3538
3539 // Caller is requesting a new, never accessed before, CE.
3540 // Verify that it is the next one in sequence, which is all
3541 // that is allowed.
3542 if (index != limitIx) {
3543 U_ASSERT(FALSE);
3544
3545 return NULL;
3546 }
3547
3548 // Manage the circular CE buffer indexing
3549 limitIx++;
3550
3551 if (limitIx - firstIx >= bufSize) {
3552 // The buffer is full, knock out the lowest-indexed entry.
3553 firstIx++;
3554 }
3555
3556 UErrorCode status = U_ZERO_ERROR;
3557
57a6839d 3558 buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
46f4442e
A
3559
3560 return &buf[i];
3561}
3562
3563// Get the CE with the specified index.
3564// Index must be in the range
3565// n-history_size < index < n+1
3566// where n is the largest index to have been fetched by some previous call to this function.
3567// The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3568//
3569const CEI *CEBuffer::getPrevious(int32_t index) {
3570 int i = index % bufSize;
3571
3572 if (index>=firstIx && index<limitIx) {
3573 // The request was for an entry already in our buffer.
3574 // Just return it.
3575 return &buf[i];
3576 }
3577
3578 // Caller is requesting a new, never accessed before, CE.
3579 // Verify that it is the next one in sequence, which is all
3580 // that is allowed.
3581 if (index != limitIx) {
3582 U_ASSERT(FALSE);
3583
3584 return NULL;
3585 }
3586
3587 // Manage the circular CE buffer indexing
3588 limitIx++;
3589
3590 if (limitIx - firstIx >= bufSize) {
3591 // The buffer is full, knock out the lowest-indexed entry.
3592 firstIx++;
3593 }
3594
3595 UErrorCode status = U_ZERO_ERROR;
3596
57a6839d 3597 buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
46f4442e
A
3598
3599 return &buf[i];
3600}
3601
57a6839d
A
3602}
3603
46f4442e
A
3604U_NAMESPACE_END
3605
3606
3607// #define USEARCH_DEBUG
3608
3609#ifdef USEARCH_DEBUG
3610#include <stdio.h>
3611#include <stdlib.h>
3612#endif
3613
3614/*
3615 * Find the next break boundary after startIndex. If the UStringSearch object
3616 * has an external break iterator, use that. Otherwise use the internal character
3617 * break iterator.
3618 */
3619static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
3620#if 0
3621 const UChar *text = strsrch->search->text;
3622 int32_t textLen = strsrch->search->textLength;
729e4ab9 3623
46f4442e
A
3624 U_ASSERT(startIndex>=0);
3625 U_ASSERT(startIndex<=textLen);
729e4ab9 3626
46f4442e
A
3627 if (startIndex >= textLen) {
3628 return startIndex;
3629 }
3630
3631 UChar32 c;
3632 int32_t i = startIndex;
3633 U16_NEXT(text, i, textLen, c);
729e4ab9 3634
46f4442e
A
3635 // If we are on a control character, stop without looking for combining marks.
3636 // Control characters do not combine.
3637 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3638 if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
3639 return i;
3640 }
729e4ab9 3641
46f4442e
A
3642 // The initial character was not a control, and can thus accept trailing
3643 // combining characters. Advance over however many of them there are.
3644 int32_t indexOfLastCharChecked;
3645 for (;;) {
3646 indexOfLastCharChecked = i;
3647 if (i>=textLen) {
3648 break;
3649 }
3650 U16_NEXT(text, i, textLen, c);
3651 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3652 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3653 break;
3654 }
3655 }
3656 return indexOfLastCharChecked;
3657#elif !UCONFIG_NO_BREAK_ITERATION
3658 UBreakIterator *breakiterator = strsrch->search->breakIter;
3659
3660 if (breakiterator == NULL) {
3661 breakiterator = strsrch->search->internalBreakIter;
3662 }
3663
3664 if (breakiterator != NULL) {
4388f060 3665 return ubrk_following(breakiterator, startIndex);
46f4442e
A
3666 }
3667
3668 return startIndex;
3669#else
3670 // **** or should we use the original code? ****
3671 return startIndex;
3672#endif
3673
3674}
3675
3676/*
3677 * Returns TRUE if index is on a break boundary. If the UStringSearch
3678 * has an external break iterator, test using that, otherwise test
3679 * using the internal character break iterator.
3680 */
3681static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
3682#if 0
3683 const UChar *text = strsrch->search->text;
3684 int32_t textLen = strsrch->search->textLength;
729e4ab9 3685
46f4442e
A
3686 U_ASSERT(index>=0);
3687 U_ASSERT(index<=textLen);
729e4ab9 3688
46f4442e 3689 if (index>=textLen || index<=0) {
4388f060 3690 return TRUE;
46f4442e 3691 }
729e4ab9 3692
46f4442e
A
3693 // If the character at the current index is not a GRAPHEME_EXTEND
3694 // then we can not be within a combining sequence.
3695 UChar32 c;
3696 U16_GET(text, 0, index, textLen, c);
3697 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3698 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
4388f060 3699 return TRUE;
46f4442e 3700 }
729e4ab9 3701
46f4442e
A
3702 // We are at a combining mark. If the preceding character is anything
3703 // except a CONTROL, CR or LF, we are in a combining sequence.
729e4ab9 3704 U16_PREV(text, 0, index, c);
46f4442e 3705 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
729e4ab9 3706 UBool combining = !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
4388f060 3707 return !combining;
46f4442e
A
3708#elif !UCONFIG_NO_BREAK_ITERATION
3709 UBreakIterator *breakiterator = strsrch->search->breakIter;
3710
3711 if (breakiterator == NULL) {
3712 breakiterator = strsrch->search->internalBreakIter;
3713 }
3714
4388f060 3715 return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));
46f4442e
A
3716#else
3717 // **** or use the original code? ****
4388f060 3718 return TRUE;
46f4442e 3719#endif
729e4ab9 3720}
46f4442e
A
3721
3722#if 0
3723static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
3724{
3725#if !UCONFIG_NO_BREAK_ITERATION
3726 UBreakIterator *breakiterator = strsrch->search->breakIter;
3727
3728 if (breakiterator != NULL) {
3729 int32_t startindex = ubrk_first(breakiterator);
3730 int32_t endindex = ubrk_last(breakiterator);
729e4ab9 3731
46f4442e
A
3732 // out-of-range indexes are never boundary positions
3733 if (start < startindex || start > endindex ||
3734 end < startindex || end > endindex) {
3735 return FALSE;
3736 }
3737
729e4ab9 3738 return ubrk_isBoundary(breakiterator, start) &&
46f4442e
A
3739 ubrk_isBoundary(breakiterator, end);
3740 }
3741#endif
3742
3743 return TRUE;
3744}
3745#endif
3746
729e4ab9
A
3747typedef enum {
3748 U_CE_MATCH = -1,
3749 U_CE_NO_MATCH = 0,
3750 U_CE_SKIP_TARG,
3751 U_CE_SKIP_PATN
3752} UCompareCEsResult;
3753#define U_CE_LEVEL2_BASE 0x00000005
3754#define U_CE_LEVEL3_BASE 0x00050000
3755
3756static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
3757 if (targCE == patCE) {
3758 return U_CE_MATCH;
3759 }
3760 if (compareType == 0) {
3761 return U_CE_NO_MATCH;
3762 }
46f4442e 3763
729e4ab9
A
3764 int64_t targCEshifted = targCE >> 32;
3765 int64_t patCEshifted = patCE >> 32;
3766 int64_t mask;
3767
3768 mask = 0xFFFF0000;
3769 int32_t targLev1 = (int32_t)(targCEshifted & mask);
3770 int32_t patLev1 = (int32_t)(patCEshifted & mask);
3771 if ( targLev1 != patLev1 ) {
3772 if ( targLev1 == 0 ) {
3773 return U_CE_SKIP_TARG;
3774 }
3775 if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3776 return U_CE_SKIP_PATN;
3777 }
3778 return U_CE_NO_MATCH;
3779 }
3780
3781 mask = 0x0000FFFF;
3782 int32_t targLev2 = (int32_t)(targCEshifted & mask);
3783 int32_t patLev2 = (int32_t)(patCEshifted & mask);
3784 if ( targLev2 != patLev2 ) {
3785 if ( targLev2 == 0 ) {
3786 return U_CE_SKIP_TARG;
3787 }
3788 if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3789 return U_CE_SKIP_PATN;
3790 }
3791 return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
3792 U_CE_MATCH: U_CE_NO_MATCH;
3793 }
3794
3795 mask = 0xFFFF0000;
3796 int32_t targLev3 = (int32_t)(targCE & mask);
3797 int32_t patLev3 = (int32_t)(patCE & mask);
3798 if ( targLev3 != patLev3 ) {
3799 return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
3800 U_CE_MATCH: U_CE_NO_MATCH;
3801 }
3802
3803 return U_CE_MATCH;
3804}
3805
3806#if BOYER_MOORE
3807// TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
3808#endif
3809
46f4442e
A
3810U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
3811 int32_t startIdx,
3812 int32_t *matchStart,
3813 int32_t *matchLimit,
729e4ab9 3814 UErrorCode *status)
46f4442e
A
3815{
3816 if (U_FAILURE(*status)) {
3817 return FALSE;
3818 }
3819
3820 // TODO: reject search patterns beginning with a combining char.
3821
3822#ifdef USEARCH_DEBUG
3823 if (getenv("USEARCH_DEBUG") != NULL) {
3824 printf("Pattern CEs\n");
3825 for (int ii=0; ii<strsrch->pattern.CELength; ii++) {
3826 printf(" %8x", strsrch->pattern.CE[ii]);
3827 }
3828 printf("\n");
3829 }
729e4ab9 3830
46f4442e
A
3831#endif
3832 // Input parameter sanity check.
3833 // TODO: should input indicies clip to the text length
3834 // in the same way that UText does.
729e4ab9 3835 if(strsrch->pattern.CELength == 0 ||
46f4442e
A
3836 startIdx < 0 ||
3837 startIdx > strsrch->search->textLength ||
3838 strsrch->pattern.CE == NULL) {
3839 *status = U_ILLEGAL_ARGUMENT_ERROR;
3840 return FALSE;
3841 }
3842
3843 if (strsrch->pattern.PCE == NULL) {
3844 initializePatternPCETable(strsrch, status);
3845 }
3846
3847 ucol_setOffset(strsrch->textIter, startIdx, status);
3848 CEBuffer ceb(strsrch, status);
46f4442e 3849
729e4ab9
A
3850
3851 int32_t targetIx = 0;
3852 const CEI *targetCEI = NULL;
46f4442e
A
3853 int32_t patIx;
3854 UBool found;
3855
3856 int32_t mStart = -1;
3857 int32_t mLimit = -1;
3858 int32_t minLimit;
3859 int32_t maxLimit;
729e4ab9
A
3860
3861
3862
46f4442e
A
3863 // Outer loop moves over match starting positions in the
3864 // target CE space.
729e4ab9
A
3865 // Here we see the target as a sequence of collation elements, resulting from the following:
3866 // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
3867 // (for example, digraphs such as IJ may be broken into two characters).
3868 // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
3869 // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
3870 // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
3871 // CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
3872 // then the CE is deleted, so the following code sees only CEs that are relevant.
3873 // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
3874 // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
3875 // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
3876 //
46f4442e
A
3877 for(targetIx=0; ; targetIx++)
3878 {
3879 found = TRUE;
3880 // Inner loop checks for a match beginning at each
3881 // position from the outer loop.
729e4ab9
A
3882 int32_t targetIxOffset = 0;
3883 int64_t patCE = 0;
4388f060
A
3884 // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
3885 // (compared to the last CE fetched for the previous targetIx value) as we need to go
3886 // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
3887 const CEI *firstCEI = ceb.get(targetIx);
3888 if (firstCEI == NULL) {
3889 *status = U_INTERNAL_PROGRAM_ERROR;
3890 found = FALSE;
3891 break;
3892 }
3893
729e4ab9
A
3894 for (patIx=0; patIx<strsrch->pattern.PCELength; patIx++) {
3895 patCE = strsrch->pattern.PCE[patIx];
3896 targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
46f4442e 3897 // Compare CE from target string with CE from the pattern.
729e4ab9 3898 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
46f4442e 3899 // which will fail the compare, below.
729e4ab9
A
3900 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
3901 if ( ceMatch == U_CE_NO_MATCH ) {
46f4442e
A
3902 found = FALSE;
3903 break;
729e4ab9
A
3904 } else if ( ceMatch > U_CE_NO_MATCH ) {
3905 if ( ceMatch == U_CE_SKIP_TARG ) {
3906 // redo with same patCE, next targCE
3907 patIx--;
3908 targetIxOffset++;
3909 } else { // ceMatch == U_CE_SKIP_PATN
3910 // redo with same targCE, next patCE
3911 targetIxOffset--;
3912 }
46f4442e
A
3913 }
3914 }
729e4ab9 3915 targetIxOffset += strsrch->pattern.PCELength; // this is now the offset in target CE space to end of the match so far
46f4442e 3916
729e4ab9 3917 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
46f4442e
A
3918 // No match at this targetIx. Try again at the next.
3919 continue;
3920 }
3921
3922 if (!found) {
3923 // No match at all, we have run off the end of the target text.
3924 break;
3925 }
3926
3927
3928 // We have found a match in CE space.
3929 // Now determine the bounds in string index space.
3930 // There still is a chance of match failure if the CE range not correspond to
3931 // an acceptable character range.
3932 //
729e4ab9 3933 const CEI *lastCEI = ceb.get(targetIx + targetIxOffset - 1);
46f4442e 3934
46f4442e
A
3935 mStart = firstCEI->lowIndex;
3936 minLimit = lastCEI->lowIndex;
46f4442e
A
3937
3938 // Look at the CE following the match. If it is UCOL_NULLORDER the match
3939 // extended to the end of input, and the match is good.
3940
3941 // Look at the high and low indices of the CE following the match. If
3942 // they are the same it means one of two things:
3943 // 1. The match extended to the last CE from the target text, which is OK, or
3944 // 2. The last CE that was part of the match is in an expansion that extends
3945 // to the first CE after the match. In this case, we reject the match.
4388f060 3946 const CEI *nextCEI = 0;
729e4ab9 3947 if (strsrch->search->elementComparisonType == 0) {
4388f060 3948 nextCEI = ceb.get(targetIx + targetIxOffset);
729e4ab9
A
3949 maxLimit = nextCEI->lowIndex;
3950 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
3951 found = FALSE;
3952 }
3953 } else {
729e4ab9
A
3954 for ( ; ; ++targetIxOffset ) {
3955 nextCEI = ceb.get(targetIx + targetIxOffset);
3956 maxLimit = nextCEI->lowIndex;
4388f060 3957 // If we are at the end of the target too, match succeeds
729e4ab9
A
3958 if ( nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
3959 break;
3960 }
3961 // As long as the next CE has primary weight of 0,
3962 // it is part of the last target element matched by the pattern;
3963 // make sure it can be part of a match with the last patCE
3964 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
4388f060
A
3965 UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
3966 if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {
3967 found = FALSE;
3968 break;
3969 }
729e4ab9
A
3970 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
3971 // target element, but it has non-zero primary weight => match fails
3972 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
4388f060
A
3973 found = false;
3974 break;
729e4ab9
A
3975 // Else the target CE is not part of an expansion of the last matched element, match succeeds
3976 } else {
4388f060 3977 break;
729e4ab9
A
3978 }
3979 }
46f4442e 3980 }
729e4ab9 3981
46f4442e
A
3982
3983 // Check for the start of the match being within a combining sequence.
3984 // This can happen if the pattern itself begins with a combining char, and
3985 // the match found combining marks in the target text that were attached
3986 // to something else.
3987 // This type of match should be rejected for not completely consuming a
3988 // combining sequence.
4388f060 3989 if (!isBreakBoundary(strsrch, mStart)) {
46f4442e
A
3990 found = FALSE;
3991 }
3992
3993 // Check for the start of the match being within an Collation Element Expansion,
3994 // meaning that the first char of the match is only partially matched.
729e4ab9 3995 // With exapnsions, the first CE will report the index of the source
46f4442e 3996 // character, and all subsequent (expansions) CEs will report the source index of the
729e4ab9 3997 // _following_ character.
46f4442e
A
3998 int32_t secondIx = firstCEI->highIndex;
3999 if (mStart == secondIx) {
4000 found = FALSE;
4001 }
729e4ab9 4002
46f4442e
A
4003 // Advance the match end position to the first acceptable match boundary.
4004 // This advances the index over any combining charcters.
4005 mLimit = maxLimit;
4006 if (minLimit < maxLimit) {
4388f060
A
4007 // When the last CE's low index is same with its high index, the CE is likely
4008 // a part of expansion. In this case, the index is located just after the
4009 // character corresponding to the CEs compared above. If the index is right
4010 // at the break boundary, move the position to the next boundary will result
4011 // incorrect match length when there are ignorable characters exist between
4012 // the position and the next character produces CE(s). See ticket#8482.
4013 if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {
4014 mLimit = minLimit;
4015 } else {
4016 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4017 if (nba >= lastCEI->highIndex) {
4018 mLimit = nba;
4019 }
46f4442e
A
4020 }
4021 }
729e4ab9 4022
46f4442e
A
4023 #ifdef USEARCH_DEBUG
4024 if (getenv("USEARCH_DEBUG") != NULL) {
4025 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4026 }
4027 #endif
729e4ab9 4028
46f4442e 4029 // If advancing to the end of a combining sequence in character indexing space
729e4ab9 4030 // advanced us beyond the end of the match in CE space, reject this match.
46f4442e
A
4031 if (mLimit > maxLimit) {
4032 found = FALSE;
4033 }
4034
4388f060 4035 if (!isBreakBoundary(strsrch, mLimit)) {
46f4442e
A
4036 found = FALSE;
4037 }
4038
729e4ab9
A
4039 if (! checkIdentical(strsrch, mStart, mLimit)) {
4040 found = FALSE;
4041 }
4042
46f4442e
A
4043 if (found) {
4044 break;
4045 }
4046 }
4047
4048 #ifdef USEARCH_DEBUG
4049 if (getenv("USEARCH_DEBUG") != NULL) {
4050 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4051 int32_t lastToPrint = ceb.limitIx+2;
4052 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4053 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4054 }
4055 printf("\n%s\n", found? "match found" : "no match");
4056 }
4057 #endif
4058
4059 // All Done. Store back the match bounds to the caller.
4060 //
4061 if (found==FALSE) {
4062 mLimit = -1;
4063 mStart = -1;
4064 }
4065
4066 if (matchStart != NULL) {
4067 *matchStart= mStart;
4068 }
4069
4070 if (matchLimit != NULL) {
4071 *matchLimit = mLimit;
4072 }
4073
4074 return found;
4075}
4076
46f4442e
A
4077U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
4078 int32_t startIdx,
4079 int32_t *matchStart,
4080 int32_t *matchLimit,
729e4ab9 4081 UErrorCode *status)
46f4442e
A
4082{
4083 if (U_FAILURE(*status)) {
4084 return FALSE;
4085 }
4086
4087 // TODO: reject search patterns beginning with a combining char.
4088
4089#ifdef USEARCH_DEBUG
4090 if (getenv("USEARCH_DEBUG") != NULL) {
4091 printf("Pattern CEs\n");
4092 for (int ii=0; ii<strsrch->pattern.CELength; ii++) {
4093 printf(" %8x", strsrch->pattern.CE[ii]);
4094 }
4095 printf("\n");
4096 }
729e4ab9 4097
46f4442e
A
4098#endif
4099 // Input parameter sanity check.
4100 // TODO: should input indicies clip to the text length
4101 // in the same way that UText does.
729e4ab9 4102 if(strsrch->pattern.CELength == 0 ||
46f4442e
A
4103 startIdx < 0 ||
4104 startIdx > strsrch->search->textLength ||
4105 strsrch->pattern.CE == NULL) {
4106 *status = U_ILLEGAL_ARGUMENT_ERROR;
4107 return FALSE;
4108 }
4109
4110 if (strsrch->pattern.PCE == NULL) {
4111 initializePatternPCETable(strsrch, status);
4112 }
4113
4114 CEBuffer ceb(strsrch, status);
729e4ab9 4115 int32_t targetIx = 0;
46f4442e
A
4116
4117 /*
4118 * Pre-load the buffer with the CE's for the grapheme
4119 * after our starting position so that we're sure that
4120 * we can look at the CE following the match when we
4121 * check the match boundaries.
4122 *
4123 * This will also pre-fetch the first CE that we'll
4124 * consider for the match.
4125 */
4126 if (startIdx < strsrch->search->textLength) {
4127 UBreakIterator *bi = strsrch->search->internalBreakIter;
4128 int32_t next = ubrk_following(bi, startIdx);
4129
4130 ucol_setOffset(strsrch->textIter, next, status);
4131
4132 for (targetIx = 0; ; targetIx += 1) {
4133 if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
4134 break;
4135 }
4136 }
4137 } else {
4138 ucol_setOffset(strsrch->textIter, startIdx, status);
4139 }
46f4442e 4140
729e4ab9
A
4141
4142 const CEI *targetCEI = NULL;
46f4442e
A
4143 int32_t patIx;
4144 UBool found;
4145
4146 int32_t limitIx = targetIx;
4147 int32_t mStart = -1;
4148 int32_t mLimit = -1;
4149 int32_t minLimit;
4150 int32_t maxLimit;
729e4ab9
A
4151
4152
4153
46f4442e
A
4154 // Outer loop moves over match starting positions in the
4155 // target CE space.
729e4ab9
A
4156 // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
4157 // But patIx is 0 at the beginning of the pattern and increases toward the end.
4158 // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
4159 // and the beginning of the base text.
46f4442e
A
4160 for(targetIx = limitIx; ; targetIx += 1)
4161 {
4162 found = TRUE;
4388f060
A
4163 // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
4164 // (compared to the last CE fetched for the previous targetIx value) as we need to go
4165 // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
4166 const CEI *lastCEI = ceb.getPrevious(targetIx);
4167 if (lastCEI == NULL) {
4168 *status = U_INTERNAL_PROGRAM_ERROR;
4169 found = FALSE;
4170 break;
4171 }
46f4442e
A
4172 // Inner loop checks for a match beginning at each
4173 // position from the outer loop.
729e4ab9
A
4174 int32_t targetIxOffset = 0;
4175 for (patIx = strsrch->pattern.PCELength - 1; patIx >= 0; patIx -= 1) {
46f4442e
A
4176 int64_t patCE = strsrch->pattern.PCE[patIx];
4177
729e4ab9 4178 targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 - patIx + targetIxOffset);
46f4442e
A
4179 // Compare CE from target string with CE from the pattern.
4180 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
4181 // which will fail the compare, below.
729e4ab9
A
4182 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
4183 if ( ceMatch == U_CE_NO_MATCH ) {
46f4442e
A
4184 found = FALSE;
4185 break;
729e4ab9
A
4186 } else if ( ceMatch > U_CE_NO_MATCH ) {
4187 if ( ceMatch == U_CE_SKIP_TARG ) {
4188 // redo with same patCE, next targCE
4189 patIx++;
4190 targetIxOffset++;
4191 } else { // ceMatch == U_CE_SKIP_PATN
4192 // redo with same targCE, next patCE
4193 targetIxOffset--;
4194 }
46f4442e
A
4195 }
4196 }
4197
729e4ab9 4198 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
46f4442e
A
4199 // No match at this targetIx. Try again at the next.
4200 continue;
4201 }
4202
4203 if (!found) {
4204 // No match at all, we have run off the end of the target text.
4205 break;
4206 }
4207
4208
4209 // We have found a match in CE space.
4210 // Now determine the bounds in string index space.
4211 // There still is a chance of match failure if the CE range not correspond to
4212 // an acceptable character range.
4213 //
729e4ab9 4214 const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 + targetIxOffset);
46f4442e 4215 mStart = firstCEI->lowIndex;
46f4442e
A
4216
4217 // Check for the start of the match being within a combining sequence.
4218 // This can happen if the pattern itself begins with a combining char, and
4219 // the match found combining marks in the target text that were attached
4220 // to something else.
4221 // This type of match should be rejected for not completely consuming a
4222 // combining sequence.
4388f060 4223 if (!isBreakBoundary(strsrch, mStart)) {
46f4442e
A
4224 found = FALSE;
4225 }
4226
4227 // Look at the high index of the first CE in the match. If it's the same as the
4228 // low index, the first CE in the match is in the middle of an expansion.
4229 if (mStart == firstCEI->highIndex) {
4230 found = FALSE;
4231 }
729e4ab9 4232
46f4442e 4233
4388f060
A
4234 minLimit = lastCEI->lowIndex;
4235
4236 if (targetIx > 0) {
4237 // Look at the CE following the match. If it is UCOL_NULLORDER the match
4238 // extended to the end of input, and the match is good.
4239
4240 // Look at the high and low indices of the CE following the match. If
4241 // they are the same it means one of two things:
4242 // 1. The match extended to the last CE from the target text, which is OK, or
4243 // 2. The last CE that was part of the match is in an expansion that extends
4244 // to the first CE after the match. In this case, we reject the match.
4245 const CEI *nextCEI = ceb.getPrevious(targetIx - 1);
4246
4247 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
4248 found = FALSE;
4249 }
4250
4251 mLimit = maxLimit = nextCEI->lowIndex;
4252
4253 // Advance the match end position to the first acceptable match boundary.
4254 // This advances the index over any combining charcters.
4255 if (minLimit < maxLimit) {
4256 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4257
4258 if (nba >= lastCEI->highIndex) {
4259 mLimit = nba;
4260 }
46f4442e 4261 }
4388f060
A
4262
4263 // If advancing to the end of a combining sequence in character indexing space
4264 // advanced us beyond the end of the match in CE space, reject this match.
4265 if (mLimit > maxLimit) {
4266 found = FALSE;
4267 }
4268
4269 // Make sure the end of the match is on a break boundary
4270 if (!isBreakBoundary(strsrch, mLimit)) {
4271 found = FALSE;
4272 }
4273
4274 } else {
4275 // No non-ignorable CEs after this point.
4276 // The maximum position is detected by boundary after
4277 // the last non-ignorable CE. Combining sequence
4278 // across the start index will be truncated.
4279 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4280 mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
46f4442e 4281 }
729e4ab9 4282
46f4442e
A
4283 #ifdef USEARCH_DEBUG
4284 if (getenv("USEARCH_DEBUG") != NULL) {
4285 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4286 }
4287 #endif
729e4ab9 4288
46f4442e 4289
729e4ab9
A
4290 if (! checkIdentical(strsrch, mStart, mLimit)) {
4291 found = FALSE;
4292 }
4293
46f4442e
A
4294 if (found) {
4295 break;
4296 }
4297 }
4298
4299 #ifdef USEARCH_DEBUG
4300 if (getenv("USEARCH_DEBUG") != NULL) {
4301 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4302 int32_t lastToPrint = ceb.limitIx+2;
4303 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4304 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4305 }
4306 printf("\n%s\n", found? "match found" : "no match");
4307 }
4308 #endif
4309
4310 // All Done. Store back the match bounds to the caller.
4311 //
4312 if (found==FALSE) {
4313 mLimit = -1;
4314 mStart = -1;
4315 }
4316
4317 if (matchStart != NULL) {
4318 *matchStart= mStart;
4319 }
4320
4321 if (matchLimit != NULL) {
4322 *matchLimit = mLimit;
4323 }
4324
4325 return found;
4326}
4327
b75a7d8f
A
4328// internal use methods declared in usrchimp.h -----------------------------
4329
4330UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
4331{
4332 if (U_FAILURE(*status)) {
4333 setMatchNotFound(strsrch);
4334 return FALSE;
4335 }
4336
46f4442e 4337#if BOYER_MOORE
374ca955 4338 UCollationElements *coleiter = strsrch->textIter;
b75a7d8f 4339 int32_t textlength = strsrch->search->textLength;
374ca955 4340 int32_t *patternce = strsrch->pattern.CE;
b75a7d8f
A
4341 int32_t patterncelength = strsrch->pattern.CELength;
4342 int32_t textoffset = ucol_getOffset(coleiter);
4343
374ca955 4344 // status used in setting coleiter offset, since offset is checked in
729e4ab9 4345 // shiftForward before setting the coleiter offset, status never
374ca955 4346 // a failure
729e4ab9 4347 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
b75a7d8f
A
4348 patterncelength);
4349 while (textoffset <= textlength)
4350 {
4351 uint32_t patternceindex = patterncelength - 1;
374ca955 4352 int32_t targetce;
b75a7d8f 4353 UBool found = FALSE;
374ca955
A
4354 int32_t lastce = UCOL_NULLORDER;
4355
4356 setColEIterOffset(coleiter, textoffset);
4357
46f4442e 4358 for (;;) {
b75a7d8f
A
4359 // finding the last pattern ce match, imagine composite characters
4360 // for example: search for pattern A in text \u00C0
4361 // we'll have to skip \u0300 the grave first before we get to A
4362 targetce = ucol_previous(coleiter, status);
4363 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4364 found = FALSE;
4365 break;
4366 }
4367 targetce = getCE(strsrch, targetce);
729e4ab9
A
4368 if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
4369 // this is for the text \u0315\u0300 that requires
b75a7d8f
A
4370 // normalization and pattern \u0300, where \u0315 is ignorable
4371 continue;
4372 }
4373 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4374 lastce = targetce;
4375 }
729e4ab9 4376 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
b75a7d8f
A
4377 if (targetce == patternce[patternceindex]) {
4378 // the first ce can be a contraction
4379 found = TRUE;
4380 break;
4381 }
4382 if (!hasExpansion(coleiter)) {
4383 found = FALSE;
4384 break;
4385 }
4386 }
4387
46f4442e 4388 //targetce = lastce;
729e4ab9 4389
b75a7d8f 4390 while (found && patternceindex > 0) {
4388f060 4391 lastce = targetce;
b75a7d8f
A
4392 targetce = ucol_previous(coleiter, status);
4393 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4394 found = FALSE;
4395 break;
4396 }
4397 targetce = getCE(strsrch, targetce);
4398 if (targetce == UCOL_IGNORABLE) {
4399 continue;
4400 }
4401
4402 patternceindex --;
729e4ab9
A
4403 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4404 found = found && targetce == patternce[patternceindex];
b75a7d8f 4405 }
729e4ab9 4406
46f4442e 4407 targetce = lastce;
b75a7d8f
A
4408
4409 if (!found) {
374ca955
A
4410 if (U_FAILURE(*status)) {
4411 break;
4412 }
729e4ab9 4413 textoffset = shiftForward(strsrch, textoffset, lastce,
b75a7d8f
A
4414 patternceindex);
4415 // status checked at loop.
4416 patternceindex = patterncelength;
4417 continue;
4418 }
374ca955
A
4419
4420 if (checkNextExactMatch(strsrch, &textoffset, status)) {
b75a7d8f 4421 // status checked in ucol_setOffset
374ca955
A
4422 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4423 return TRUE;
b75a7d8f
A
4424 }
4425 }
4426 setMatchNotFound(strsrch);
374ca955 4427 return FALSE;
46f4442e
A
4428#else
4429 int32_t textOffset = ucol_getOffset(strsrch->textIter);
4430 int32_t start = -1;
4431 int32_t end = -1;
4432
4433 if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4434 strsrch->search->matchedIndex = start;
4435 strsrch->search->matchedLength = end - start;
4436 return TRUE;
4437 } else {
4438 setMatchNotFound(strsrch);
4439 return FALSE;
4440 }
4441#endif
b75a7d8f
A
4442}
4443
4444UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
4445{
4446 if (U_FAILURE(*status)) {
4447 setMatchNotFound(strsrch);
4448 return FALSE;
4449 }
4450
46f4442e 4451#if BOYER_MOORE
b75a7d8f
A
4452 UCollationElements *coleiter = strsrch->textIter;
4453 int32_t textlength = strsrch->search->textLength;
374ca955 4454 int32_t *patternce = strsrch->pattern.CE;
b75a7d8f 4455 int32_t patterncelength = strsrch->pattern.CELength;
374ca955 4456 int32_t textoffset = ucol_getOffset(coleiter);
729e4ab9 4457 UBool hasPatternAccents =
b75a7d8f 4458 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
729e4ab9
A
4459
4460 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
b75a7d8f
A
4461 patterncelength);
4462 strsrch->canonicalPrefixAccents[0] = 0;
4463 strsrch->canonicalSuffixAccents[0] = 0;
729e4ab9 4464
b75a7d8f
A
4465 while (textoffset <= textlength)
4466 {
4467 int32_t patternceindex = patterncelength - 1;
374ca955 4468 int32_t targetce;
b75a7d8f 4469 UBool found = FALSE;
374ca955
A
4470 int32_t lastce = UCOL_NULLORDER;
4471
4472 setColEIterOffset(coleiter, textoffset);
4473
4474 for (;;) {
b75a7d8f
A
4475 // finding the last pattern ce match, imagine composite characters
4476 // for example: search for pattern A in text \u00C0
4477 // we'll have to skip \u0300 the grave first before we get to A
4478 targetce = ucol_previous(coleiter, status);
4479 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4480 found = FALSE;
4481 break;
4482 }
4483 targetce = getCE(strsrch, targetce);
4484 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4485 lastce = targetce;
4486 }
729e4ab9 4487 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
b75a7d8f
A
4488 if (targetce == patternce[patternceindex]) {
4489 // the first ce can be a contraction
4490 found = TRUE;
4491 break;
4492 }
4493 if (!hasExpansion(coleiter)) {
4494 found = FALSE;
4495 break;
4496 }
4497 }
729e4ab9 4498
b75a7d8f
A
4499 while (found && patternceindex > 0) {
4500 targetce = ucol_previous(coleiter, status);
4501 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4502 found = FALSE;
4503 break;
4504 }
4505 targetce = getCE(strsrch, targetce);
4506 if (targetce == UCOL_IGNORABLE) {
4507 continue;
4508 }
4509
4510 patternceindex --;
729e4ab9
A
4511 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4512 found = found && targetce == patternce[patternceindex];
b75a7d8f
A
4513 }
4514
4515 // initializing the rearranged accent array
4516 if (hasPatternAccents && !found) {
4517 strsrch->canonicalPrefixAccents[0] = 0;
4518 strsrch->canonicalSuffixAccents[0] = 0;
374ca955
A
4519 if (U_FAILURE(*status)) {
4520 break;
4521 }
b75a7d8f
A
4522 found = doNextCanonicalMatch(strsrch, textoffset, status);
4523 }
4524
4525 if (!found) {
374ca955
A
4526 if (U_FAILURE(*status)) {
4527 break;
4528 }
729e4ab9 4529 textoffset = shiftForward(strsrch, textoffset, lastce,
b75a7d8f
A
4530 patternceindex);
4531 // status checked at loop
4532 patternceindex = patterncelength;
4533 continue;
4534 }
729e4ab9 4535
b75a7d8f
A
4536 if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
4537 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4538 return TRUE;
4539 }
4540 }
4541 setMatchNotFound(strsrch);
4542 return FALSE;
46f4442e
A
4543#else
4544 int32_t textOffset = ucol_getOffset(strsrch->textIter);
4545 int32_t start = -1;
4546 int32_t end = -1;
4547
4548 if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4549 strsrch->search->matchedIndex = start;
4550 strsrch->search->matchedLength = end - start;
4551 return TRUE;
4552 } else {
4553 setMatchNotFound(strsrch);
4554 return FALSE;
4555 }
4556#endif
b75a7d8f
A
4557}
4558
4559UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
4560{
4561 if (U_FAILURE(*status)) {
4562 setMatchNotFound(strsrch);
4563 return FALSE;
4564 }
4565
46f4442e 4566#if BOYER_MOORE
b75a7d8f 4567 UCollationElements *coleiter = strsrch->textIter;
374ca955 4568 int32_t *patternce = strsrch->pattern.CE;
b75a7d8f 4569 int32_t patterncelength = strsrch->pattern.CELength;
374ca955 4570 int32_t textoffset = ucol_getOffset(coleiter);
b75a7d8f
A
4571
4572 // shifting it check for setting offset
4573 // if setOffset is called previously or there was no previous match, we
4574 // leave the offset as it is.
4575 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4576 textoffset = strsrch->search->matchedIndex;
4577 }
729e4ab9
A
4578
4579 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
b75a7d8f 4580 patterncelength);
729e4ab9 4581
b75a7d8f
A
4582 while (textoffset >= 0)
4583 {
4584 int32_t patternceindex = 1;
374ca955 4585 int32_t targetce;
b75a7d8f 4586 UBool found = FALSE;
374ca955 4587 int32_t firstce = UCOL_NULLORDER;
b75a7d8f 4588
374ca955 4589 // if status is a failure, ucol_setOffset does nothing
b75a7d8f 4590 setColEIterOffset(coleiter, textoffset);
374ca955
A
4591
4592 for (;;) {
729e4ab9
A
4593 // finding the first pattern ce match, imagine composite
4594 // characters. for example: search for pattern \u0300 in text
4595 // \u00C0, we'll have to skip A first before we get to
b75a7d8f
A
4596 // \u0300 the grave accent
4597 targetce = ucol_next(coleiter, status);
4598 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4599 found = FALSE;
4600 break;
4601 }
4602 targetce = getCE(strsrch, targetce);
4603 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4604 firstce = targetce;
4605 }
46f4442e 4606 if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
b75a7d8f 4607 continue;
729e4ab9
A
4608 }
4609 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
b75a7d8f
A
4610 if (targetce == patternce[0]) {
4611 found = TRUE;
4612 break;
4613 }
4614 if (!hasExpansion(coleiter)) {
4615 // checking for accents in composite character
4616 found = FALSE;
4617 break;
4618 }
4619 }
4620
46f4442e 4621 //targetce = firstce;
729e4ab9 4622
b75a7d8f 4623 while (found && (patternceindex < patterncelength)) {
4388f060 4624 firstce = targetce;
b75a7d8f
A
4625 targetce = ucol_next(coleiter, status);
4626 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4627 found = FALSE;
4628 break;
4629 }
4630 targetce = getCE(strsrch, targetce);
4631 if (targetce == UCOL_IGNORABLE) {
4632 continue;
4633 }
4634
729e4ab9
A
4635 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4636 found = found && targetce == patternce[patternceindex];
b75a7d8f
A
4637 patternceindex ++;
4638 }
729e4ab9 4639
46f4442e 4640 targetce = firstce;
b75a7d8f
A
4641
4642 if (!found) {
374ca955
A
4643 if (U_FAILURE(*status)) {
4644 break;
4645 }
729e4ab9
A
4646
4647 textoffset = reverseShift(strsrch, textoffset, targetce,
b75a7d8f
A
4648 patternceindex);
4649 patternceindex = 0;
4650 continue;
4651 }
729e4ab9 4652
b75a7d8f
A
4653 if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
4654 setColEIterOffset(coleiter, textoffset);
4655 return TRUE;
4656 }
4657 }
4658 setMatchNotFound(strsrch);
4659 return FALSE;
46f4442e 4660#else
4388f060
A
4661 int32_t textOffset;
4662
4663 if (strsrch->search->isOverlap) {
4664 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4665 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4666 } else {
4667 // move the start position at the end of possible match
4668 initializePatternPCETable(strsrch, status);
57a6839d
A
4669 if (!initTextProcessedIter(strsrch, status)) {
4670 setMatchNotFound(strsrch);
4671 return FALSE;
4672 }
4388f060 4673 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.PCELength - 1; nPCEs++) {
57a6839d 4674 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4388f060
A
4675 if (pce == UCOL_PROCESSED_NULLORDER) {
4676 // at the end of the text
4677 break;
4678 }
4679 }
4680 if (U_FAILURE(*status)) {
4681 setMatchNotFound(strsrch);
4682 return FALSE;
4683 }
4684 textOffset = ucol_getOffset(strsrch->textIter);
4685 }
4686 } else {
4687 textOffset = ucol_getOffset(strsrch->textIter);
4688 }
4689
46f4442e
A
4690 int32_t start = -1;
4691 int32_t end = -1;
4692
4693 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4694 strsrch->search->matchedIndex = start;
4695 strsrch->search->matchedLength = end - start;
4696 return TRUE;
4697 } else {
4698 setMatchNotFound(strsrch);
4699 return FALSE;
4700 }
4701#endif
b75a7d8f
A
4702}
4703
729e4ab9 4704UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
b75a7d8f
A
4705 UErrorCode *status)
4706{
4707 if (U_FAILURE(*status)) {
4708 setMatchNotFound(strsrch);
4709 return FALSE;
4710 }
4711
46f4442e 4712#if BOYER_MOORE
b75a7d8f 4713 UCollationElements *coleiter = strsrch->textIter;
374ca955 4714 int32_t *patternce = strsrch->pattern.CE;
b75a7d8f 4715 int32_t patterncelength = strsrch->pattern.CELength;
374ca955 4716 int32_t textoffset = ucol_getOffset(coleiter);
729e4ab9 4717 UBool hasPatternAccents =
b75a7d8f 4718 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
729e4ab9 4719
b75a7d8f
A
4720 // shifting it check for setting offset
4721 // if setOffset is called previously or there was no previous match, we
4722 // leave the offset as it is.
4723 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4724 textoffset = strsrch->search->matchedIndex;
4725 }
729e4ab9
A
4726
4727 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
b75a7d8f
A
4728 patterncelength);
4729 strsrch->canonicalPrefixAccents[0] = 0;
4730 strsrch->canonicalSuffixAccents[0] = 0;
729e4ab9 4731
b75a7d8f
A
4732 while (textoffset >= 0)
4733 {
4734 int32_t patternceindex = 1;
374ca955 4735 int32_t targetce;
b75a7d8f 4736 UBool found = FALSE;
374ca955 4737 int32_t firstce = UCOL_NULLORDER;
b75a7d8f
A
4738
4739 setColEIterOffset(coleiter, textoffset);
46f4442e 4740 for (;;) {
729e4ab9
A
4741 // finding the first pattern ce match, imagine composite
4742 // characters. for example: search for pattern \u0300 in text
4743 // \u00C0, we'll have to skip A first before we get to
b75a7d8f
A
4744 // \u0300 the grave accent
4745 targetce = ucol_next(coleiter, status);
4746 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4747 found = FALSE;
4748 break;
4749 }
4750 targetce = getCE(strsrch, targetce);
4751 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4752 firstce = targetce;
4753 }
729e4ab9
A
4754
4755 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
b75a7d8f
A
4756 if (targetce == patternce[0]) {
4757 // the first ce can be a contraction
4758 found = TRUE;
4759 break;
4760 }
4761 if (!hasExpansion(coleiter)) {
4762 // checking for accents in composite character
4763 found = FALSE;
4764 break;
4765 }
4766 }
4767
4768 targetce = firstce;
729e4ab9 4769
b75a7d8f
A
4770 while (found && patternceindex < patterncelength) {
4771 targetce = ucol_next(coleiter, status);
4772 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4773 found = FALSE;
4774 break;
4775 }
4776 targetce = getCE(strsrch, targetce);
4777 if (targetce == UCOL_IGNORABLE) {
4778 continue;
4779 }
4780
729e4ab9
A
4781 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4782 found = found && targetce == patternce[patternceindex];
b75a7d8f
A
4783 patternceindex ++;
4784 }
4785
4786 // initializing the rearranged accent array
4787 if (hasPatternAccents && !found) {
4788 strsrch->canonicalPrefixAccents[0] = 0;
4789 strsrch->canonicalSuffixAccents[0] = 0;
374ca955 4790 if (U_FAILURE(*status)) {
b75a7d8f
A
4791 break;
4792 }
4793 found = doPreviousCanonicalMatch(strsrch, textoffset, status);
4794 }
4795
4796 if (!found) {
374ca955 4797 if (U_FAILURE(*status)) {
b75a7d8f
A
4798 break;
4799 }
729e4ab9 4800 textoffset = reverseShift(strsrch, textoffset, targetce,
b75a7d8f
A
4801 patternceindex);
4802 patternceindex = 0;
4803 continue;
4804 }
4805
4806 if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
4807 setColEIterOffset(coleiter, textoffset);
4808 return TRUE;
4809 }
4810 }
4811 setMatchNotFound(strsrch);
4812 return FALSE;
46f4442e 4813#else
4388f060
A
4814 int32_t textOffset;
4815
4816 if (strsrch->search->isOverlap) {
4817 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4818 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4819 } else {
4820 // move the start position at the end of possible match
4821 initializePatternPCETable(strsrch, status);
57a6839d
A
4822 if (!initTextProcessedIter(strsrch, status)) {
4823 setMatchNotFound(strsrch);
4824 return FALSE;
4825 }
4388f060 4826 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.PCELength - 1; nPCEs++) {
57a6839d 4827 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4388f060
A
4828 if (pce == UCOL_PROCESSED_NULLORDER) {
4829 // at the end of the text
4830 break;
4831 }
4832 }
4833 if (U_FAILURE(*status)) {
4834 setMatchNotFound(strsrch);
4835 return FALSE;
4836 }
4837 textOffset = ucol_getOffset(strsrch->textIter);
4838 }
4839 } else {
4840 textOffset = ucol_getOffset(strsrch->textIter);
4841 }
4842
46f4442e
A
4843 int32_t start = -1;
4844 int32_t end = -1;
4845
4846 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4847 strsrch->search->matchedIndex = start;
4848 strsrch->search->matchedLength = end - start;
4849 return TRUE;
4850 } else {
4851 setMatchNotFound(strsrch);
4852 return FALSE;
4853 }
4854#endif
b75a7d8f
A
4855}
4856
4857#endif /* #if !UCONFIG_NO_COLLATION */