]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/usearch.cpp
ICU-8.11.tar.gz
[apple/icu.git] / icuSources / i18n / usearch.cpp
CommitLineData
b75a7d8f
A
1/*
2**********************************************************************
73c04bcf 3* Copyright (C) 2001-2006 IBM and others. All rights reserved.
b75a7d8f
A
4**********************************************************************
5* Date Name Description
6* 07/02/2001 synwee Creation.
7**********************************************************************
8*/
9
10#include "unicode/utypes.h"
11
12#if !UCONFIG_NO_COLLATION
13
14#include "unicode/usearch.h"
15#include "unicode/ustring.h"
16#include "unicode/uchar.h"
17#include "unormimp.h"
18#include "ucol_imp.h"
19#include "usrchimp.h"
20#include "cmemory.h"
374ca955 21#include "ucln_in.h"
b75a7d8f 22
73c04bcf
A
23#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
24
b75a7d8f
A
25// internal definition ---------------------------------------------------
26
27#define LAST_BYTE_MASK_ 0xFF
28#define SECOND_LAST_BYTE_SHIFT_ 8
29#define SUPPLEMENTARY_MIN_VALUE_ 0x10000
30
31static const uint16_t *FCD_ = NULL;
32
33// internal methods -------------------------------------------------
34
35/**
36* Fast collation element iterator setOffset.
37* This function does not check for bounds.
38* @param coleiter collation element iterator
39* @param offset to set
40*/
41static
42inline void setColEIterOffset(UCollationElements *elems,
43 int32_t offset)
44{
374ca955
A
45 collIterate *ci = &(elems->iteratordata_);
46 ci->pos = ci->string + offset;
47 ci->CEpos = ci->toReturn = ci->CEs;
48 if (ci->flags & UCOL_ITER_INNORMBUF) {
49 ci->flags = ci->origFlags;
50 }
51 ci->fcdPosition = NULL;
b75a7d8f
A
52}
53
54/**
55* Getting the mask for collation strength
56* @param strength collation strength
57* @return collation element mask
58*/
59static
60inline uint32_t getMask(UCollationStrength strength)
61{
62 switch (strength)
63 {
64 case UCOL_PRIMARY:
65 return UCOL_PRIMARYORDERMASK;
66 case UCOL_SECONDARY:
67 return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
68 default:
69 return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
70 UCOL_PRIMARYORDERMASK;
71 }
72}
73
74/**
75* This is to squeeze the 21bit ces into a 256 table
76* @param ce collation element
77* @return collapsed version of the collation element
78*/
79static
80inline int hash(uint32_t ce)
81{
82 // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
83 // well with the new collation where most of the latin 1 characters
84 // are of the value xx000xxx. their hashes will most of the time be 0
85 // to be discussed on the hash algo.
86 return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_;
87}
88
374ca955
A
89U_CDECL_BEGIN
90static UBool U_CALLCONV
91usearch_cleanup(void) {
92 FCD_ = NULL;
93 return TRUE;
94}
95U_CDECL_END
96
b75a7d8f
A
97/**
98* Initializing the fcd tables.
99* Internal method, status assumed to be a success.
100* @param status output error if any, caller to check status before calling
101* method, status assumed to be success when passed in.
102*/
103static
104inline void initializeFCD(UErrorCode *status)
105{
106 if (FCD_ == NULL) {
107 FCD_ = unorm_getFCDTrie(status);
374ca955 108 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
b75a7d8f
A
109 }
110}
111
112/**
113* Gets the fcd value for a character at the argument index.
114* This method takes into accounts of the supplementary characters.
115* @param str UTF16 string where character for fcd retrieval resides
116* @param offset position of the character whose fcd is to be retrieved, to be
117* overwritten with the next character position, taking
118* surrogate characters into consideration.
119* @param strlength length of the argument string
120* @return fcd value
121*/
122static
73c04bcf 123uint16_t getFCD(const UChar *str, int32_t *offset,
b75a7d8f
A
124 int32_t strlength)
125{
126 int32_t temp = *offset;
127 uint16_t result;
128 UChar ch = str[temp];
129 result = unorm_getFCD16(FCD_, ch);
130 temp ++;
131
132 if (result && temp != strlength && UTF_IS_FIRST_SURROGATE(ch)) {
133 ch = str[temp];
134 if (UTF_IS_SECOND_SURROGATE(ch)) {
135 result = unorm_getFCD16FromSurrogatePair(FCD_, result, ch);
136 temp ++;
137 } else {
138 result = 0;
139 }
140 }
141 *offset = temp;
142 return result;
143}
144
145/**
146* Getting the modified collation elements taking into account the collation
147* attributes
148* @param strsrch string search data
149* @param sourcece
150* @return the modified collation element
151*/
152static
374ca955 153inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
b75a7d8f
A
154{
155 // note for tertiary we can't use the collator->tertiaryMask, that
156 // is a preprocessed mask that takes into account case options. since
157 // we are only concerned with exact matches, we don't need that.
158 sourcece &= strsrch->ceMask;
159
160 if (strsrch->toShift) {
161 // alternate handling here, since only the 16 most significant digits
162 // is only used, we can safely do a compare without masking
163 // if the ce is a variable, we mask and get only the primary values
164 // no shifting to quartenary is required since all primary values
165 // less than variabletop will need to be masked off anyway.
166 if (strsrch->variableTop > sourcece) {
167 if (strsrch->strength == UCOL_QUATERNARY) {
168 sourcece &= UCOL_PRIMARYORDERMASK;
169 }
170 else {
171 sourcece = UCOL_IGNORABLE;
172 }
173 }
174 }
175
176 return sourcece;
177}
178
179/**
180* Allocate a memory and returns NULL if it failed.
181* Internal method, status assumed to be a success.
182* @param size to allocate
183* @param status output error if any, caller to check status before calling
184* method, status assumed to be success when passed in.
185* @return newly allocated array, NULL otherwise
186*/
187static
188inline void * allocateMemory(uint32_t size, UErrorCode *status)
189{
190 uint32_t *result = (uint32_t *)uprv_malloc(size);
191 if (result == NULL) {
192 *status = U_MEMORY_ALLOCATION_ERROR;
193 }
194 return result;
195}
196
197/**
198* Adds a uint32_t value to a destination array.
199* Creates a new array if we run out of space. The caller will have to
200* manually deallocate the newly allocated array.
201* Internal method, status assumed to be success, caller has to check status
202* before calling this method. destination not to be NULL and has at least
203* size destinationlength.
204* @param destination target array
205* @param offset destination offset to add value
206* @param destinationlength target array size, return value for the new size
207* @param value to be added
208* @param increments incremental size expected
209* @param status output error if any, caller to check status before calling
210* method, status assumed to be success when passed in.
211* @return new destination array, destination if there was no new allocation
212*/
213static
374ca955
A
214inline int32_t * addTouint32_tArray(int32_t *destination,
215 uint32_t offset,
216 uint32_t *destinationlength,
217 uint32_t value,
218 uint32_t increments,
219 UErrorCode *status)
b75a7d8f
A
220{
221 uint32_t newlength = *destinationlength;
222 if (offset + 1 == newlength) {
223 newlength += increments;
374ca955
A
224 int32_t *temp = (int32_t *)allocateMemory(
225 sizeof(int32_t) * newlength, status);
b75a7d8f
A
226 if (U_FAILURE(*status)) {
227 return NULL;
228 }
374ca955 229 uprv_memcpy(temp, destination, sizeof(int32_t) * offset);
b75a7d8f
A
230 *destinationlength = newlength;
231 destination = temp;
232 }
233 destination[offset] = value;
234 return destination;
235}
236
237/**
238* Initializing the ce table for a pattern.
239* Stores non-ignorable collation keys.
240* Table size will be estimated by the size of the pattern text. Table
241* expansion will be perform as we go along. Adding 1 to ensure that the table
242* size definitely increases.
243* Internal method, status assumed to be a success.
244* @param strsrch string search data
245* @param status output error if any, caller to check status before calling
246* method, status assumed to be success when passed in.
247* @return total number of expansions
248*/
249static
250inline uint16_t initializePatternCETable(UStringSearch *strsrch,
251 UErrorCode *status)
252{
253 UPattern *pattern = &(strsrch->pattern);
254 uint32_t cetablesize = INITIAL_ARRAY_SIZE_;
374ca955 255 int32_t *cetable = pattern->CEBuffer;
b75a7d8f
A
256 uint32_t patternlength = pattern->textLength;
257 UCollationElements *coleiter = strsrch->utilIter;
258
259 if (coleiter == NULL) {
260 coleiter = ucol_openElements(strsrch->collator, pattern->text,
261 patternlength, status);
262 // status will be checked in ucol_next(..) later and if it is an
263 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
264 // returned.
265 strsrch->utilIter = coleiter;
266 }
267 else {
268 uprv_init_collIterate(strsrch->collator, pattern->text,
269 pattern->textLength,
270 &coleiter->iteratordata_);
271 }
272
273 if (pattern->CE != cetable && pattern->CE) {
274 uprv_free(pattern->CE);
275 }
276
277 uint16_t offset = 0;
278 uint16_t result = 0;
374ca955 279 int32_t ce;
b75a7d8f
A
280
281 while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
282 U_SUCCESS(*status)) {
283 uint32_t newce = getCE(strsrch, ce);
284 if (newce) {
374ca955 285 int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
b75a7d8f
A
286 newce,
287 patternlength - ucol_getOffset(coleiter) + 1,
288 status);
289 if (U_FAILURE(*status)) {
290 return 0;
291 }
292 offset ++;
293 if (cetable != temp && cetable != pattern->CEBuffer) {
294 uprv_free(cetable);
295 }
296 cetable = temp;
297 }
298 result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
299 }
300
301 cetable[offset] = 0;
302 pattern->CE = cetable;
303 pattern->CELength = offset;
304
305 return result;
306}
307
308/**
309* Initializes the pattern struct.
310* Internal method, status assumed to be success.
311* @param strsrch UStringSearch data storage
312* @param status output error if any, caller to check status before calling
313* method, status assumed to be success when passed in.
314* @return expansionsize the total expansion size of the pattern
315*/
316static
317inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
318{
319 UPattern *pattern = &(strsrch->pattern);
320 const UChar *patterntext = pattern->text;
321 int32_t length = pattern->textLength;
322 int32_t index = 0;
323
324 pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
325 SECOND_LAST_BYTE_SHIFT_;
326 index = length;
327 UTF_BACK_1(patterntext, 0, index);
328 pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
329 LAST_BYTE_MASK_;
330 // since intializePattern is an internal method status is a success.
331 return initializePatternCETable(strsrch, status);
332}
333
334/**
335* Initializing shift tables, with the default values.
336* If a corresponding default value is 0, the shift table is not set.
337* @param shift table for forwards shift
338* @param backshift table for backwards shift
339* @param cetable table containing pattern ce
340* @param cesize size of the pattern ces
341* @param expansionsize total size of the expansions
342* @param defaultforward the default forward value
343* @param defaultbackward the default backward value
344*/
345static
346inline void setShiftTable(int16_t shift[], int16_t backshift[],
374ca955 347 int32_t *cetable, int32_t cesize,
b75a7d8f
A
348 int16_t expansionsize,
349 int16_t defaultforward,
350 int16_t defaultbackward)
351{
352 // estimate the value to shift. to do that we estimate the smallest
353 // number of characters to give the relevant ces, ie approximately
354 // the number of ces minus their expansion, since expansions can come
355 // from a character.
356 int32_t count;
357 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
358 shift[count] = defaultforward;
359 }
360 cesize --; // down to the last index
361 for (count = 0; count < cesize; count ++) {
362 // number of ces from right of array to the count
363 int temp = defaultforward - count - 1;
364 shift[hash(cetable[count])] = temp > 1 ? temp : 1;
365 }
366 shift[hash(cetable[cesize])] = 1;
367 // for ignorables we just shift by one. see test examples.
368 shift[hash(0)] = 1;
369
370 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
371 backshift[count] = defaultbackward;
372 }
373 for (count = cesize; count > 0; count --) {
374 // the original value count does not seem to work
375 backshift[hash(cetable[count])] = count > expansionsize ?
376 (int16_t)(count - expansionsize) : 1;
377 }
378 backshift[hash(cetable[0])] = 1;
379 backshift[hash(0)] = 1;
380}
381
382/**
383* Building of the pattern collation element list and the boyer moore strsrch
384* table.
385* The canonical match will only be performed after the default match fails.
386* For both cases we need to remember the size of the composed and decomposed
387* versions of the string. Since the Boyer-Moore shift calculations shifts by
388* a number of characters in the text and tries to match the pattern from that
389* offset, the shift value can not be too large in case we miss some
390* characters. To choose a right shift size, we estimate the NFC form of the
391* and use its size as a shift guide. The NFC form should be the small
392* possible representation of the pattern. Anyways, we'll err on the smaller
393* shift size. Hence the calculation for minlength.
394* Canonical match will be performed slightly differently. We'll split the
395* pattern into 3 parts, the prefix accents (PA), the middle string bounded by
396* the first and last base character (MS), the ending accents (EA). Matches
397* will be done on MS first, and only when we match MS then some processing
398* will be required for the prefix and end accents in order to determine if
399* they match PA and EA. Hence the default shift values
400* for the canonical match will take the size of either end's accent into
401* consideration. Forwards search will take the end accents into consideration
402* for the default shift values and the backwards search will take the prefix
403* accents into consideration.
404* If pattern has no non-ignorable ce, we return a illegal argument error.
405* Internal method, status assumed to be success.
406* @param strsrch UStringSearch data storage
407* @param status for output errors if it occurs, status is assumed to be a
408* success when it is passed in.
409*/
410static
411inline void initialize(UStringSearch *strsrch, UErrorCode *status)
412{
413 int16_t expandlength = initializePattern(strsrch, status);
414 if (U_SUCCESS(*status) && strsrch->pattern.CELength > 0) {
415 UPattern *pattern = &strsrch->pattern;
416 int32_t cesize = pattern->CELength;
417
418 int16_t minlength = cesize > expandlength
374ca955 419 ? (int16_t)cesize - expandlength : 1;
b75a7d8f
A
420 pattern->defaultShiftSize = minlength;
421 setShiftTable(pattern->shift, pattern->backShift, pattern->CE,
422 cesize, expandlength, minlength, minlength);
423 return;
424 }
425 strsrch->pattern.defaultShiftSize = 0;
426}
427
428/**
429* Determine whether the target text in UStringSearch bounded by the offset
430* start and end is one or more whole units of text as
431* determined by the breakiterator in UStringSearch.
432* @param strsrch string search data
433* @param start target text start offset
434* @param end target text end offset
435*/
436static
73c04bcf 437UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
b75a7d8f
A
438 int32_t end)
439{
440#if !UCONFIG_NO_BREAK_ITERATION
441 UBreakIterator *breakiterator = strsrch->search->breakIter;
442 if (breakiterator) {
443 int32_t startindex = ubrk_first(breakiterator);
444 int32_t endindex = ubrk_last(breakiterator);
445
446 // out-of-range indexes are never boundary positions
447 if (start < startindex || start > endindex ||
448 end < startindex || end > endindex) {
449 return FALSE;
450 }
451 // otherwise, we can use following() on the position before the
452 // specified one and return true of the position we get back is the
453 // one the user specified
454 UBool result = (start == startindex ||
455 ubrk_following(breakiterator, start - 1) == start) &&
456 (end == endindex ||
457 ubrk_following(breakiterator, end - 1) == end);
458 if (result) {
459 // iterates the individual ces
460 UCollationElements *coleiter = strsrch->utilIter;
461 const UChar *text = strsrch->search->text +
462 start;
463 UErrorCode status = U_ZERO_ERROR;
464 ucol_setText(coleiter, text, end - start, &status);
465 for (int32_t count = 0; count < strsrch->pattern.CELength;
466 count ++) {
374ca955 467 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
b75a7d8f
A
468 if (ce == UCOL_IGNORABLE) {
469 count --;
470 continue;
471 }
472 if (U_FAILURE(status) || ce != strsrch->pattern.CE[count]) {
473 return FALSE;
474 }
475 }
374ca955 476 int32_t nextce = ucol_next(coleiter, &status);
b75a7d8f
A
477 while (ucol_getOffset(coleiter) == (end - start)
478 && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
479 nextce = ucol_next(coleiter, &status);
480 }
481 if (ucol_getOffset(coleiter) == (end - start)
482 && nextce != UCOL_NULLORDER) {
483 // extra collation elements at the end of the match
484 return FALSE;
485 }
486 }
487 return result;
488 }
489#endif
490 return TRUE;
491}
492
493/**
494* Getting the next base character offset if current offset is an accent,
495* or the current offset if the current character contains a base character.
496* accents the following base character will be returned
497* @param text string
498* @param textoffset current offset
499* @param textlength length of text string
500* @return the next base character or the current offset
501* if the current character is contains a base character.
502*/
503static
504inline int32_t getNextBaseOffset(const UChar *text,
505 int32_t textoffset,
506 int32_t textlength)
507{
508 if (textoffset < textlength) {
509 int32_t temp = textoffset;
510 if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
511 while (temp < textlength) {
512 int32_t result = temp;
513 if ((getFCD(text, &temp, textlength) >>
514 SECOND_LAST_BYTE_SHIFT_) == 0) {
515 return result;
516 }
517 }
518 return textlength;
519 }
520 }
521 return textoffset;
522}
523
524/**
525* Gets the next base character offset depending on the string search pattern
526* data
527* @param strsrch string search data
528* @param textoffset current offset, one offset away from the last character
529* to search for.
530* @return start index of the next base character or the current offset
531* if the current character is contains a base character.
532*/
533static
534inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
535 int32_t textoffset)
536{
374ca955 537 int32_t textlength = strsrch->search->textLength;
b75a7d8f
A
538 if (strsrch->pattern.hasSuffixAccents &&
539 textoffset < textlength) {
540 int32_t temp = textoffset;
541 const UChar *text = strsrch->search->text;
542 UTF_BACK_1(text, 0, temp);
543 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
544 return getNextBaseOffset(text, textoffset, textlength);
545 }
546 }
547 return textoffset;
548}
549
550/**
551* Shifting the collation element iterator position forward to prepare for
552* a following match. If the last character is a unsafe character, we'll only
553* shift by 1 to capture contractions, normalization etc.
554* Internal method, status assumed to be success.
555* @param text strsrch string search data
556* @param textoffset start text position to do search
557* @param ce the text ce which failed the match.
558* @param patternceindex index of the ce within the pattern ce buffer which
559* failed the match
560* @return final offset
561*/
562static
563inline int32_t shiftForward(UStringSearch *strsrch,
564 int32_t textoffset,
374ca955 565 int32_t ce,
b75a7d8f
A
566 int32_t patternceindex)
567{
374ca955 568 UPattern *pattern = &(strsrch->pattern);
b75a7d8f
A
569 if (ce != UCOL_NULLORDER) {
570 int32_t shift = pattern->shift[hash(ce)];
571 // this is to adjust for characters in the middle of the
572 // substring for matching that failed.
573 int32_t adjust = pattern->CELength - patternceindex;
574 if (adjust > 1 && shift >= adjust) {
575 shift -= adjust - 1;
576 }
577 textoffset += shift;
578 }
579 else {
580 textoffset += pattern->defaultShiftSize;
581 }
582
583 textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
584 // check for unsafe characters
585 // * if it is the start or middle of a contraction: to be done after
586 // a initial match is found
587 // * thai or lao base consonant character: similar to contraction
588 // * high surrogate character: similar to contraction
589 // * next character is a accent: shift to the next base character
590 return textoffset;
591}
592
593/**
594* sets match not found
595* @param strsrch string search data
596*/
597static
598inline void setMatchNotFound(UStringSearch *strsrch)
599{
600 // this method resets the match result regardless of the error status.
601 strsrch->search->matchedIndex = USEARCH_DONE;
602 strsrch->search->matchedLength = 0;
603 if (strsrch->search->isForwardSearching) {
604 setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
605 }
606 else {
607 setColEIterOffset(strsrch->textIter, 0);
608 }
609}
610
611/**
612* Gets the offset to the next safe point in text.
613* ie. not the middle of a contraction, swappable characters or supplementary
614* characters.
615* @param collator collation sata
616* @param text string to work with
617* @param textoffset offset in string
618* @param textlength length of text string
619* @return offset to the next safe character
620*/
621static
622inline int32_t getNextSafeOffset(const UCollator *collator,
623 const UChar *text,
624 int32_t textoffset,
625 int32_t textlength)
626{
627 int32_t result = textoffset; // first contraction character
628 while (result != textlength && ucol_unsafeCP(text[result], collator)) {
629 result ++;
630 }
631 return result;
632}
633
634/**
635* This checks for accents in the potential match started with a .
636* composite character.
637* This is really painful... we have to check that composite character do not
638* have any extra accents. We have to normalize the potential match and find
639* the immediate decomposed character before the match.
640* The first composite character would have been taken care of by the fcd
641* checks in checkForwardExactMatch.
642* This is the slow path after the fcd of the first character and
643* the last character has been checked by checkForwardExactMatch and we
644* determine that the potential match has extra non-ignorable preceding
645* ces.
646* E.g. looking for \u0301 acute in \u01FA A ring above and acute,
647* checkExtraMatchAccent should fail since there is a middle ring in \u01FA
648* Note here that accents checking are slow and cautioned in the API docs.
649* Internal method, status assumed to be a success, caller should check status
650* before calling this method
651* @param strsrch string search data
652* @param start index of the potential unfriendly composite character
653* @param end index of the potential unfriendly composite character
654* @param status output error status if any.
655* @return TRUE if there is non-ignorable accents before at the beginning
656* of the match, FALSE otherwise.
657*/
658
659static
660UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
661 int32_t end,
662 UErrorCode *status)
663{
664 UBool result = FALSE;
665 if (strsrch->pattern.hasPrefixAccents) {
666 int32_t length = end - start;
667 int32_t offset = 0;
668 const UChar *text = strsrch->search->text + start;
669
670 UTF_FWD_1(text, offset, length);
671 // we are only concerned with the first composite character
672 if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
673 int32_t safeoffset = getNextSafeOffset(strsrch->collator,
674 text, 0, length);
675 if (safeoffset != length) {
676 safeoffset ++;
677 }
678 UChar *norm = NULL;
679 UChar buffer[INITIAL_ARRAY_SIZE_];
680 int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
681 buffer, INITIAL_ARRAY_SIZE_,
682 status);
683 if (U_FAILURE(*status)) {
684 return FALSE;
685 }
686 if (size >= INITIAL_ARRAY_SIZE_) {
687 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
688 status);
689 // if allocation failed, status will be set to
690 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
691 // checks for it.
692 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
693 size, status);
694 if (U_FAILURE(*status) && norm != NULL) {
695 uprv_free(norm);
696 return FALSE;
697 }
698 }
699 else {
700 norm = buffer;
701 }
702
703 UCollationElements *coleiter = strsrch->utilIter;
704 ucol_setText(coleiter, norm, size, status);
705 uint32_t firstce = strsrch->pattern.CE[0];
706 UBool ignorable = TRUE;
707 uint32_t ce = UCOL_IGNORABLE;
708 while (U_SUCCESS(*status) && ce != firstce) {
709 offset = ucol_getOffset(coleiter);
710 if (ce != firstce && ce != UCOL_IGNORABLE) {
711 ignorable = FALSE;
712 }
713 ce = ucol_next(coleiter, status);
714 }
715 UChar32 codepoint;
716 UTF_PREV_CHAR(norm, 0, offset, codepoint);
717 result = !ignorable && (u_getCombiningClass(codepoint) != 0);
718
719 if (norm != buffer) {
720 uprv_free(norm);
721 }
722 }
723 }
724
725 return result;
726}
727
728/**
729* Used by exact matches, checks if there are accents before the match.
730* This is really painful... we have to check that composite characters at
731* the start of the matches have to not have any extra accents.
732* We check the FCD of the character first, if it starts with an accent and
733* the first pattern ce does not match the first ce of the character, we bail.
734* Otherwise we try normalizing the first composite
735* character and find the immediate decomposed character before the match to
736* see if it is an non-ignorable accent.
737* Now normalizing the first composite character is enough because we ensure
738* that when the match is passed in here with extra beginning ces, the
739* first or last ce that match has to occur within the first character.
740* E.g. looking for \u0301 acute in \u01FA A ring above and acute,
741* checkExtraMatchAccent should fail since there is a middle ring in \u01FA
742* Note here that accents checking are slow and cautioned in the API docs.
743* @param strsrch string search data
744* @param start offset
745* @param end offset
746* @return TRUE if there are accents on either side of the match,
747* FALSE otherwise
748*/
749static
750UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
751 int32_t end)
752{
753 if (strsrch->pattern.hasPrefixAccents) {
754 UCollationElements *coleiter = strsrch->textIter;
755 UErrorCode status = U_ZERO_ERROR;
756 // we have been iterating forwards previously
757 uint32_t ignorable = TRUE;
374ca955 758 int32_t firstce = strsrch->pattern.CE[0];
b75a7d8f 759
374ca955
A
760 setColEIterOffset(coleiter, start);
761 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
762 if (U_FAILURE(status)) {
b75a7d8f
A
763 return TRUE;
764 }
765 while (ce != firstce) {
766 if (ce != UCOL_IGNORABLE) {
767 ignorable = FALSE;
768 }
769 ce = getCE(strsrch, ucol_next(coleiter, &status));
770 if (U_FAILURE(status)) {
771 return TRUE;
772 }
773 }
374ca955 774 if (!ignorable && inNormBuf(coleiter)) {
b75a7d8f 775 // within normalization buffer, discontiguous handled here
374ca955 776 return TRUE;
b75a7d8f
A
777 }
778
374ca955 779 // within text
b75a7d8f 780 int32_t temp = start;
374ca955
A
781 // original code
782 // accent = (getFCD(strsrch->search->text, &temp,
b75a7d8f 783 // strsrch->search->textLength)
374ca955
A
784 // >> SECOND_LAST_BYTE_SHIFT_);
785 // however this code does not work well with VC7 .net in release mode.
786 // maybe the inlines for getFCD combined with shifting has bugs in
787 // VC7. anyways this is a work around.
788 UBool accent = getFCD(strsrch->search->text, &temp,
b75a7d8f
A
789 strsrch->search->textLength) > 0xFF;
790 if (!accent) {
374ca955 791 return checkExtraMatchAccents(strsrch, start, end, &status);
b75a7d8f 792 }
374ca955 793 if (!ignorable) {
b75a7d8f
A
794 return TRUE;
795 }
796 if (start > 0) {
797 temp = start;
798 UTF_BACK_1(strsrch->search->text, 0, temp);
799 if (getFCD(strsrch->search->text, &temp,
800 strsrch->search->textLength) & LAST_BYTE_MASK_) {
801 setColEIterOffset(coleiter, start);
802 ce = ucol_previous(coleiter, &status);
803 if (U_FAILURE(status) ||
804 (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
805 return TRUE;
806 }
807 }
808 }
809 }
810
811 return FALSE;
812}
813
814/**
815* Used by exact matches, checks if there are accents bounding the match.
816* Note this is the initial boundary check. If the potential match
817* starts or ends with composite characters, the accents in those
818* characters will be determined later.
819* Not doing backwards iteration here, since discontiguos contraction for
820* backwards collation element iterator, use up too many characters.
821* E.g. looking for \u030A ring in \u01FA A ring above and acute,
822* should fail since there is a acute at the end of \u01FA
823* Note here that accents checking are slow and cautioned in the API docs.
824* @param strsrch string search data
825* @param start offset of match
826* @param end end offset of the match
827* @return TRUE if there are accents on either side of the match,
828* FALSE otherwise
829*/
830static
831UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
832 int32_t end)
833{
834 if (strsrch->pattern.hasSuffixAccents) {
835 const UChar *text = strsrch->search->text;
836 int32_t temp = end;
837 int32_t textlength = strsrch->search->textLength;
838 UTF_BACK_1(text, 0, temp);
839 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
374ca955 840 int32_t firstce = strsrch->pattern.CE[0];
b75a7d8f
A
841 UCollationElements *coleiter = strsrch->textIter;
842 UErrorCode status = U_ZERO_ERROR;
843 setColEIterOffset(coleiter, start);
844 while (getCE(strsrch, ucol_next(coleiter, &status)) != firstce) {
845 if (U_FAILURE(status)) {
846 return TRUE;
847 }
848 }
849 int32_t count = 1;
850 while (count < strsrch->pattern.CELength) {
851 if (getCE(strsrch, ucol_next(coleiter, &status))
852 == UCOL_IGNORABLE) {
853 // Thai can give an ignorable here.
854 count --;
855 }
856 if (U_FAILURE(status)) {
857 return TRUE;
858 }
859 count ++;
860 }
374ca955 861 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
b75a7d8f
A
862 if (U_FAILURE(status)) {
863 return TRUE;
864 }
865 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
866 if (ucol_getOffset(coleiter) <= end) {
867 return TRUE;
868 }
869 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
870 return TRUE;
871 }
872 }
873 }
874 }
875 return FALSE;
876}
877
878/**
879* Checks if the offset runs out of the text string
880* @param offset
881* @param textlength of the text string
882* @return TRUE if offset is out of bounds, FALSE otherwise
883*/
884static
885inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
886{
887 return offset < 0 || offset > textlength;
888}
889
890/**
891* Checks for identical match
892* @param strsrch string search data
893* @param start offset of possible match
894* @param end offset of possible match
895* @return TRUE if identical match is found
896*/
897static
898inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
899 int32_t end)
900{
73c04bcf 901 UChar t2[32], p2[32];
b75a7d8f
A
902 int32_t length = end - start;
903 if (strsrch->strength != UCOL_IDENTICAL) {
904 return TRUE;
905 }
906
73c04bcf
A
907 UErrorCode status = U_ZERO_ERROR, status2 = U_ZERO_ERROR;
908 int32_t decomplength = unorm_decompose(t2, LENGTHOF(t2),
b75a7d8f
A
909 strsrch->search->text + start, length,
910 FALSE, 0, &status);
73c04bcf
A
911 // use separate status2 in case of buffer overflow
912 if (decomplength != unorm_decompose(p2, LENGTHOF(p2),
913 strsrch->pattern.text,
b75a7d8f 914 strsrch->pattern.textLength,
73c04bcf
A
915 FALSE, 0, &status2)) {
916 return FALSE; // lengths are different
917 }
918
919 // compare contents
920 UChar *text, *pattern;
921 if(U_SUCCESS(status)) {
922 text = t2;
923 pattern = p2;
924 } else if(status==U_BUFFER_OVERFLOW_ERROR) {
925 status = U_ZERO_ERROR;
926 // allocate one buffer for both decompositions
927 text = (UChar *)uprv_malloc(decomplength * 2 * U_SIZEOF_UCHAR);
928 pattern = text + decomplength;
929 unorm_decompose(text, decomplength, strsrch->search->text + start,
930 length, FALSE, 0, &status);
931 unorm_decompose(pattern, decomplength, strsrch->pattern.text,
932 strsrch->pattern.textLength, FALSE, 0, &status);
933 } else {
934 // NFD failed, make sure that u_memcmp() does not overrun t2 & p2
935 // and that we don't uprv_free() an undefined text pointer
936 text = pattern = t2;
937 decomplength = 0;
938 }
939 UBool result = (UBool)(u_memcmp(pattern, text, decomplength) == 0);
940 if(text != t2) {
941 uprv_free(text);
942 }
943 // return FALSE if NFD failed
944 return U_SUCCESS(status) && result;
b75a7d8f
A
945}
946
947/**
948* Checks to see if the match is repeated
949* @param strsrch string search data
950* @param start new match start index
951* @param end new match end index
952* @return TRUE if the the match is repeated, FALSE otherwise
953*/
954static
955inline UBool checkRepeatedMatch(UStringSearch *strsrch,
956 int32_t start,
957 int32_t end)
958{
959 int32_t lastmatchindex = strsrch->search->matchedIndex;
960 UBool result;
961 if (lastmatchindex == USEARCH_DONE) {
962 return FALSE;
963 }
964 if (strsrch->search->isForwardSearching) {
965 result = start <= lastmatchindex;
966 }
967 else {
968 result = start >= lastmatchindex;
969 }
374ca955 970 if (!result && !strsrch->search->isOverlap) {
b75a7d8f
A
971 if (strsrch->search->isForwardSearching) {
972 result = start < lastmatchindex + strsrch->search->matchedLength;
973 }
974 else {
975 result = end > lastmatchindex;
976 }
977 }
978 return result;
979}
980
981/**
982* Gets the collation element iterator's current offset.
983* @param coleiter collation element iterator
984* @param forwards flag TRUE if we are moving in th forwards direction
985* @return current offset
986*/
987static
988inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
989 UBool forwards)
990{
991 int32_t result = ucol_getOffset(coleiter);
992 // intricacies of the the backwards collation element iterator
993 if (!forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
994 result ++;
995 }
996 return result;
997}
998
999/**
1000* Checks match for contraction.
1001* If the match ends with a partial contraction we fail.
1002* If the match starts too far off (because of backwards iteration) we try to
1003* chip off the extra characters depending on whether a breakiterator has
1004* been used.
1005* Internal method, error assumed to be success, caller has to check status
1006* before calling this method.
1007* @param strsrch string search data
1008* @param start offset of potential match, to be modified if necessary
1009* @param end offset of potential match, to be modified if necessary
1010* @param status output error status if any
1011* @return TRUE if match passes the contraction test, FALSE otherwise
1012*/
1013
1014static
1015UBool checkNextExactContractionMatch(UStringSearch *strsrch,
1016 int32_t *start,
1017 int32_t *end, UErrorCode *status)
1018{
1019 UCollationElements *coleiter = strsrch->textIter;
1020 int32_t textlength = strsrch->search->textLength;
1021 int32_t temp = *start;
1022 const UCollator *collator = strsrch->collator;
1023 const UChar *text = strsrch->search->text;
1024 // This part checks if either ends of the match contains potential
1025 // contraction. If so we'll have to iterate through them
374ca955
A
1026 // The start contraction needs to be checked since ucol_previous dumps
1027 // all characters till the first safe character into the buffer.
1028 // *start + 1 is used to test for the unsafe characters instead of *start
1029 // because ucol_prev takes all unsafe characters till the first safe
1030 // character ie *start. so by testing *start + 1, we can estimate if
1031 // excess prefix characters has been included in the potential search
1032 // results.
b75a7d8f
A
1033 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1034 (*start + 1 < textlength
1035 && ucol_unsafeCP(text[*start + 1], collator))) {
1036 int32_t expansion = getExpansionPrefix(coleiter);
1037 UBool expandflag = expansion > 0;
1038 setColEIterOffset(coleiter, *start);
1039 while (expansion > 0) {
1040 // getting rid of the redundant ce, caused by setOffset.
1041 // since backward contraction/expansion may have extra ces if we
1042 // are in the normalization buffer, hasAccentsBeforeMatch would
1043 // have taken care of it.
1044 // E.g. the character \u01FA will have an expansion of 3, but if
1045 // we are only looking for acute and ring \u030A and \u0301, we'll
1046 // have to skip the first ce in the expansion buffer.
1047 ucol_next(coleiter, status);
374ca955
A
1048 if (U_FAILURE(*status)) {
1049 return FALSE;
1050 }
b75a7d8f
A
1051 if (ucol_getOffset(coleiter) != temp) {
1052 *start = temp;
1053 temp = ucol_getOffset(coleiter);
1054 }
1055 expansion --;
1056 }
1057
374ca955 1058 int32_t *patternce = strsrch->pattern.CE;
b75a7d8f
A
1059 int32_t patterncelength = strsrch->pattern.CELength;
1060 int32_t count = 0;
1061 while (count < patterncelength) {
374ca955 1062 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
b75a7d8f
A
1063 if (ce == UCOL_IGNORABLE) {
1064 continue;
1065 }
1066 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1067 *start = temp;
1068 temp = ucol_getOffset(coleiter);
1069 }
1070 if (U_FAILURE(*status) || ce != patternce[count]) {
1071 (*end) ++;
1072 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1073 return FALSE;
1074 }
1075 count ++;
1076 }
1077 }
1078 return TRUE;
1079}
1080
1081/**
1082* Checks and sets the match information if found.
1083* Checks
1084* <ul>
1085* <li> the potential match does not repeat the previous match
1086* <li> boundaries are correct
1087* <li> exact matches has no extra accents
1088* <li> identical matchesb
1089* <li> potential match does not end in the middle of a contraction
1090* <\ul>
1091* Otherwise the offset will be shifted to the next character.
1092* Internal method, status assumed to be success, caller has to check status
1093* before calling this method.
1094* @param strsrch string search data
1095* @param textoffset offset in the collation element text. the returned value
1096* will be the truncated end offset of the match or the new start
1097* search offset.
1098* @param status output error status if any
1099* @return TRUE if the match is valid, FALSE otherwise
1100*/
1101static
1102inline UBool checkNextExactMatch(UStringSearch *strsrch,
1103 int32_t *textoffset, UErrorCode *status)
1104{
1105 UCollationElements *coleiter = strsrch->textIter;
1106 int32_t start = getColElemIterOffset(coleiter, FALSE);
1107
374ca955
A
1108 if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
1109 return FALSE;
b75a7d8f
A
1110 }
1111
1112 // this totally matches, however we need to check if it is repeating
1113 if (!isBreakUnit(strsrch, start, *textoffset) ||
1114 checkRepeatedMatch(strsrch, start, *textoffset) ||
1115 hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
1116 !checkIdentical(strsrch, start, *textoffset) ||
1117 hasAccentsAfterMatch(strsrch, start, *textoffset)) {
374ca955
A
1118
1119 (*textoffset) ++;
b75a7d8f 1120 *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
374ca955 1121 return FALSE;
b75a7d8f
A
1122 }
1123
1124 // totally match, we will get rid of the ending ignorables.
1125 strsrch->search->matchedIndex = start;
1126 strsrch->search->matchedLength = *textoffset - start;
374ca955 1127 return TRUE;
b75a7d8f
A
1128}
1129
1130/**
1131* Getting the previous base character offset, or the current offset if the
1132* current character is a base character
1133* @param text string
1134* @param textoffset one offset after the current character
1135* @return the offset of the next character after the base character or the first
1136* composed character with accents
1137*/
1138static
1139inline int32_t getPreviousBaseOffset(const UChar *text,
1140 int32_t textoffset)
1141{
1142 if (textoffset > 0) {
1143 while (TRUE) {
1144 int32_t result = textoffset;
1145 UTF_BACK_1(text, 0, textoffset);
1146 int32_t temp = textoffset;
1147 uint16_t fcd = getFCD(text, &temp, result);
1148 if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1149 if (fcd & LAST_BYTE_MASK_) {
1150 return textoffset;
1151 }
1152 return result;
1153 }
1154 if (textoffset == 0) {
1155 return 0;
1156 }
1157 }
1158 }
1159 return textoffset;
1160}
1161
1162/**
1163* Getting the indexes of the accents that are not blocked in the argument
1164* accent array
1165* @param accents array of accents in nfd terminated by a 0.
1166* @param accentsindex array of indexes of the accents that are not blocked
1167*/
1168static
1169inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
1170{
1171 int32_t index = 0;
1172 int32_t length = u_strlen(accents);
1173 UChar32 codepoint = 0;
1174 int cclass = 0;
1175 int result = 0;
1176 int32_t temp;
1177 while (index < length) {
1178 temp = index;
1179 UTF_NEXT_CHAR(accents, index, length, codepoint);
1180 if (u_getCombiningClass(codepoint) != cclass) {
1181 cclass = u_getCombiningClass(codepoint);
1182 accentsindex[result] = temp;
1183 result ++;
1184 }
1185 }
1186 accentsindex[result] = length;
1187 return result;
1188}
1189
1190/**
1191* Appends 3 UChar arrays to a destination array.
1192* Creates a new array if we run out of space. The caller will have to
1193* manually deallocate the newly allocated array.
1194* Internal method, status assumed to be success, caller has to check status
1195* before calling this method. destination not to be NULL and has at least
1196* size destinationlength.
1197* @param destination target array
1198* @param destinationlength target array size, returning the appended length
1199* @param source1 null-terminated first array
1200* @param source2 second array
1201* @param source2length length of seond array
1202* @param source3 null-terminated third array
1203* @param status error status if any
1204* @return new destination array, destination if there was no new allocation
1205*/
1206static
1207inline UChar * addToUCharArray( UChar *destination,
1208 int32_t *destinationlength,
1209 const UChar *source1,
1210 const UChar *source2,
1211 int32_t source2length,
1212 const UChar *source3,
1213 UErrorCode *status)
1214{
1215 int32_t source1length = source1 ? u_strlen(source1) : 0;
1216 int32_t source3length = source3 ? u_strlen(source3) : 0;
1217 if (*destinationlength < source1length + source2length + source3length +
1218 1)
1219 {
1220 destination = (UChar *)allocateMemory(
1221 (source1length + source2length + source3length + 1) * sizeof(UChar),
1222 status);
1223 // if error allocating memory, status will be
1224 // U_MEMORY_ALLOCATION_ERROR
1225 if (U_FAILURE(*status)) {
1226 *destinationlength = 0;
1227 return NULL;
1228 }
1229 }
1230 if (source1length != 0) {
1231 uprv_memcpy(destination, source1, sizeof(UChar) * source1length);
1232 }
1233 if (source2length != 0) {
1234 uprv_memcpy(destination + source1length, source2,
1235 sizeof(UChar) * source2length);
1236 }
1237 if (source3length != 0) {
1238 uprv_memcpy(destination + source1length + source2length, source3,
1239 sizeof(UChar) * source3length);
1240 }
1241 *destinationlength = source1length + source2length + source3length;
1242 return destination;
1243}
1244
1245/**
1246* Running through a collation element iterator to see if the contents matches
1247* pattern in string search data
1248* @param strsrch string search data
1249* @param coleiter collation element iterator
1250* @return TRUE if a match if found, FALSE otherwise
1251*/
1252static
1253inline UBool checkCollationMatch(const UStringSearch *strsrch,
1254 UCollationElements *coleiter)
1255{
1256 int patternceindex = strsrch->pattern.CELength;
374ca955 1257 int32_t *patternce = strsrch->pattern.CE;
b75a7d8f
A
1258 UErrorCode status = U_ZERO_ERROR;
1259 while (patternceindex > 0) {
374ca955 1260 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
b75a7d8f
A
1261 if (ce == UCOL_IGNORABLE) {
1262 continue;
1263 }
1264 if (U_FAILURE(status) || ce != *patternce) {
1265 return FALSE;
1266 }
1267 patternce ++;
1268 patternceindex --;
1269 }
1270 return TRUE;
1271}
1272
1273/**
1274* Rearranges the front accents to try matching.
1275* Prefix accents in the text will be grouped according to their combining
1276* class and the groups will be mixed and matched to try find the perfect
1277* match with the pattern.
1278* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1279* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1280* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1281* "\u0301\u0325".
1282* step 2: check if any of the generated substrings matches the pattern.
1283* Internal method, status is assumed to be success, caller has to check status
1284* before calling this method.
1285* @param strsrch string search match
1286* @param start first offset of the accents to start searching
1287* @param end start of the last accent set
1288* @param status output error status if any
1289* @return USEARCH_DONE if a match is not found, otherwise return the starting
1290* offset of the match. Note this start includes all preceding accents.
1291*/
1292static
1293int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
1294 int32_t start,
1295 int32_t end,
1296 UErrorCode *status)
1297{
1298 const UChar *text = strsrch->search->text;
1299 int32_t textlength = strsrch->search->textLength;
1300 int32_t tempstart = start;
1301
1302 if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
1303 // die... failed at a base character
1304 return USEARCH_DONE;
1305 }
1306
1307 int32_t offset = getNextBaseOffset(text, tempstart, textlength);
1308 start = getPreviousBaseOffset(text, tempstart);
1309
1310 UChar accents[INITIAL_ARRAY_SIZE_];
1311 // normalizing the offensive string
1312 unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
1313 INITIAL_ARRAY_SIZE_, status);
1314 if (U_FAILURE(*status)) {
1315 return USEARCH_DONE;
1316 }
1317
1318 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1319 int32_t accentsize = getUnblockedAccentIndex(accents,
1320 accentsindex);
374ca955 1321 int32_t count = (2 << (accentsize - 1)) - 1;
b75a7d8f
A
1322 UChar buffer[INITIAL_ARRAY_SIZE_];
1323 UCollationElements *coleiter = strsrch->utilIter;
1324 while (U_SUCCESS(*status) && count > 0) {
1325 UChar *rearrange = strsrch->canonicalPrefixAccents;
1326 // copy the base characters
1327 for (int k = 0; k < accentsindex[0]; k ++) {
1328 *rearrange ++ = accents[k];
1329 }
1330 // forming all possible canonical rearrangement by dropping
1331 // sets of accents
1332 for (int i = 0; i <= accentsize - 1; i ++) {
1333 int32_t mask = 1 << (accentsize - i - 1);
1334 if (count & mask) {
1335 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1336 *rearrange ++ = accents[j];
1337 }
1338 }
1339 }
1340 *rearrange = 0;
1341 int32_t matchsize = INITIAL_ARRAY_SIZE_;
1342 UChar *match = addToUCharArray(buffer, &matchsize,
1343 strsrch->canonicalPrefixAccents,
1344 strsrch->search->text + offset,
1345 end - offset,
1346 strsrch->canonicalSuffixAccents,
1347 status);
1348
1349 // if status is a failure, ucol_setText does nothing.
1350 // run the collator iterator through this match
1351 ucol_setText(coleiter, match, matchsize, status);
1352 if (U_SUCCESS(*status)) {
1353 if (checkCollationMatch(strsrch, coleiter)) {
1354 if (match != buffer) {
1355 uprv_free(match);
1356 }
1357 return start;
1358 }
1359 }
1360 count --;
1361 }
1362 return USEARCH_DONE;
1363}
1364
1365/**
1366* Gets the offset to the safe point in text before textoffset.
1367* ie. not the middle of a contraction, swappable characters or supplementary
1368* characters.
1369* @param collator collation sata
1370* @param text string to work with
1371* @param textoffset offset in string
1372* @param textlength length of text string
1373* @return offset to the previous safe character
1374*/
1375static
1376inline uint32_t getPreviousSafeOffset(const UCollator *collator,
1377 const UChar *text,
1378 int32_t textoffset)
1379{
1380 int32_t result = textoffset; // first contraction character
1381 while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
1382 result --;
1383 }
1384 if (result != 0) {
1385 // the first contraction character is consider unsafe here
1386 result --;
1387 }
1388 return result;
1389}
1390
1391/**
1392* Cleaning up after we passed the safe zone
1393* @param strsrch string search data
1394* @param safetext safe text array
1395* @param safebuffer safe text buffer
1396* @param coleiter collation element iterator for safe text
1397*/
1398static
1399inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
1400 UChar *safebuffer)
1401{
1402 if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
1403 {
1404 uprv_free(safetext);
1405 }
1406}
1407
1408/**
1409* Take the rearranged end accents and tries matching. If match failed at
1410* a seperate preceding set of accents (seperated from the rearranged on by
1411* at least a base character) then we rearrange the preceding accents and
1412* tries matching again.
1413* We allow skipping of the ends of the accent set if the ces do not match.
1414* However if the failure is found before the accent set, it fails.
1415* Internal method, status assumed to be success, caller has to check status
1416* before calling this method.
1417* @param strsrch string search data
1418* @param textoffset of the start of the rearranged accent
1419* @param status output error status if any
1420* @return USEARCH_DONE if a match is not found, otherwise return the starting
1421* offset of the match. Note this start includes all preceding accents.
1422*/
1423static
1424int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
1425 int32_t textoffset,
1426 UErrorCode *status)
1427{
1428 const UChar *text = strsrch->search->text;
1429 const UCollator *collator = strsrch->collator;
1430 int32_t safelength = 0;
1431 UChar *safetext;
1432 int32_t safetextlength;
1433 UChar safebuffer[INITIAL_ARRAY_SIZE_];
1434 UCollationElements *coleiter = strsrch->utilIter;
1435 int32_t safeoffset = textoffset;
1436
1437 if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
1438 collator)) {
1439 safeoffset = getPreviousSafeOffset(collator, text, textoffset);
1440 safelength = textoffset - safeoffset;
1441 safetextlength = INITIAL_ARRAY_SIZE_;
1442 safetext = addToUCharArray(safebuffer, &safetextlength, NULL,
1443 text + safeoffset, safelength,
1444 strsrch->canonicalSuffixAccents,
1445 status);
1446 }
1447 else {
1448 safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
1449 safetext = strsrch->canonicalSuffixAccents;
1450 }
1451
1452 // if status is a failure, ucol_setText does nothing
1453 ucol_setText(coleiter, safetext, safetextlength, status);
1454 // status checked in loop below
1455
374ca955
A
1456 int32_t *ce = strsrch->pattern.CE;
1457 int32_t celength = strsrch->pattern.CELength;
b75a7d8f
A
1458 int ceindex = celength - 1;
1459 UBool isSafe = TRUE; // indication flag for position in safe zone
1460
1461 while (ceindex >= 0) {
374ca955 1462 int32_t textce = ucol_previous(coleiter, status);
b75a7d8f
A
1463 if (U_FAILURE(*status)) {
1464 if (isSafe) {
1465 cleanUpSafeText(strsrch, safetext, safebuffer);
1466 }
1467 return USEARCH_DONE;
1468 }
1469 if (textce == UCOL_NULLORDER) {
1470 // check if we have passed the safe buffer
1471 if (coleiter == strsrch->textIter) {
1472 cleanUpSafeText(strsrch, safetext, safebuffer);
1473 return USEARCH_DONE;
1474 }
1475 cleanUpSafeText(strsrch, safetext, safebuffer);
1476 safetext = safebuffer;
1477 coleiter = strsrch->textIter;
1478 setColEIterOffset(coleiter, safeoffset);
1479 // status checked at the start of the loop
1480 isSafe = FALSE;
1481 continue;
1482 }
1483 textce = getCE(strsrch, textce);
1484 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
1485 // do the beginning stuff
1486 int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
1487 if (isSafe && failedoffset >= safelength) {
1488 // alas... no hope. failed at rearranged accent set
1489 cleanUpSafeText(strsrch, safetext, safebuffer);
1490 return USEARCH_DONE;
1491 }
1492 else {
1493 if (isSafe) {
1494 failedoffset += safeoffset;
1495 cleanUpSafeText(strsrch, safetext, safebuffer);
1496 }
1497
1498 // try rearranging the front accents
1499 int32_t result = doNextCanonicalPrefixMatch(strsrch,
1500 failedoffset, textoffset, status);
1501 if (result != USEARCH_DONE) {
1502 // if status is a failure, ucol_setOffset does nothing
1503 setColEIterOffset(strsrch->textIter, result);
1504 }
1505 if (U_FAILURE(*status)) {
1506 return USEARCH_DONE;
1507 }
1508 return result;
1509 }
1510 }
1511 if (textce == ce[ceindex]) {
1512 ceindex --;
1513 }
1514 }
1515 // set offset here
1516 if (isSafe) {
1517 int32_t result = getColElemIterOffset(coleiter, FALSE);
1518 // sets the text iterator here with the correct expansion and offset
1519 int32_t leftoverces = getExpansionPrefix(coleiter);
1520 cleanUpSafeText(strsrch, safetext, safebuffer);
1521 if (result >= safelength) {
1522 result = textoffset;
1523 }
1524 else {
1525 result += safeoffset;
1526 }
1527 setColEIterOffset(strsrch->textIter, result);
1528 strsrch->textIter->iteratordata_.toReturn =
1529 setExpansionPrefix(strsrch->textIter, leftoverces);
1530 return result;
1531 }
1532
1533 return ucol_getOffset(coleiter);
1534}
1535
1536/**
1537* Trying out the substring and sees if it can be a canonical match.
1538* This will try normalizing the end accents and arranging them into canonical
1539* equivalents and check their corresponding ces with the pattern ce.
1540* Suffix accents in the text will be grouped according to their combining
1541* class and the groups will be mixed and matched to try find the perfect
1542* match with the pattern.
1543* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1544* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1545* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1546* "\u0301\u0325".
1547* step 2: check if any of the generated substrings matches the pattern.
1548* Internal method, status assumed to be success, caller has to check status
1549* before calling this method.
1550* @param strsrch string search data
1551* @param textoffset end offset in the collation element text that ends with
1552* the accents to be rearranged
1553* @param status error status if any
1554* @return TRUE if the match is valid, FALSE otherwise
1555*/
1556static
1557UBool doNextCanonicalMatch(UStringSearch *strsrch,
1558 int32_t textoffset,
1559 UErrorCode *status)
1560{
1561 const UChar *text = strsrch->search->text;
1562 int32_t temp = textoffset;
1563 UTF_BACK_1(text, 0, temp);
1564 if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
1565 UCollationElements *coleiter = strsrch->textIter;
1566 int32_t offset = getColElemIterOffset(coleiter, FALSE);
1567 if (strsrch->pattern.hasPrefixAccents) {
1568 offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
1569 status);
1570 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
1571 setColEIterOffset(coleiter, offset);
1572 return TRUE;
1573 }
1574 }
1575 return FALSE;
1576 }
1577
1578 if (!strsrch->pattern.hasSuffixAccents) {
1579 return FALSE;
1580 }
1581
1582 UChar accents[INITIAL_ARRAY_SIZE_];
1583 // offset to the last base character in substring to search
1584 int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
1585 // normalizing the offensive string
1586 unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
1587 0, accents, INITIAL_ARRAY_SIZE_, status);
1588 // status checked in loop below
1589
1590 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1591 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
1592
374ca955
A
1593 // 2 power n - 1 plus the full set of accents
1594 int32_t count = (2 << (size - 1)) - 1;
b75a7d8f
A
1595 while (U_SUCCESS(*status) && count > 0) {
1596 UChar *rearrange = strsrch->canonicalSuffixAccents;
1597 // copy the base characters
1598 for (int k = 0; k < accentsindex[0]; k ++) {
1599 *rearrange ++ = accents[k];
1600 }
1601 // forming all possible canonical rearrangement by dropping
1602 // sets of accents
1603 for (int i = 0; i <= size - 1; i ++) {
1604 int32_t mask = 1 << (size - i - 1);
1605 if (count & mask) {
1606 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1607 *rearrange ++ = accents[j];
1608 }
1609 }
1610 }
1611 *rearrange = 0;
1612 int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
1613 status);
1614 if (offset != USEARCH_DONE) {
1615 return TRUE; // match found
1616 }
1617 count --;
1618 }
1619 return FALSE;
1620}
1621
1622/**
1623* Gets the previous base character offset depending on the string search
1624* pattern data
1625* @param strsrch string search data
1626* @param textoffset current offset, current character
1627* @return the offset of the next character after this base character or itself
1628* if it is a composed character with accents
1629*/
1630static
1631inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
1632 int32_t textoffset)
1633{
1634 if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
1635 const UChar *text = strsrch->search->text;
1636 int32_t offset = textoffset;
1637 if (getFCD(text, &offset, strsrch->search->textLength) >>
1638 SECOND_LAST_BYTE_SHIFT_) {
1639 return getPreviousBaseOffset(text, textoffset);
1640 }
1641 }
1642 return textoffset;
1643}
1644
1645/**
1646* Checks match for contraction.
1647* If the match ends with a partial contraction we fail.
1648* If the match starts too far off (because of backwards iteration) we try to
1649* chip off the extra characters
1650* Internal method, status assumed to be success, caller has to check status
1651* before calling this method.
1652* @param strsrch string search data
1653* @param start offset of potential match, to be modified if necessary
1654* @param end offset of potential match, to be modified if necessary
1655* @param status output error status if any
1656* @return TRUE if match passes the contraction test, FALSE otherwise
1657*/
1658static
1659UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
1660 int32_t *start,
1661 int32_t *end,
1662 UErrorCode *status)
1663{
1664 UCollationElements *coleiter = strsrch->textIter;
1665 int32_t textlength = strsrch->search->textLength;
1666 int32_t temp = *start;
1667 const UCollator *collator = strsrch->collator;
1668 const UChar *text = strsrch->search->text;
1669 // This part checks if either ends of the match contains potential
1670 // contraction. If so we'll have to iterate through them
374ca955 1671 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
b75a7d8f
A
1672 (*start + 1 < textlength
1673 && ucol_unsafeCP(text[*start + 1], collator))) {
1674 int32_t expansion = getExpansionPrefix(coleiter);
1675 UBool expandflag = expansion > 0;
1676 setColEIterOffset(coleiter, *start);
1677 while (expansion > 0) {
1678 // getting rid of the redundant ce, caused by setOffset.
1679 // since backward contraction/expansion may have extra ces if we
1680 // are in the normalization buffer, hasAccentsBeforeMatch would
1681 // have taken care of it.
1682 // E.g. the character \u01FA will have an expansion of 3, but if
1683 // we are only looking for acute and ring \u030A and \u0301, we'll
1684 // have to skip the first ce in the expansion buffer.
1685 ucol_next(coleiter, status);
374ca955
A
1686 if (U_FAILURE(*status)) {
1687 return FALSE;
1688 }
b75a7d8f
A
1689 if (ucol_getOffset(coleiter) != temp) {
1690 *start = temp;
1691 temp = ucol_getOffset(coleiter);
1692 }
1693 expansion --;
1694 }
1695
374ca955 1696 int32_t *patternce = strsrch->pattern.CE;
b75a7d8f
A
1697 int32_t patterncelength = strsrch->pattern.CELength;
1698 int32_t count = 0;
1699 int32_t textlength = strsrch->search->textLength;
1700 while (count < patterncelength) {
374ca955 1701 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
b75a7d8f
A
1702 // status checked below, note that if status is a failure
1703 // ucol_next returns UCOL_NULLORDER
1704 if (ce == UCOL_IGNORABLE) {
1705 continue;
1706 }
1707 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1708 *start = temp;
1709 temp = ucol_getOffset(coleiter);
1710 }
1711
1712 if (count == 0 && ce != patternce[0]) {
1713 // accents may have extra starting ces, this occurs when a
1714 // pure accent pattern is matched without rearrangement
1715 // text \u0325\u0300 and looking for \u0300
374ca955 1716 int32_t expected = patternce[0];
b75a7d8f
A
1717 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
1718 ce = getCE(strsrch, ucol_next(coleiter, status));
1719 while (U_SUCCESS(*status) && ce != expected &&
1720 ce != UCOL_NULLORDER &&
1721 ucol_getOffset(coleiter) <= *end) {
1722 ce = getCE(strsrch, ucol_next(coleiter, status));
1723 }
1724 }
1725 }
1726 if (U_FAILURE(*status) || ce != patternce[count]) {
1727 (*end) ++;
1728 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1729 return FALSE;
1730 }
1731 count ++;
1732 }
1733 }
1734 return TRUE;
1735}
1736
1737/**
1738* Checks and sets the match information if found.
1739* Checks
1740* <ul>
1741* <li> the potential match does not repeat the previous match
1742* <li> boundaries are correct
1743* <li> potential match does not end in the middle of a contraction
1744* <li> identical matches
1745* <\ul>
1746* Otherwise the offset will be shifted to the next character.
1747* Internal method, status assumed to be success, caller has to check the
1748* status before calling this method.
1749* @param strsrch string search data
1750* @param textoffset offset in the collation element text. the returned value
1751* will be the truncated end offset of the match or the new start
1752* search offset.
1753* @param status output error status if any
1754* @return TRUE if the match is valid, FALSE otherwise
1755*/
1756static
1757inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
1758 int32_t *textoffset,
1759 UErrorCode *status)
1760{
1761 // to ensure that the start and ends are not composite characters
1762 UCollationElements *coleiter = strsrch->textIter;
1763 // if we have a canonical accent match
1764 if ((strsrch->pattern.hasSuffixAccents &&
1765 strsrch->canonicalSuffixAccents[0]) ||
1766 (strsrch->pattern.hasPrefixAccents &&
1767 strsrch->canonicalPrefixAccents[0])) {
1768 strsrch->search->matchedIndex = getPreviousUStringSearchBaseOffset(
1769 strsrch,
1770 ucol_getOffset(coleiter));
1771 strsrch->search->matchedLength = *textoffset -
1772 strsrch->search->matchedIndex;
1773 return TRUE;
1774 }
1775
1776 int32_t start = getColElemIterOffset(coleiter, FALSE);
1777 if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
1778 status) || U_FAILURE(*status)) {
1779 return FALSE;
1780 }
1781
1782 start = getPreviousUStringSearchBaseOffset(strsrch, start);
1783 // this totally matches, however we need to check if it is repeating
1784 if (checkRepeatedMatch(strsrch, start, *textoffset) ||
1785 !isBreakUnit(strsrch, start, *textoffset) ||
1786 !checkIdentical(strsrch, start, *textoffset)) {
1787 (*textoffset) ++;
1788 *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
1789 strsrch->search->textLength);
1790 return FALSE;
1791 }
1792
1793 strsrch->search->matchedIndex = start;
1794 strsrch->search->matchedLength = *textoffset - start;
1795 return TRUE;
1796}
1797
1798/**
1799* Shifting the collation element iterator position forward to prepare for
1800* a preceding match. If the first character is a unsafe character, we'll only
1801* shift by 1 to capture contractions, normalization etc.
1802* Internal method, status assumed to be success, caller has to check status
1803* before calling this method.
1804* @param text strsrch string search data
1805* @param textoffset start text position to do search
1806* @param ce the text ce which failed the match.
1807* @param patternceindex index of the ce within the pattern ce buffer which
1808* failed the match
1809* @return final offset
1810*/
1811static
1812inline int32_t reverseShift(UStringSearch *strsrch,
1813 int32_t textoffset,
374ca955 1814 int32_t ce,
b75a7d8f
A
1815 int32_t patternceindex)
1816{
1817 if (strsrch->search->isOverlap) {
1818 if (textoffset != strsrch->search->textLength) {
1819 textoffset --;
1820 }
1821 else {
1822 textoffset -= strsrch->pattern.defaultShiftSize;
1823 }
1824 }
1825 else {
1826 if (ce != UCOL_NULLORDER) {
1827 int32_t shift = strsrch->pattern.backShift[hash(ce)];
1828
1829 // this is to adjust for characters in the middle of the substring
1830 // for matching that failed.
1831 int32_t adjust = patternceindex;
1832 if (adjust > 1 && shift > adjust) {
1833 shift -= adjust - 1;
1834 }
1835 textoffset -= shift;
1836 }
1837 else {
1838 textoffset -= strsrch->pattern.defaultShiftSize;
1839 }
1840 }
1841 textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
1842 return textoffset;
1843}
1844
1845/**
1846* Checks match for contraction.
1847* If the match starts with a partial contraction we fail.
1848* Internal method, status assumed to be success, caller has to check status
1849* before calling this method.
1850* @param strsrch string search data
1851* @param start offset of potential match, to be modified if necessary
1852* @param end offset of potential match, to be modified if necessary
1853* @param status output error status if any
1854* @return TRUE if match passes the contraction test, FALSE otherwise
1855*/
1856static
1857UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
1858 int32_t *start,
1859 int32_t *end, UErrorCode *status)
1860{
1861 UCollationElements *coleiter = strsrch->textIter;
1862 int32_t textlength = strsrch->search->textLength;
1863 int32_t temp = *end;
1864 const UCollator *collator = strsrch->collator;
1865 const UChar *text = strsrch->search->text;
1866 // This part checks if either if the start of the match contains potential
1867 // contraction. If so we'll have to iterate through them
374ca955
A
1868 // Since we used ucol_next while previously looking for the potential
1869 // match, this guarantees that our end will not be a partial contraction,
1870 // or a partial supplementary character.
b75a7d8f
A
1871 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
1872 int32_t expansion = getExpansionSuffix(coleiter);
1873 UBool expandflag = expansion > 0;
1874 setColEIterOffset(coleiter, *end);
1875 while (U_SUCCESS(*status) && expansion > 0) {
1876 // getting rid of the redundant ce
1877 // since forward contraction/expansion may have extra ces
1878 // if we are in the normalization buffer, hasAccentsBeforeMatch
1879 // would have taken care of it.
1880 // E.g. the character \u01FA will have an expansion of 3, but if
1881 // we are only looking for A ring A\u030A, we'll have to skip the
1882 // last ce in the expansion buffer
1883 ucol_previous(coleiter, status);
374ca955
A
1884 if (U_FAILURE(*status)) {
1885 return FALSE;
1886 }
b75a7d8f
A
1887 if (ucol_getOffset(coleiter) != temp) {
1888 *end = temp;
1889 temp = ucol_getOffset(coleiter);
1890 }
1891 expansion --;
1892 }
1893
374ca955 1894 int32_t *patternce = strsrch->pattern.CE;
b75a7d8f
A
1895 int32_t patterncelength = strsrch->pattern.CELength;
1896 int32_t count = patterncelength;
1897 while (count > 0) {
374ca955 1898 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
b75a7d8f
A
1899 // status checked below, note that if status is a failure
1900 // ucol_previous returns UCOL_NULLORDER
1901 if (ce == UCOL_IGNORABLE) {
1902 continue;
1903 }
1904 if (expandflag && count == 0 &&
1905 getColElemIterOffset(coleiter, FALSE) != temp) {
1906 *end = temp;
1907 temp = ucol_getOffset(coleiter);
1908 }
1909 if (U_FAILURE(*status) || ce != patternce[count - 1]) {
1910 (*start) --;
1911 *start = getPreviousBaseOffset(text, *start);
1912 return FALSE;
1913 }
1914 count --;
1915 }
1916 }
1917 return TRUE;
1918}
1919
1920/**
1921* Checks and sets the match information if found.
1922* Checks
1923* <ul>
1924* <li> the current match does not repeat the last match
1925* <li> boundaries are correct
1926* <li> exact matches has no extra accents
1927* <li> identical matches
1928* <\ul>
1929* Otherwise the offset will be shifted to the preceding character.
1930* Internal method, status assumed to be success, caller has to check status
1931* before calling this method.
1932* @param strsrch string search data
1933* @param collator
1934* @param coleiter collation element iterator
1935* @param text string
1936* @param textoffset offset in the collation element text. the returned value
1937* will be the truncated start offset of the match or the new start
1938* search offset.
1939* @param status output error status if any
1940* @return TRUE if the match is valid, FALSE otherwise
1941*/
1942static
1943inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
1944 int32_t *textoffset,
1945 UErrorCode *status)
1946{
1947 // to ensure that the start and ends are not composite characters
1948 int32_t end = ucol_getOffset(strsrch->textIter);
1949 if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
1950 || U_FAILURE(*status)) {
1951 return FALSE;
1952 }
1953
1954 // this totally matches, however we need to check if it is repeating
1955 // the old match
1956 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
1957 !isBreakUnit(strsrch, *textoffset, end) ||
1958 hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
1959 !checkIdentical(strsrch, *textoffset, end) ||
1960 hasAccentsAfterMatch(strsrch, *textoffset, end)) {
1961 (*textoffset) --;
1962 *textoffset = getPreviousBaseOffset(strsrch->search->text,
1963 *textoffset);
1964 return FALSE;
1965 }
1966 strsrch->search->matchedIndex = *textoffset;
1967 strsrch->search->matchedLength = end - *textoffset;
1968 return TRUE;
1969}
1970
1971/**
1972* Rearranges the end accents to try matching.
1973* Suffix accents in the text will be grouped according to their combining
1974* class and the groups will be mixed and matched to try find the perfect
1975* match with the pattern.
1976* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1977* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1978* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1979* "\u0301\u0325".
1980* step 2: check if any of the generated substrings matches the pattern.
1981* Internal method, status assumed to be success, user has to check status
1982* before calling this method.
1983* @param strsrch string search match
1984* @param start offset of the first base character
1985* @param end start of the last accent set
1986* @param status only error status if any
1987* @return USEARCH_DONE if a match is not found, otherwise return the ending
1988* offset of the match. Note this start includes all following accents.
1989*/
1990static
1991int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
1992 int32_t start,
1993 int32_t end,
1994 UErrorCode *status)
1995{
1996 const UChar *text = strsrch->search->text;
1997 int32_t tempend = end;
1998
1999 UTF_BACK_1(text, 0, tempend);
2000 if (!(getFCD(text, &tempend, strsrch->search->textLength) &
2001 LAST_BYTE_MASK_)) {
2002 // die... failed at a base character
2003 return USEARCH_DONE;
2004 }
2005 end = getNextBaseOffset(text, end, strsrch->search->textLength);
2006
2007 if (U_SUCCESS(*status)) {
2008 UChar accents[INITIAL_ARRAY_SIZE_];
2009 int32_t offset = getPreviousBaseOffset(text, end);
2010 // normalizing the offensive string
2011 unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
2012 INITIAL_ARRAY_SIZE_, status);
2013
2014 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2015 int32_t accentsize = getUnblockedAccentIndex(accents,
2016 accentsindex);
374ca955 2017 int32_t count = (2 << (accentsize - 1)) - 1;
b75a7d8f
A
2018 UChar buffer[INITIAL_ARRAY_SIZE_];
2019 UCollationElements *coleiter = strsrch->utilIter;
2020 while (U_SUCCESS(*status) && count > 0) {
2021 UChar *rearrange = strsrch->canonicalSuffixAccents;
2022 // copy the base characters
2023 for (int k = 0; k < accentsindex[0]; k ++) {
2024 *rearrange ++ = accents[k];
2025 }
2026 // forming all possible canonical rearrangement by dropping
2027 // sets of accents
2028 for (int i = 0; i <= accentsize - 1; i ++) {
2029 int32_t mask = 1 << (accentsize - i - 1);
2030 if (count & mask) {
2031 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2032 *rearrange ++ = accents[j];
2033 }
2034 }
2035 }
2036 *rearrange = 0;
2037 int32_t matchsize = INITIAL_ARRAY_SIZE_;
2038 UChar *match = addToUCharArray(buffer, &matchsize,
2039 strsrch->canonicalPrefixAccents,
2040 strsrch->search->text + start,
2041 offset - start,
2042 strsrch->canonicalSuffixAccents,
2043 status);
2044
2045 // run the collator iterator through this match
2046 // if status is a failure ucol_setText does nothing
2047 ucol_setText(coleiter, match, matchsize, status);
2048 if (U_SUCCESS(*status)) {
2049 if (checkCollationMatch(strsrch, coleiter)) {
2050 if (match != buffer) {
2051 uprv_free(match);
2052 }
2053 return end;
2054 }
2055 }
2056 count --;
2057 }
2058 }
2059 return USEARCH_DONE;
2060}
2061
2062/**
2063* Take the rearranged start accents and tries matching. If match failed at
2064* a seperate following set of accents (seperated from the rearranged on by
2065* at least a base character) then we rearrange the preceding accents and
2066* tries matching again.
2067* We allow skipping of the ends of the accent set if the ces do not match.
2068* However if the failure is found before the accent set, it fails.
2069* Internal method, status assumed to be success, caller has to check status
2070* before calling this method.
2071* @param strsrch string search data
2072* @param textoffset of the ends of the rearranged accent
2073* @param status output error status if any
2074* @return USEARCH_DONE if a match is not found, otherwise return the ending
2075* offset of the match. Note this start includes all following accents.
2076*/
2077static
2078int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
2079 int32_t textoffset,
2080 UErrorCode *status)
2081{
2082 const UChar *text = strsrch->search->text;
2083 const UCollator *collator = strsrch->collator;
2084 int32_t safelength = 0;
2085 UChar *safetext;
2086 int32_t safetextlength;
2087 UChar safebuffer[INITIAL_ARRAY_SIZE_];
2088 int32_t safeoffset = textoffset;
2089
2090 if (textoffset &&
2091 ucol_unsafeCP(strsrch->canonicalPrefixAccents[
2092 u_strlen(strsrch->canonicalPrefixAccents) - 1
2093 ], collator)) {
2094 safeoffset = getNextSafeOffset(collator, text, textoffset,
2095 strsrch->search->textLength);
2096 safelength = safeoffset - textoffset;
2097 safetextlength = INITIAL_ARRAY_SIZE_;
2098 safetext = addToUCharArray(safebuffer, &safetextlength,
2099 strsrch->canonicalPrefixAccents,
2100 text + textoffset, safelength,
2101 NULL, status);
2102 }
2103 else {
2104 safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
2105 safetext = strsrch->canonicalPrefixAccents;
2106 }
2107
2108 UCollationElements *coleiter = strsrch->utilIter;
2109 // if status is a failure, ucol_setText does nothing
2110 ucol_setText(coleiter, safetext, safetextlength, status);
2111 // status checked in loop below
2112
374ca955 2113 int32_t *ce = strsrch->pattern.CE;
b75a7d8f
A
2114 int32_t celength = strsrch->pattern.CELength;
2115 int ceindex = 0;
2116 UBool isSafe = TRUE; // safe zone indication flag for position
2117 int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
2118
2119 while (ceindex < celength) {
374ca955 2120 int32_t textce = ucol_next(coleiter, status);
b75a7d8f
A
2121 if (U_FAILURE(*status)) {
2122 if (isSafe) {
2123 cleanUpSafeText(strsrch, safetext, safebuffer);
2124 }
2125 return USEARCH_DONE;
2126 }
2127 if (textce == UCOL_NULLORDER) {
2128 // check if we have passed the safe buffer
2129 if (coleiter == strsrch->textIter) {
2130 cleanUpSafeText(strsrch, safetext, safebuffer);
2131 return USEARCH_DONE;
2132 }
2133 cleanUpSafeText(strsrch, safetext, safebuffer);
2134 safetext = safebuffer;
2135 coleiter = strsrch->textIter;
2136 setColEIterOffset(coleiter, safeoffset);
2137 // status checked at the start of the loop
2138 isSafe = FALSE;
2139 continue;
2140 }
2141 textce = getCE(strsrch, textce);
2142 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
2143 // do the beginning stuff
2144 int32_t failedoffset = ucol_getOffset(coleiter);
2145 if (isSafe && failedoffset <= prefixlength) {
2146 // alas... no hope. failed at rearranged accent set
2147 cleanUpSafeText(strsrch, safetext, safebuffer);
2148 return USEARCH_DONE;
2149 }
2150 else {
2151 if (isSafe) {
2152 failedoffset = safeoffset - failedoffset;
2153 cleanUpSafeText(strsrch, safetext, safebuffer);
2154 }
2155
2156 // try rearranging the end accents
2157 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
2158 textoffset, failedoffset, status);
2159 if (result != USEARCH_DONE) {
2160 // if status is a failure, ucol_setOffset does nothing
2161 setColEIterOffset(strsrch->textIter, result);
2162 }
2163 if (U_FAILURE(*status)) {
2164 return USEARCH_DONE;
2165 }
2166 return result;
2167 }
2168 }
2169 if (textce == ce[ceindex]) {
2170 ceindex ++;
2171 }
2172 }
2173 // set offset here
2174 if (isSafe) {
2175 int32_t result = ucol_getOffset(coleiter);
2176 // sets the text iterator here with the correct expansion and offset
2177 int32_t leftoverces = getExpansionSuffix(coleiter);
2178 cleanUpSafeText(strsrch, safetext, safebuffer);
2179 if (result <= prefixlength) {
2180 result = textoffset;
2181 }
2182 else {
2183 result = textoffset + (safeoffset - result);
2184 }
2185 setColEIterOffset(strsrch->textIter, result);
2186 setExpansionSuffix(strsrch->textIter, leftoverces);
2187 return result;
2188 }
2189
2190 return ucol_getOffset(coleiter);
2191}
2192
2193/**
2194* Trying out the substring and sees if it can be a canonical match.
2195* This will try normalizing the starting accents and arranging them into
2196* canonical equivalents and check their corresponding ces with the pattern ce.
2197* Prefix accents in the text will be grouped according to their combining
2198* class and the groups will be mixed and matched to try find the perfect
2199* match with the pattern.
2200* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2201* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2202* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2203* "\u0301\u0325".
2204* step 2: check if any of the generated substrings matches the pattern.
2205* Internal method, status assumed to be success, caller has to check status
2206* before calling this method.
2207* @param strsrch string search data
2208* @param textoffset start offset in the collation element text that starts
2209* with the accents to be rearranged
2210* @param status output error status if any
2211* @return TRUE if the match is valid, FALSE otherwise
2212*/
2213static
2214UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
2215 int32_t textoffset,
2216 UErrorCode *status)
2217{
2218 const UChar *text = strsrch->search->text;
2219 int32_t temp = textoffset;
2220 int32_t textlength = strsrch->search->textLength;
2221 if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
2222 UCollationElements *coleiter = strsrch->textIter;
2223 int32_t offset = ucol_getOffset(coleiter);
2224 if (strsrch->pattern.hasSuffixAccents) {
2225 offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
2226 offset, status);
2227 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
2228 setColEIterOffset(coleiter, offset);
2229 return TRUE;
2230 }
2231 }
2232 return FALSE;
2233 }
2234
2235 if (!strsrch->pattern.hasPrefixAccents) {
2236 return FALSE;
2237 }
2238
2239 UChar accents[INITIAL_ARRAY_SIZE_];
2240 // offset to the last base character in substring to search
2241 int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
2242 // normalizing the offensive string
2243 unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
2244 0, accents, INITIAL_ARRAY_SIZE_, status);
2245 // status checked in loop
2246
2247 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2248 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
2249
374ca955
A
2250 // 2 power n - 1 plus the full set of accents
2251 int32_t count = (2 << (size - 1)) - 1;
b75a7d8f
A
2252 while (U_SUCCESS(*status) && count > 0) {
2253 UChar *rearrange = strsrch->canonicalPrefixAccents;
2254 // copy the base characters
2255 for (int k = 0; k < accentsindex[0]; k ++) {
2256 *rearrange ++ = accents[k];
2257 }
2258 // forming all possible canonical rearrangement by dropping
2259 // sets of accents
2260 for (int i = 0; i <= size - 1; i ++) {
2261 int32_t mask = 1 << (size - i - 1);
2262 if (count & mask) {
2263 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2264 *rearrange ++ = accents[j];
2265 }
2266 }
2267 }
2268 *rearrange = 0;
2269 int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
2270 baseoffset, status);
2271 if (offset != USEARCH_DONE) {
2272 return TRUE; // match found
2273 }
2274 count --;
2275 }
2276 return FALSE;
2277}
2278
2279/**
2280* Checks match for contraction.
2281* If the match starts with a partial contraction we fail.
2282* Internal method, status assumed to be success, caller has to check status
2283* before calling this method.
2284* @param strsrch string search data
2285* @param start offset of potential match, to be modified if necessary
2286* @param end offset of potential match, to be modified if necessary
2287* @param status only error status if any
2288* @return TRUE if match passes the contraction test, FALSE otherwise
2289*/
2290static
2291UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
2292 int32_t *start,
2293 int32_t *end, UErrorCode *status)
2294{
2295 UCollationElements *coleiter = strsrch->textIter;
2296 int32_t textlength = strsrch->search->textLength;
2297 int32_t temp = *end;
2298 const UCollator *collator = strsrch->collator;
2299 const UChar *text = strsrch->search->text;
374ca955 2300 // This part checks if either if the start of the match contains potential
b75a7d8f 2301 // contraction. If so we'll have to iterate through them
374ca955
A
2302 // Since we used ucol_next while previously looking for the potential
2303 // match, this guarantees that our end will not be a partial contraction,
2304 // or a partial supplementary character.
b75a7d8f
A
2305 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2306 int32_t expansion = getExpansionSuffix(coleiter);
2307 UBool expandflag = expansion > 0;
2308 setColEIterOffset(coleiter, *end);
2309 while (expansion > 0) {
2310 // getting rid of the redundant ce
2311 // since forward contraction/expansion may have extra ces
2312 // if we are in the normalization buffer, hasAccentsBeforeMatch
2313 // would have taken care of it.
2314 // E.g. the character \u01FA will have an expansion of 3, but if
2315 // we are only looking for A ring A\u030A, we'll have to skip the
2316 // last ce in the expansion buffer
2317 ucol_previous(coleiter, status);
374ca955
A
2318 if (U_FAILURE(*status)) {
2319 return FALSE;
2320 }
b75a7d8f
A
2321 if (ucol_getOffset(coleiter) != temp) {
2322 *end = temp;
2323 temp = ucol_getOffset(coleiter);
2324 }
2325 expansion --;
2326 }
2327
374ca955 2328 int32_t *patternce = strsrch->pattern.CE;
b75a7d8f
A
2329 int32_t patterncelength = strsrch->pattern.CELength;
2330 int32_t count = patterncelength;
2331 while (count > 0) {
374ca955 2332 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
b75a7d8f
A
2333 // status checked below, note that if status is a failure
2334 // ucol_previous returns UCOL_NULLORDER
2335 if (ce == UCOL_IGNORABLE) {
2336 continue;
2337 }
2338 if (expandflag && count == 0 &&
2339 getColElemIterOffset(coleiter, FALSE) != temp) {
2340 *end = temp;
2341 temp = ucol_getOffset(coleiter);
2342 }
2343 if (count == patterncelength &&
2344 ce != patternce[patterncelength - 1]) {
2345 // accents may have extra starting ces, this occurs when a
2346 // pure accent pattern is matched without rearrangement
374ca955 2347 int32_t expected = patternce[patterncelength - 1];
b75a7d8f
A
2348 UTF_BACK_1(text, 0, *end);
2349 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
2350 ce = getCE(strsrch, ucol_previous(coleiter, status));
2351 while (U_SUCCESS(*status) && ce != expected &&
2352 ce != UCOL_NULLORDER &&
2353 ucol_getOffset(coleiter) <= *start) {
2354 ce = getCE(strsrch, ucol_previous(coleiter, status));
2355 }
2356 }
2357 }
2358 if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2359 (*start) --;
2360 *start = getPreviousBaseOffset(text, *start);
2361 return FALSE;
2362 }
2363 count --;
2364 }
2365 }
2366 return TRUE;
2367}
2368
2369/**
2370* Checks and sets the match information if found.
2371* Checks
2372* <ul>
2373* <li> the potential match does not repeat the previous match
2374* <li> boundaries are correct
2375* <li> potential match does not end in the middle of a contraction
2376* <li> identical matches
2377* <\ul>
2378* Otherwise the offset will be shifted to the next character.
2379* Internal method, status assumed to be success, caller has to check status
2380* before calling this method.
2381* @param strsrch string search data
2382* @param textoffset offset in the collation element text. the returned value
2383* will be the truncated start offset of the match or the new start
2384* search offset.
2385* @param status only error status if any
2386* @return TRUE if the match is valid, FALSE otherwise
2387*/
2388static
2389inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
2390 int32_t *textoffset,
2391 UErrorCode *status)
2392{
2393 // to ensure that the start and ends are not composite characters
2394 UCollationElements *coleiter = strsrch->textIter;
2395 // if we have a canonical accent match
2396 if ((strsrch->pattern.hasSuffixAccents &&
2397 strsrch->canonicalSuffixAccents[0]) ||
2398 (strsrch->pattern.hasPrefixAccents &&
2399 strsrch->canonicalPrefixAccents[0])) {
2400 strsrch->search->matchedIndex = *textoffset;
2401 strsrch->search->matchedLength =
2402 getNextUStringSearchBaseOffset(strsrch,
2403 getColElemIterOffset(coleiter, FALSE))
2404 - *textoffset;
2405 return TRUE;
2406 }
2407
2408 int32_t end = ucol_getOffset(coleiter);
2409 if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
2410 status) ||
2411 U_FAILURE(*status)) {
2412 return FALSE;
2413 }
2414
2415 end = getNextUStringSearchBaseOffset(strsrch, end);
2416 // this totally matches, however we need to check if it is repeating
2417 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2418 !isBreakUnit(strsrch, *textoffset, end) ||
2419 !checkIdentical(strsrch, *textoffset, end)) {
2420 (*textoffset) --;
2421 *textoffset = getPreviousBaseOffset(strsrch->search->text,
2422 *textoffset);
2423 return FALSE;
2424 }
2425
2426 strsrch->search->matchedIndex = *textoffset;
2427 strsrch->search->matchedLength = end - *textoffset;
2428 return TRUE;
2429}
2430
2431// constructors and destructor -------------------------------------------
2432
2433U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
2434 int32_t patternlength,
2435 const UChar *text,
2436 int32_t textlength,
2437 const char *locale,
2438 UBreakIterator *breakiter,
2439 UErrorCode *status)
2440{
2441 if (U_FAILURE(*status)) {
2442 return NULL;
2443 }
2444#if UCONFIG_NO_BREAK_ITERATION
2445 if (breakiter != NULL) {
2446 *status = U_UNSUPPORTED_ERROR;
2447 return NULL;
2448 }
2449#endif
2450 if (locale) {
2451 // ucol_open internally checks for status
2452 UCollator *collator = ucol_open(locale, status);
2453 // pattern, text checks are done in usearch_openFromCollator
2454 UStringSearch *result = usearch_openFromCollator(pattern,
2455 patternlength, text, textlength,
2456 collator, breakiter, status);
2457
2458 if (result == NULL || U_FAILURE(*status)) {
2459 if (collator) {
2460 ucol_close(collator);
2461 }
2462 return NULL;
2463 }
2464 else {
2465 result->ownCollator = TRUE;
2466 }
2467 return result;
2468 }
2469 *status = U_ILLEGAL_ARGUMENT_ERROR;
2470 return NULL;
2471}
2472
2473U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
2474 const UChar *pattern,
2475 int32_t patternlength,
2476 const UChar *text,
2477 int32_t textlength,
2478 const UCollator *collator,
2479 UBreakIterator *breakiter,
2480 UErrorCode *status)
2481{
2482 if (U_FAILURE(*status)) {
2483 return NULL;
2484 }
2485#if UCONFIG_NO_BREAK_ITERATION
2486 if (breakiter != NULL) {
2487 *status = U_UNSUPPORTED_ERROR;
2488 return NULL;
2489 }
2490#endif
2491 if (pattern == NULL || text == NULL || collator == NULL) {
2492 *status = U_ILLEGAL_ARGUMENT_ERROR;
73c04bcf 2493 return NULL;
b75a7d8f
A
2494 }
2495
374ca955
A
2496 // string search does not really work when numeric collation is turned on
2497 if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
2498 *status = U_UNSUPPORTED_ERROR;
73c04bcf 2499 return NULL;
374ca955
A
2500 }
2501
b75a7d8f
A
2502 if (U_SUCCESS(*status)) {
2503 initializeFCD(status);
2504 if (U_FAILURE(*status)) {
2505 return NULL;
2506 }
2507
2508 UStringSearch *result;
2509 if (textlength == -1) {
2510 textlength = u_strlen(text);
2511 }
2512 if (patternlength == -1) {
2513 patternlength = u_strlen(pattern);
2514 }
2515 if (textlength <= 0 || patternlength <= 0) {
2516 *status = U_ILLEGAL_ARGUMENT_ERROR;
2517 return NULL;
2518 }
2519
2520 result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
2521 if (result == NULL) {
2522 *status = U_MEMORY_ALLOCATION_ERROR;
2523 return NULL;
2524 }
2525
2526 result->collator = collator;
2527 result->strength = ucol_getStrength(collator);
2528 result->ceMask = getMask(result->strength);
2529 result->toShift =
2530 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2531 UCOL_SHIFTED;
2532 result->variableTop = ucol_getVariableTop(collator, status);
2533
2534 if (U_FAILURE(*status)) {
2535 uprv_free(result);
2536 return NULL;
2537 }
2538
2539 result->search = (USearch *)uprv_malloc(sizeof(USearch));
2540 if (result->search == NULL) {
2541 *status = U_MEMORY_ALLOCATION_ERROR;
2542 uprv_free(result);
2543 return NULL;
2544 }
2545
2546 result->search->text = text;
2547 result->search->textLength = textlength;
2548
2549 result->pattern.text = pattern;
2550 result->pattern.textLength = patternlength;
2551 result->pattern.CE = NULL;
2552
2553 result->search->breakIter = breakiter;
2554#if !UCONFIG_NO_BREAK_ITERATION
2555 if (breakiter) {
2556 ubrk_setText(breakiter, text, textlength, status);
2557 }
2558#endif
2559
2560 result->ownCollator = FALSE;
2561 result->search->matchedLength = 0;
2562 result->search->matchedIndex = USEARCH_DONE;
2563 result->textIter = ucol_openElements(collator, text,
2564 textlength, status);
2565 if (U_FAILURE(*status)) {
2566 usearch_close(result);
2567 return NULL;
2568 }
2569
2570 result->utilIter = NULL;
2571
2572 result->search->isOverlap = FALSE;
2573 result->search->isCanonicalMatch = FALSE;
2574 result->search->isForwardSearching = TRUE;
2575 result->search->reset = TRUE;
2576
2577 initialize(result, status);
2578
2579 if (U_FAILURE(*status)) {
2580 usearch_close(result);
2581 return NULL;
2582 }
2583
2584 return result;
2585 }
2586 return NULL;
2587}
2588
2589U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
2590{
2591 if (strsrch) {
2592 if (strsrch->pattern.CE != strsrch->pattern.CEBuffer &&
2593 strsrch->pattern.CE) {
2594 uprv_free(strsrch->pattern.CE);
2595 }
2596 ucol_closeElements(strsrch->textIter);
2597 ucol_closeElements(strsrch->utilIter);
2598 if (strsrch->ownCollator && strsrch->collator) {
2599 ucol_close((UCollator *)strsrch->collator);
2600 }
2601 uprv_free(strsrch->search);
2602 uprv_free(strsrch);
2603 }
2604}
2605
2606// set and get methods --------------------------------------------------
2607
2608U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
2609 int32_t position,
2610 UErrorCode *status)
2611{
2612 if (U_SUCCESS(*status) && strsrch) {
2613 if (isOutOfBounds(strsrch->search->textLength, position)) {
2614 *status = U_INDEX_OUTOFBOUNDS_ERROR;
2615 }
2616 else {
2617 setColEIterOffset(strsrch->textIter, position);
2618 }
2619 strsrch->search->matchedIndex = USEARCH_DONE;
2620 strsrch->search->matchedLength = 0;
2621 strsrch->search->reset = FALSE;
2622 }
2623}
2624
2625U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
2626{
2627 if (strsrch) {
2628 int32_t result = ucol_getOffset(strsrch->textIter);
2629 if (isOutOfBounds(strsrch->search->textLength, result)) {
2630 return USEARCH_DONE;
2631 }
2632 return result;
2633 }
2634 return USEARCH_DONE;
2635}
2636
2637U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
2638 USearchAttribute attribute,
2639 USearchAttributeValue value,
2640 UErrorCode *status)
2641{
2642 if (U_SUCCESS(*status) && strsrch) {
2643 switch (attribute)
2644 {
2645 case USEARCH_OVERLAP :
2646 strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
2647 break;
2648 case USEARCH_CANONICAL_MATCH :
2649 strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
2650 FALSE);
2651 break;
2652 case USEARCH_ATTRIBUTE_COUNT :
2653 default:
2654 *status = U_ILLEGAL_ARGUMENT_ERROR;
2655 }
2656 }
2657 if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
2658 *status = U_ILLEGAL_ARGUMENT_ERROR;
2659 }
2660}
2661
2662U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
2663 const UStringSearch *strsrch,
2664 USearchAttribute attribute)
2665{
2666 if (strsrch) {
2667 switch (attribute) {
2668 case USEARCH_OVERLAP :
2669 return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
2670 USEARCH_OFF);
2671 case USEARCH_CANONICAL_MATCH :
2672 return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
2673 USEARCH_OFF);
2674 case USEARCH_ATTRIBUTE_COUNT :
2675 return USEARCH_DEFAULT;
2676 }
2677 }
2678 return USEARCH_DEFAULT;
2679}
2680
2681U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
2682 const UStringSearch *strsrch)
2683{
2684 if (strsrch == NULL) {
2685 return USEARCH_DONE;
2686 }
2687 return strsrch->search->matchedIndex;
2688}
2689
2690
2691U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
2692 UChar *result,
2693 int32_t resultCapacity,
2694 UErrorCode *status)
2695{
2696 if (U_FAILURE(*status)) {
2697 return USEARCH_DONE;
2698 }
2699 if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
2700 result == NULL)) {
2701 *status = U_ILLEGAL_ARGUMENT_ERROR;
2702 return USEARCH_DONE;
2703 }
2704
2705 int32_t copylength = strsrch->search->matchedLength;
2706 int32_t copyindex = strsrch->search->matchedIndex;
2707 if (copyindex == USEARCH_DONE) {
2708 u_terminateUChars(result, resultCapacity, 0, status);
2709 return USEARCH_DONE;
2710 }
2711
2712 if (resultCapacity < copylength) {
2713 copylength = resultCapacity;
2714 }
2715 if (copylength > 0) {
2716 uprv_memcpy(result, strsrch->search->text + copyindex,
2717 copylength * sizeof(UChar));
2718 }
2719 return u_terminateUChars(result, resultCapacity,
2720 strsrch->search->matchedLength, status);
2721}
2722
2723U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
2724 const UStringSearch *strsrch)
2725{
2726 if (strsrch) {
2727 return strsrch->search->matchedLength;
2728 }
2729 return USEARCH_DONE;
2730}
2731
2732#if !UCONFIG_NO_BREAK_ITERATION
2733
2734U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch,
2735 UBreakIterator *breakiter,
2736 UErrorCode *status)
2737{
2738 if (U_SUCCESS(*status) && strsrch) {
2739 strsrch->search->breakIter = breakiter;
2740 if (breakiter) {
2741 ubrk_setText(breakiter, strsrch->search->text,
2742 strsrch->search->textLength, status);
2743 }
2744 }
2745}
2746
2747U_CAPI const UBreakIterator* U_EXPORT2
2748usearch_getBreakIterator(const UStringSearch *strsrch)
2749{
2750 if (strsrch) {
2751 return strsrch->search->breakIter;
2752 }
2753 return NULL;
2754}
2755
2756#endif
2757
2758U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch,
2759 const UChar *text,
2760 int32_t textlength,
2761 UErrorCode *status)
2762{
2763 if (U_SUCCESS(*status)) {
2764 if (strsrch == NULL || text == NULL || textlength < -1 ||
2765 textlength == 0) {
2766 *status = U_ILLEGAL_ARGUMENT_ERROR;
2767 }
2768 else {
2769 if (textlength == -1) {
2770 textlength = u_strlen(text);
2771 }
2772 strsrch->search->text = text;
2773 strsrch->search->textLength = textlength;
2774 ucol_setText(strsrch->textIter, text, textlength, status);
2775 strsrch->search->matchedIndex = USEARCH_DONE;
2776 strsrch->search->matchedLength = 0;
2777 strsrch->search->reset = TRUE;
2778#if !UCONFIG_NO_BREAK_ITERATION
374ca955
A
2779 if (strsrch->search->breakIter != NULL) {
2780 ubrk_setText(strsrch->search->breakIter, text,
2781 textlength, status);
2782 }
b75a7d8f
A
2783#endif
2784 }
2785 }
2786}
2787
2788U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
2789 int32_t *length)
2790{
2791 if (strsrch) {
2792 *length = strsrch->search->textLength;
2793 return strsrch->search->text;
2794 }
2795 return NULL;
2796}
2797
2798U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch,
2799 const UCollator *collator,
2800 UErrorCode *status)
2801{
2802 if (U_SUCCESS(*status)) {
2803 if (collator == NULL) {
2804 *status = U_ILLEGAL_ARGUMENT_ERROR;
2805 return;
2806 }
2807 if (strsrch) {
2808 if (strsrch->ownCollator && (strsrch->collator != collator)) {
2809 ucol_close((UCollator *)strsrch->collator);
2810 strsrch->ownCollator = FALSE;
2811 }
2812 strsrch->collator = collator;
2813 strsrch->strength = ucol_getStrength(collator);
2814 strsrch->ceMask = getMask(strsrch->strength);
2815 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
2816 strsrch->toShift =
2817 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2818 UCOL_SHIFTED;
2819 // if status is a failure, ucol_getVariableTop returns 0
2820 strsrch->variableTop = ucol_getVariableTop(collator, status);
2821 if (U_SUCCESS(*status)) {
2822 initialize(strsrch, status);
2823 if (U_SUCCESS(*status)) {
2824 uprv_init_collIterate(collator, strsrch->search->text,
2825 strsrch->search->textLength,
2826 &(strsrch->textIter->iteratordata_));
374ca955 2827 strsrch->utilIter->iteratordata_.coll = collator;
b75a7d8f
A
2828 }
2829 }
2830 }
2831 }
2832}
2833
2834U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
2835{
2836 if (strsrch) {
2837 return (UCollator *)strsrch->collator;
2838 }
2839 return NULL;
2840}
2841
2842U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch,
2843 const UChar *pattern,
2844 int32_t patternlength,
2845 UErrorCode *status)
2846{
2847 if (U_SUCCESS(*status)) {
2848 if (strsrch == NULL || pattern == NULL) {
2849 *status = U_ILLEGAL_ARGUMENT_ERROR;
2850 }
2851 else {
2852 if (patternlength == -1) {
2853 patternlength = u_strlen(pattern);
2854 }
2855 if (patternlength == 0) {
2856 *status = U_ILLEGAL_ARGUMENT_ERROR;
2857 return;
2858 }
2859 strsrch->pattern.text = pattern;
2860 strsrch->pattern.textLength = patternlength;
2861 initialize(strsrch, status);
2862 }
2863 }
2864}
2865
2866U_CAPI const UChar* U_EXPORT2
2867usearch_getPattern(const UStringSearch *strsrch,
2868 int32_t *length)
2869{
2870 if (strsrch) {
2871 *length = strsrch->pattern.textLength;
2872 return strsrch->pattern.text;
2873 }
2874 return NULL;
2875}
2876
2877// miscellaneous methods -------------------------------------------------
2878
2879U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
2880 UErrorCode *status)
2881{
2882 if (strsrch && U_SUCCESS(*status)) {
2883 strsrch->search->isForwardSearching = TRUE;
2884 usearch_setOffset(strsrch, 0, status);
2885 if (U_SUCCESS(*status)) {
2886 return usearch_next(strsrch, status);
2887 }
2888 }
2889 return USEARCH_DONE;
2890}
2891
2892U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
2893 int32_t position,
2894 UErrorCode *status)
2895{
2896 if (strsrch && U_SUCCESS(*status)) {
2897 strsrch->search->isForwardSearching = TRUE;
2898 // position checked in usearch_setOffset
2899 usearch_setOffset(strsrch, position, status);
2900 if (U_SUCCESS(*status)) {
2901 return usearch_next(strsrch, status);
2902 }
2903 }
2904 return USEARCH_DONE;
2905}
2906
2907U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
2908 UErrorCode *status)
2909{
2910 if (strsrch && U_SUCCESS(*status)) {
2911 strsrch->search->isForwardSearching = FALSE;
2912 usearch_setOffset(strsrch, strsrch->search->textLength, status);
2913 if (U_SUCCESS(*status)) {
2914 return usearch_previous(strsrch, status);
2915 }
2916 }
2917 return USEARCH_DONE;
2918}
2919
2920U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
2921 int32_t position,
2922 UErrorCode *status)
2923{
2924 if (strsrch && U_SUCCESS(*status)) {
2925 strsrch->search->isForwardSearching = FALSE;
2926 // position checked in usearch_setOffset
2927 usearch_setOffset(strsrch, position, status);
2928 if (U_SUCCESS(*status)) {
2929 return usearch_previous(strsrch, status);
2930 }
2931 }
2932 return USEARCH_DONE;
2933}
2934
2935/**
2936* If a direction switch is required, we'll count the number of ces till the
2937* beginning of the collation element iterator and iterate forwards that
2938* number of times. This is so that we get to the correct point within the
2939* string to continue the search in. Imagine when we are in the middle of the
2940* normalization buffer when the change in direction is request. arrrgghh....
2941* After searching the offset within the collation element iterator will be
2942* shifted to the start of the match. If a match is not found, the offset would
2943* have been set to the end of the text string in the collation element
2944* iterator.
2945* Okay, here's my take on normalization buffer. The only time when there can
2946* be 2 matches within the same normalization is when the pattern is consists
2947* of all accents. But since the offset returned is from the text string, we
2948* should not confuse the caller by returning the second match within the
2949* same normalization buffer. If we do, the 2 results will have the same match
2950* offsets, and that'll be confusing. I'll return the next match that doesn't
2951* fall within the same normalization buffer. Note this does not affect the
2952* results of matches spanning the text and the normalization buffer.
2953* The position to start searching is taken from the collation element
2954* iterator. Callers of this API would have to set the offset in the collation
2955* element iterator before using this method.
2956*/
2957U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
2958 UErrorCode *status)
2959{
2960 if (U_SUCCESS(*status) && strsrch) {
374ca955
A
2961 // note offset is either equivalent to the start of the previous match
2962 // or is set by the user
2963 int32_t offset = usearch_getOffset(strsrch);
2964 USearch *search = strsrch->search;
2965 search->reset = FALSE;
2966 int32_t textlength = search->textLength;
b75a7d8f 2967 if (search->isForwardSearching) {
374ca955
A
2968 if (offset == textlength
2969 || (!search->isOverlap &&
b75a7d8f 2970 (offset + strsrch->pattern.defaultShiftSize > textlength ||
374ca955
A
2971 (search->matchedIndex != USEARCH_DONE &&
2972 offset + search->matchedLength >= textlength)))) {
b75a7d8f
A
2973 // not enough characters to match
2974 setMatchNotFound(strsrch);
2975 return USEARCH_DONE;
2976 }
2977 }
2978 else {
2979 // switching direction.
2980 // if matchedIndex == USEARCH_DONE, it means that either a
2981 // setOffset has been called or that previous ran off the text
2982 // string. the iterator would have been set to offset 0 if a
2983 // match is not found.
2984 search->isForwardSearching = TRUE;
374ca955 2985 if (search->matchedIndex != USEARCH_DONE) {
b75a7d8f
A
2986 // there's no need to set the collation element iterator
2987 // the next call to next will set the offset.
374ca955 2988 return search->matchedIndex;
b75a7d8f
A
2989 }
2990 }
2991
2992 if (U_SUCCESS(*status)) {
2993 if (strsrch->pattern.CELength == 0) {
374ca955 2994 if (search->matchedIndex == USEARCH_DONE) {
b75a7d8f
A
2995 search->matchedIndex = offset;
2996 }
2997 else { // moves by codepoints
2998 UTF_FWD_1(search->text, search->matchedIndex, textlength);
2999 }
3000
3001 search->matchedLength = 0;
3002 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3003 // status checked below
3004 if (search->matchedIndex == textlength) {
3005 search->matchedIndex = USEARCH_DONE;
3006 }
3007 }
3008 else {
374ca955
A
3009 if (search->matchedLength > 0) {
3010 // if matchlength is 0 we are at the start of the iteration
3011 if (search->isOverlap) {
3012 ucol_setOffset(strsrch->textIter, offset + 1, status);
3013 }
3014 else {
3015 ucol_setOffset(strsrch->textIter,
3016 offset + search->matchedLength, status);
3017 }
3018 }
3019 else {
3020 // for boundary check purposes. this will ensure that the
3021 // next match will not preceed the current offset
3022 // note search->matchedIndex will always be set to something
3023 // in the code
3024 search->matchedIndex = offset - 1;
3025 }
3026
3027 if (search->isCanonicalMatch) {
3028 // can't use exact here since extra accents are allowed.
3029 usearch_handleNextCanonical(strsrch, status);
3030 }
3031 else {
3032 usearch_handleNextExact(strsrch, status);
3033 }
3034 }
3035
b75a7d8f
A
3036 if (U_FAILURE(*status)) {
3037 return USEARCH_DONE;
3038 }
374ca955 3039
b75a7d8f
A
3040 return search->matchedIndex;
3041 }
3042 }
3043 return USEARCH_DONE;
3044}
3045
3046U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
3047 UErrorCode *status)
3048{
3049 if (U_SUCCESS(*status) && strsrch) {
3050 int32_t offset;
3051 USearch *search = strsrch->search;
3052 if (search->reset) {
3053 offset = search->textLength;
3054 search->isForwardSearching = FALSE;
3055 search->reset = FALSE;
3056 setColEIterOffset(strsrch->textIter, offset);
3057 }
3058 else {
3059 offset = usearch_getOffset(strsrch);
3060 }
3061
3062 int32_t matchedindex = search->matchedIndex;
3063 if (search->isForwardSearching == TRUE) {
3064 // switching direction.
3065 // if matchedIndex == USEARCH_DONE, it means that either a
3066 // setOffset has been called or that next ran off the text
3067 // string. the iterator would have been set to offset textLength if
3068 // a match is not found.
3069 search->isForwardSearching = FALSE;
3070 if (matchedindex != USEARCH_DONE) {
3071 return matchedindex;
3072 }
3073 }
3074 else {
3075 if (offset == 0 || matchedindex == 0 ||
3076 (!search->isOverlap &&
3077 (offset < strsrch->pattern.defaultShiftSize ||
3078 (matchedindex != USEARCH_DONE &&
3079 matchedindex < strsrch->pattern.defaultShiftSize)))) {
3080 // not enough characters to match
3081 setMatchNotFound(strsrch);
3082 return USEARCH_DONE;
3083 }
3084 }
3085
3086 if (U_SUCCESS(*status)) {
3087 if (strsrch->pattern.CELength == 0) {
3088 search->matchedIndex =
3089 (matchedindex == USEARCH_DONE ? offset : matchedindex);
3090 if (search->matchedIndex == 0) {
3091 setMatchNotFound(strsrch);
3092 // status checked below
3093 }
3094 else { // move by codepoints
3095 UTF_BACK_1(search->text, 0, search->matchedIndex);
3096 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3097 // status checked below
3098 search->matchedLength = 0;
3099 }
3100 }
3101 else {
3102 if (strsrch->search->isCanonicalMatch) {
3103 // can't use exact here since extra accents are allowed.
3104 usearch_handlePreviousCanonical(strsrch, status);
3105 // status checked below
3106 }
3107 else {
3108 usearch_handlePreviousExact(strsrch, status);
3109 // status checked below
3110 }
3111 }
3112
3113 if (U_FAILURE(*status)) {
3114 return USEARCH_DONE;
3115 }
3116
3117 return search->matchedIndex;
3118 }
3119 }
3120 return USEARCH_DONE;
3121}
3122
3123
3124
3125U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
3126{
3127 /*
3128 reset is setting the attributes that are already in
3129 string search, hence all attributes in the collator should
3130 be retrieved without any problems
3131 */
3132 if (strsrch) {
3133 UErrorCode status = U_ZERO_ERROR;
3134 UBool sameCollAttribute = TRUE;
3135 uint32_t ceMask;
3136 UBool shift;
3137 uint32_t varTop;
3138
3139 strsrch->strength = ucol_getStrength(strsrch->collator);
3140 ceMask = getMask(strsrch->strength);
3141 if (strsrch->ceMask != ceMask) {
3142 strsrch->ceMask = ceMask;
3143 sameCollAttribute = FALSE;
3144 }
3145 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3146 shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
3147 &status) == UCOL_SHIFTED;
3148 if (strsrch->toShift != shift) {
3149 strsrch->toShift = shift;
3150 sameCollAttribute = FALSE;
3151 }
3152
3153 // if status is a failure, ucol_getVariableTop returns 0
3154 varTop = ucol_getVariableTop(strsrch->collator, &status);
3155 if (strsrch->variableTop != varTop) {
3156 strsrch->variableTop = varTop;
3157 sameCollAttribute = FALSE;
3158 }
3159 if (!sameCollAttribute) {
3160 initialize(strsrch, &status);
3161 }
3162 uprv_init_collIterate(strsrch->collator, strsrch->search->text,
3163 strsrch->search->textLength,
3164 &(strsrch->textIter->iteratordata_));
3165 strsrch->search->matchedLength = 0;
3166 strsrch->search->matchedIndex = USEARCH_DONE;
3167 strsrch->search->isOverlap = FALSE;
3168 strsrch->search->isCanonicalMatch = FALSE;
3169 strsrch->search->isForwardSearching = TRUE;
3170 strsrch->search->reset = TRUE;
3171 }
3172}
3173
3174// internal use methods declared in usrchimp.h -----------------------------
3175
3176UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
3177{
3178 if (U_FAILURE(*status)) {
3179 setMatchNotFound(strsrch);
3180 return FALSE;
3181 }
3182
374ca955 3183 UCollationElements *coleiter = strsrch->textIter;
b75a7d8f 3184 int32_t textlength = strsrch->search->textLength;
374ca955 3185 int32_t *patternce = strsrch->pattern.CE;
b75a7d8f
A
3186 int32_t patterncelength = strsrch->pattern.CELength;
3187 int32_t textoffset = ucol_getOffset(coleiter);
3188
374ca955
A
3189 // status used in setting coleiter offset, since offset is checked in
3190 // shiftForward before setting the coleiter offset, status never
3191 // a failure
b75a7d8f
A
3192 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
3193 patterncelength);
3194 while (textoffset <= textlength)
3195 {
3196 uint32_t patternceindex = patterncelength - 1;
374ca955 3197 int32_t targetce;
b75a7d8f 3198 UBool found = FALSE;
374ca955
A
3199 int32_t lastce = UCOL_NULLORDER;
3200
3201 setColEIterOffset(coleiter, textoffset);
3202
b75a7d8f
A
3203 while (TRUE) {
3204 // finding the last pattern ce match, imagine composite characters
3205 // for example: search for pattern A in text \u00C0
3206 // we'll have to skip \u0300 the grave first before we get to A
3207 targetce = ucol_previous(coleiter, status);
3208 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3209 found = FALSE;
3210 break;
3211 }
3212 targetce = getCE(strsrch, targetce);
3213 if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
3214 // this is for the text \u0315\u0300 that requires
3215 // normalization and pattern \u0300, where \u0315 is ignorable
3216 continue;
3217 }
3218 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
3219 lastce = targetce;
3220 }
3221 if (targetce == patternce[patternceindex]) {
3222 // the first ce can be a contraction
3223 found = TRUE;
3224 break;
3225 }
3226 if (!hasExpansion(coleiter)) {
3227 found = FALSE;
3228 break;
3229 }
3230 }
3231
3232 targetce = lastce;
3233
3234 while (found && patternceindex > 0) {
3235 targetce = ucol_previous(coleiter, status);
3236 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3237 found = FALSE;
3238 break;
3239 }
3240 targetce = getCE(strsrch, targetce);
3241 if (targetce == UCOL_IGNORABLE) {
3242 continue;
3243 }
3244
3245 patternceindex --;
3246 found = found && targetce == patternce[patternceindex];
3247 }
3248
3249 if (!found) {
374ca955
A
3250 if (U_FAILURE(*status)) {
3251 break;
3252 }
3253 textoffset = shiftForward(strsrch, textoffset, lastce,
b75a7d8f
A
3254 patternceindex);
3255 // status checked at loop.
3256 patternceindex = patterncelength;
3257 continue;
3258 }
374ca955
A
3259
3260 if (checkNextExactMatch(strsrch, &textoffset, status)) {
b75a7d8f 3261 // status checked in ucol_setOffset
374ca955
A
3262 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
3263 return TRUE;
b75a7d8f
A
3264 }
3265 }
3266 setMatchNotFound(strsrch);
374ca955 3267 return FALSE;
b75a7d8f
A
3268}
3269
3270UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
3271{
3272 if (U_FAILURE(*status)) {
3273 setMatchNotFound(strsrch);
3274 return FALSE;
3275 }
3276
3277 UCollationElements *coleiter = strsrch->textIter;
3278 int32_t textlength = strsrch->search->textLength;
374ca955 3279 int32_t *patternce = strsrch->pattern.CE;
b75a7d8f 3280 int32_t patterncelength = strsrch->pattern.CELength;
374ca955 3281 int32_t textoffset = ucol_getOffset(coleiter);
b75a7d8f
A
3282 UBool hasPatternAccents =
3283 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
3284
3285 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
3286 patterncelength);
3287 strsrch->canonicalPrefixAccents[0] = 0;
3288 strsrch->canonicalSuffixAccents[0] = 0;
3289
3290 while (textoffset <= textlength)
3291 {
3292 int32_t patternceindex = patterncelength - 1;
374ca955 3293 int32_t targetce;
b75a7d8f 3294 UBool found = FALSE;
374ca955
A
3295 int32_t lastce = UCOL_NULLORDER;
3296
3297 setColEIterOffset(coleiter, textoffset);
3298
3299 for (;;) {
b75a7d8f
A
3300 // finding the last pattern ce match, imagine composite characters
3301 // for example: search for pattern A in text \u00C0
3302 // we'll have to skip \u0300 the grave first before we get to A
3303 targetce = ucol_previous(coleiter, status);
3304 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3305 found = FALSE;
3306 break;
3307 }
3308 targetce = getCE(strsrch, targetce);
3309 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
3310 lastce = targetce;
3311 }
3312 if (targetce == patternce[patternceindex]) {
3313 // the first ce can be a contraction
3314 found = TRUE;
3315 break;
3316 }
3317 if (!hasExpansion(coleiter)) {
3318 found = FALSE;
3319 break;
3320 }
3321 }
b75a7d8f
A
3322
3323 while (found && patternceindex > 0) {
3324 targetce = ucol_previous(coleiter, status);
3325 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3326 found = FALSE;
3327 break;
3328 }
3329 targetce = getCE(strsrch, targetce);
3330 if (targetce == UCOL_IGNORABLE) {
3331 continue;
3332 }
3333
3334 patternceindex --;
3335 found = found && targetce == patternce[patternceindex];
3336 }
3337
3338 // initializing the rearranged accent array
3339 if (hasPatternAccents && !found) {
3340 strsrch->canonicalPrefixAccents[0] = 0;
3341 strsrch->canonicalSuffixAccents[0] = 0;
374ca955
A
3342 if (U_FAILURE(*status)) {
3343 break;
3344 }
b75a7d8f
A
3345 found = doNextCanonicalMatch(strsrch, textoffset, status);
3346 }
3347
3348 if (!found) {
374ca955
A
3349 if (U_FAILURE(*status)) {
3350 break;
3351 }
3352 textoffset = shiftForward(strsrch, textoffset, lastce,
b75a7d8f
A
3353 patternceindex);
3354 // status checked at loop
3355 patternceindex = patterncelength;
3356 continue;
3357 }
3358
3359 if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
3360 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
3361 return TRUE;
3362 }
3363 }
3364 setMatchNotFound(strsrch);
3365 return FALSE;
3366}
3367
3368UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
3369{
3370 if (U_FAILURE(*status)) {
3371 setMatchNotFound(strsrch);
3372 return FALSE;
3373 }
3374
3375 UCollationElements *coleiter = strsrch->textIter;
374ca955 3376 int32_t *patternce = strsrch->pattern.CE;
b75a7d8f 3377 int32_t patterncelength = strsrch->pattern.CELength;
374ca955 3378 int32_t textoffset = ucol_getOffset(coleiter);
b75a7d8f
A
3379
3380 // shifting it check for setting offset
3381 // if setOffset is called previously or there was no previous match, we
3382 // leave the offset as it is.
3383 if (strsrch->search->matchedIndex != USEARCH_DONE) {
3384 textoffset = strsrch->search->matchedIndex;
3385 }
3386
3387 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
3388 patterncelength);
3389
3390 while (textoffset >= 0)
3391 {
3392 int32_t patternceindex = 1;
374ca955 3393 int32_t targetce;
b75a7d8f 3394 UBool found = FALSE;
374ca955 3395 int32_t firstce = UCOL_NULLORDER;
b75a7d8f 3396
374ca955 3397 // if status is a failure, ucol_setOffset does nothing
b75a7d8f 3398 setColEIterOffset(coleiter, textoffset);
374ca955
A
3399
3400 for (;;) {
b75a7d8f
A
3401 // finding the first pattern ce match, imagine composite
3402 // characters. for example: search for pattern \u0300 in text
3403 // \u00C0, we'll have to skip A first before we get to
3404 // \u0300 the grave accent
3405 targetce = ucol_next(coleiter, status);
3406 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3407 found = FALSE;
3408 break;
3409 }
3410 targetce = getCE(strsrch, targetce);
3411 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
3412 firstce = targetce;
3413 }
3414 if (targetce == UCOL_IGNORABLE) {
3415 continue;
3416 }
3417 if (targetce == patternce[0]) {
3418 found = TRUE;
3419 break;
3420 }
3421 if (!hasExpansion(coleiter)) {
3422 // checking for accents in composite character
3423 found = FALSE;
3424 break;
3425 }
3426 }
3427
3428 targetce = firstce;
3429
3430 while (found && (patternceindex < patterncelength)) {
3431 targetce = ucol_next(coleiter, status);
3432 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3433 found = FALSE;
3434 break;
3435 }
3436 targetce = getCE(strsrch, targetce);
3437 if (targetce == UCOL_IGNORABLE) {
3438 continue;
3439 }
3440
3441 found = found && targetce == patternce[patternceindex];
3442 patternceindex ++;
3443 }
3444
3445 if (!found) {
374ca955
A
3446 if (U_FAILURE(*status)) {
3447 break;
3448 }
b75a7d8f
A
3449 textoffset = reverseShift(strsrch, textoffset, targetce,
3450 patternceindex);
3451 patternceindex = 0;
3452 continue;
3453 }
3454
3455 if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
3456 setColEIterOffset(coleiter, textoffset);
3457 return TRUE;
3458 }
3459 }
3460 setMatchNotFound(strsrch);
3461 return FALSE;
3462}
3463
3464UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
3465 UErrorCode *status)
3466{
3467 if (U_FAILURE(*status)) {
3468 setMatchNotFound(strsrch);
3469 return FALSE;
3470 }
3471
3472 UCollationElements *coleiter = strsrch->textIter;
374ca955 3473 int32_t *patternce = strsrch->pattern.CE;
b75a7d8f 3474 int32_t patterncelength = strsrch->pattern.CELength;
374ca955 3475 int32_t textoffset = ucol_getOffset(coleiter);
b75a7d8f
A
3476 UBool hasPatternAccents =
3477 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
3478
3479 // shifting it check for setting offset
3480 // if setOffset is called previously or there was no previous match, we
3481 // leave the offset as it is.
3482 if (strsrch->search->matchedIndex != USEARCH_DONE) {
3483 textoffset = strsrch->search->matchedIndex;
3484 }
3485
3486 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
3487 patterncelength);
3488 strsrch->canonicalPrefixAccents[0] = 0;
3489 strsrch->canonicalSuffixAccents[0] = 0;
3490
3491 while (textoffset >= 0)
3492 {
3493 int32_t patternceindex = 1;
374ca955 3494 int32_t targetce;
b75a7d8f 3495 UBool found = FALSE;
374ca955 3496 int32_t firstce = UCOL_NULLORDER;
b75a7d8f
A
3497
3498 setColEIterOffset(coleiter, textoffset);
3499 while (TRUE) {
3500 // finding the first pattern ce match, imagine composite
3501 // characters. for example: search for pattern \u0300 in text
3502 // \u00C0, we'll have to skip A first before we get to
3503 // \u0300 the grave accent
3504 targetce = ucol_next(coleiter, status);
3505 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3506 found = FALSE;
3507 break;
3508 }
3509 targetce = getCE(strsrch, targetce);
3510 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
3511 firstce = targetce;
3512 }
3513
3514 if (targetce == patternce[0]) {
3515 // the first ce can be a contraction
3516 found = TRUE;
3517 break;
3518 }
3519 if (!hasExpansion(coleiter)) {
3520 // checking for accents in composite character
3521 found = FALSE;
3522 break;
3523 }
3524 }
3525
3526 targetce = firstce;
3527
3528 while (found && patternceindex < patterncelength) {
3529 targetce = ucol_next(coleiter, status);
3530 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3531 found = FALSE;
3532 break;
3533 }
3534 targetce = getCE(strsrch, targetce);
3535 if (targetce == UCOL_IGNORABLE) {
3536 continue;
3537 }
3538
3539 found = found && targetce == patternce[patternceindex];
3540 patternceindex ++;
3541 }
3542
3543 // initializing the rearranged accent array
3544 if (hasPatternAccents && !found) {
3545 strsrch->canonicalPrefixAccents[0] = 0;
3546 strsrch->canonicalSuffixAccents[0] = 0;
374ca955 3547 if (U_FAILURE(*status)) {
b75a7d8f
A
3548 break;
3549 }
3550 found = doPreviousCanonicalMatch(strsrch, textoffset, status);
3551 }
3552
3553 if (!found) {
374ca955 3554 if (U_FAILURE(*status)) {
b75a7d8f
A
3555 break;
3556 }
3557 textoffset = reverseShift(strsrch, textoffset, targetce,
3558 patternceindex);
3559 patternceindex = 0;
3560 continue;
3561 }
3562
3563 if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
3564 setColEIterOffset(coleiter, textoffset);
3565 return TRUE;
3566 }
3567 }
3568 setMatchNotFound(strsrch);
3569 return FALSE;
3570}
3571
3572#endif /* #if !UCONFIG_NO_COLLATION */