2 **********************************************************************
3 * Copyright (C) 2001-2011 IBM and others. All rights reserved.
4 **********************************************************************
5 * Date Name Description
6 * 07/02/2001 synwee Creation.
7 **********************************************************************
10 #include "unicode/utypes.h"
12 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
14 #include "unicode/usearch.h"
15 #include "unicode/ustring.h"
16 #include "unicode/uchar.h"
17 #include "unicode/utf16.h"
18 #include "normalizer2impl.h"
28 // don't use Boyer-Moore
29 // (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
32 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
34 // internal definition ---------------------------------------------------
36 #define LAST_BYTE_MASK_ 0xFF
37 #define SECOND_LAST_BYTE_SHIFT_ 8
38 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
40 static const Normalizer2Impl
*g_nfcImpl
= NULL
;
42 // internal methods -------------------------------------------------
45 * Fast collation element iterator setOffset.
46 * This function does not check for bounds.
47 * @param coleiter collation element iterator
48 * @param offset to set
51 inline void setColEIterOffset(UCollationElements
*elems
,
54 collIterate
*ci
= &(elems
->iteratordata_
);
55 ci
->pos
= ci
->string
+ offset
;
56 ci
->CEpos
= ci
->toReturn
= ci
->extendCEs
? ci
->extendCEs
: ci
->CEs
;
57 if (ci
->flags
& UCOL_ITER_INNORMBUF
) {
58 ci
->flags
= ci
->origFlags
;
60 ci
->fcdPosition
= NULL
;
62 ci
->offsetReturn
= NULL
;
63 ci
->offsetStore
= ci
->offsetBuffer
;
64 ci
->offsetRepeatCount
= ci
->offsetRepeatValue
= 0;
68 * Getting the mask for collation strength
69 * @param strength collation strength
70 * @return collation element mask
73 inline uint32_t getMask(UCollationStrength strength
)
78 return UCOL_PRIMARYORDERMASK
;
80 return UCOL_SECONDARYORDERMASK
| UCOL_PRIMARYORDERMASK
;
82 return UCOL_TERTIARYORDERMASK
| UCOL_SECONDARYORDERMASK
|
83 UCOL_PRIMARYORDERMASK
;
88 * This is to squeeze the 21bit ces into a 256 table
89 * @param ce collation element
90 * @return collapsed version of the collation element
93 inline int hash(uint32_t ce
)
95 // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
96 // well with the new collation where most of the latin 1 characters
97 // are of the value xx000xxx. their hashes will most of the time be 0
98 // to be discussed on the hash algo.
99 return UCOL_PRIMARYORDER(ce
) % MAX_TABLE_SIZE_
;
103 static UBool U_CALLCONV
104 usearch_cleanup(void) {
111 * Initializing the fcd tables.
112 * Internal method, status assumed to be a success.
113 * @param status output error if any, caller to check status before calling
114 * method, status assumed to be success when passed in.
117 inline void initializeFCD(UErrorCode
*status
)
119 if (g_nfcImpl
== NULL
) {
120 g_nfcImpl
= Normalizer2Factory::getNFCImpl(*status
);
121 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH
, usearch_cleanup
);
126 * Gets the fcd value for a character at the argument index.
127 * This method takes into accounts of the supplementary characters.
128 * @param str UTF16 string where character for fcd retrieval resides
129 * @param offset position of the character whose fcd is to be retrieved, to be
130 * overwritten with the next character position, taking
131 * surrogate characters into consideration.
132 * @param strlength length of the argument string
136 uint16_t getFCD(const UChar
*str
, int32_t *offset
,
139 const UChar
*temp
= str
+ *offset
;
140 uint16_t result
= g_nfcImpl
->nextFCD16(temp
, str
+ strlength
);
141 *offset
= (int32_t)(temp
- str
);
146 * Getting the modified collation elements taking into account the collation
148 * @param strsrch string search data
150 * @return the modified collation element
153 inline int32_t getCE(const UStringSearch
*strsrch
, uint32_t sourcece
)
155 // note for tertiary we can't use the collator->tertiaryMask, that
156 // is a preprocessed mask that takes into account case options. since
157 // we are only concerned with exact matches, we don't need that.
158 sourcece
&= strsrch
->ceMask
;
160 if (strsrch
->toShift
) {
161 // alternate handling here, since only the 16 most significant digits
162 // is only used, we can safely do a compare without masking
163 // if the ce is a variable, we mask and get only the primary values
164 // no shifting to quartenary is required since all primary values
165 // less than variabletop will need to be masked off anyway.
166 if (strsrch
->variableTop
> sourcece
) {
167 if (strsrch
->strength
>= UCOL_QUATERNARY
) {
168 sourcece
&= UCOL_PRIMARYORDERMASK
;
171 sourcece
= UCOL_IGNORABLE
;
174 } else if (strsrch
->strength
>= UCOL_QUATERNARY
&& sourcece
== UCOL_IGNORABLE
) {
182 * Allocate a memory and returns NULL if it failed.
183 * Internal method, status assumed to be a success.
184 * @param size to allocate
185 * @param status output error if any, caller to check status before calling
186 * method, status assumed to be success when passed in.
187 * @return newly allocated array, NULL otherwise
190 inline void * allocateMemory(uint32_t size
, UErrorCode
*status
)
192 uint32_t *result
= (uint32_t *)uprv_malloc(size
);
193 if (result
== NULL
) {
194 *status
= U_MEMORY_ALLOCATION_ERROR
;
200 * Adds a uint32_t value to a destination array.
201 * Creates a new array if we run out of space. The caller will have to
202 * manually deallocate the newly allocated array.
203 * Internal method, status assumed to be success, caller has to check status
204 * before calling this method. destination not to be NULL and has at least
205 * size destinationlength.
206 * @param destination target array
207 * @param offset destination offset to add value
208 * @param destinationlength target array size, return value for the new size
209 * @param value to be added
210 * @param increments incremental size expected
211 * @param status output error if any, caller to check status before calling
212 * method, status assumed to be success when passed in.
213 * @return new destination array, destination if there was no new allocation
216 inline int32_t * addTouint32_tArray(int32_t *destination
,
218 uint32_t *destinationlength
,
223 uint32_t newlength
= *destinationlength
;
224 if (offset
+ 1 == newlength
) {
225 newlength
+= increments
;
226 int32_t *temp
= (int32_t *)allocateMemory(
227 sizeof(int32_t) * newlength
, status
);
228 if (U_FAILURE(*status
)) {
231 uprv_memcpy(temp
, destination
, sizeof(int32_t) * offset
);
232 *destinationlength
= newlength
;
235 destination
[offset
] = value
;
240 * Adds a uint64_t value to a destination array.
241 * Creates a new array if we run out of space. The caller will have to
242 * manually deallocate the newly allocated array.
243 * Internal method, status assumed to be success, caller has to check status
244 * before calling this method. destination not to be NULL and has at least
245 * size destinationlength.
246 * @param destination target array
247 * @param offset destination offset to add value
248 * @param destinationlength target array size, return value for the new size
249 * @param value to be added
250 * @param increments incremental size expected
251 * @param status output error if any, caller to check status before calling
252 * method, status assumed to be success when passed in.
253 * @return new destination array, destination if there was no new allocation
256 inline int64_t * addTouint64_tArray(int64_t *destination
,
258 uint32_t *destinationlength
,
263 uint32_t newlength
= *destinationlength
;
264 if (offset
+ 1 == newlength
) {
265 newlength
+= increments
;
266 int64_t *temp
= (int64_t *)allocateMemory(
267 sizeof(int64_t) * newlength
, status
);
269 if (U_FAILURE(*status
)) {
273 uprv_memcpy(temp
, destination
, sizeof(int64_t) * offset
);
274 *destinationlength
= newlength
;
278 destination
[offset
] = value
;
284 * Initializing the ce table for a pattern.
285 * Stores non-ignorable collation keys.
286 * Table size will be estimated by the size of the pattern text. Table
287 * expansion will be perform as we go along. Adding 1 to ensure that the table
288 * size definitely increases.
289 * Internal method, status assumed to be a success.
290 * @param strsrch string search data
291 * @param status output error if any, caller to check status before calling
292 * method, status assumed to be success when passed in.
293 * @return total number of expansions
296 inline uint16_t initializePatternCETable(UStringSearch
*strsrch
,
299 UPattern
*pattern
= &(strsrch
->pattern
);
300 uint32_t cetablesize
= INITIAL_ARRAY_SIZE_
;
301 int32_t *cetable
= pattern
->CEBuffer
;
302 uint32_t patternlength
= pattern
->textLength
;
303 UCollationElements
*coleiter
= strsrch
->utilIter
;
305 if (coleiter
== NULL
) {
306 coleiter
= ucol_openElements(strsrch
->collator
, pattern
->text
,
307 patternlength
, status
);
308 // status will be checked in ucol_next(..) later and if it is an
309 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
311 strsrch
->utilIter
= coleiter
;
314 uprv_init_collIterate(strsrch
->collator
, pattern
->text
,
316 &coleiter
->iteratordata_
,
319 if(U_FAILURE(*status
)) {
323 if (pattern
->CE
!= cetable
&& pattern
->CE
) {
324 uprv_free(pattern
->CE
);
331 while ((ce
= ucol_next(coleiter
, status
)) != UCOL_NULLORDER
&&
332 U_SUCCESS(*status
)) {
333 uint32_t newce
= getCE(strsrch
, ce
);
335 int32_t *temp
= addTouint32_tArray(cetable
, offset
, &cetablesize
,
337 patternlength
- ucol_getOffset(coleiter
) + 1,
339 if (U_FAILURE(*status
)) {
343 if (cetable
!= temp
&& cetable
!= pattern
->CEBuffer
) {
348 result
+= (uint16_t)(ucol_getMaxExpansion(coleiter
, ce
) - 1);
352 pattern
->CE
= cetable
;
353 pattern
->CELength
= offset
;
359 * Initializing the pce table for a pattern.
360 * Stores non-ignorable collation keys.
361 * Table size will be estimated by the size of the pattern text. Table
362 * expansion will be perform as we go along. Adding 1 to ensure that the table
363 * size definitely increases.
364 * Internal method, status assumed to be a success.
365 * @param strsrch string search data
366 * @param status output error if any, caller to check status before calling
367 * method, status assumed to be success when passed in.
368 * @return total number of expansions
371 inline uint16_t initializePatternPCETable(UStringSearch
*strsrch
,
374 UPattern
*pattern
= &(strsrch
->pattern
);
375 uint32_t pcetablesize
= INITIAL_ARRAY_SIZE_
;
376 int64_t *pcetable
= pattern
->PCEBuffer
;
377 uint32_t patternlength
= pattern
->textLength
;
378 UCollationElements
*coleiter
= strsrch
->utilIter
;
380 if (coleiter
== NULL
) {
381 coleiter
= ucol_openElements(strsrch
->collator
, pattern
->text
,
382 patternlength
, status
);
383 // status will be checked in ucol_next(..) later and if it is an
384 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
386 strsrch
->utilIter
= coleiter
;
388 uprv_init_collIterate(strsrch
->collator
, pattern
->text
,
390 &coleiter
->iteratordata_
,
393 if(U_FAILURE(*status
)) {
397 if (pattern
->PCE
!= pcetable
&& pattern
->PCE
!= NULL
) {
398 uprv_free(pattern
->PCE
);
405 uprv_init_pce(coleiter
);
407 // ** Should processed CEs be signed or unsigned?
408 // ** (the rest of the code in this file seems to play fast-and-loose with
409 // ** whether a CE is signed or unsigned. For example, look at routine above this one.)
410 while ((pce
= ucol_nextProcessed(coleiter
, NULL
, NULL
, status
)) != UCOL_PROCESSED_NULLORDER
&&
411 U_SUCCESS(*status
)) {
412 int64_t *temp
= addTouint64_tArray(pcetable
, offset
, &pcetablesize
,
414 patternlength
- ucol_getOffset(coleiter
) + 1,
417 if (U_FAILURE(*status
)) {
423 if (pcetable
!= temp
&& pcetable
!= pattern
->PCEBuffer
) {
428 //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
431 pcetable
[offset
] = 0;
432 pattern
->PCE
= pcetable
;
433 pattern
->PCELength
= offset
;
439 * Initializes the pattern struct.
440 * Internal method, status assumed to be success.
441 * @param strsrch UStringSearch data storage
442 * @param status output error if any, caller to check status before calling
443 * method, status assumed to be success when passed in.
444 * @return expansionsize the total expansion size of the pattern
447 inline int16_t initializePattern(UStringSearch
*strsrch
, UErrorCode
*status
)
449 UPattern
*pattern
= &(strsrch
->pattern
);
450 const UChar
*patterntext
= pattern
->text
;
451 int32_t length
= pattern
->textLength
;
454 // Since the strength is primary, accents are ignored in the pattern.
455 if (strsrch
->strength
== UCOL_PRIMARY
) {
456 pattern
->hasPrefixAccents
= 0;
457 pattern
->hasSuffixAccents
= 0;
459 pattern
->hasPrefixAccents
= getFCD(patterntext
, &index
, length
) >>
460 SECOND_LAST_BYTE_SHIFT_
;
462 U16_BACK_1(patterntext
, 0, index
);
463 pattern
->hasSuffixAccents
= getFCD(patterntext
, &index
, length
) &
468 if (strsrch
->pattern
.PCE
!= NULL
) {
469 if (strsrch
->pattern
.PCE
!= strsrch
->pattern
.PCEBuffer
) {
470 uprv_free(strsrch
->pattern
.PCE
);
473 strsrch
->pattern
.PCE
= NULL
;
476 // since intializePattern is an internal method status is a success.
477 return initializePatternCETable(strsrch
, status
);
481 * Initializing shift tables, with the default values.
482 * If a corresponding default value is 0, the shift table is not set.
483 * @param shift table for forwards shift
484 * @param backshift table for backwards shift
485 * @param cetable table containing pattern ce
486 * @param cesize size of the pattern ces
487 * @param expansionsize total size of the expansions
488 * @param defaultforward the default forward value
489 * @param defaultbackward the default backward value
492 inline void setShiftTable(int16_t shift
[], int16_t backshift
[],
493 int32_t *cetable
, int32_t cesize
,
494 int16_t expansionsize
,
495 int16_t defaultforward
,
496 int16_t defaultbackward
)
498 // estimate the value to shift. to do that we estimate the smallest
499 // number of characters to give the relevant ces, ie approximately
500 // the number of ces minus their expansion, since expansions can come
503 for (count
= 0; count
< MAX_TABLE_SIZE_
; count
++) {
504 shift
[count
] = defaultforward
;
506 cesize
--; // down to the last index
507 for (count
= 0; count
< cesize
; count
++) {
508 // number of ces from right of array to the count
509 int temp
= defaultforward
- count
- 1;
510 shift
[hash(cetable
[count
])] = temp
> 1 ? temp
: 1;
512 shift
[hash(cetable
[cesize
])] = 1;
513 // for ignorables we just shift by one. see test examples.
516 for (count
= 0; count
< MAX_TABLE_SIZE_
; count
++) {
517 backshift
[count
] = defaultbackward
;
519 for (count
= cesize
; count
> 0; count
--) {
520 // the original value count does not seem to work
521 backshift
[hash(cetable
[count
])] = count
> expansionsize
?
522 (int16_t)(count
- expansionsize
) : 1;
524 backshift
[hash(cetable
[0])] = 1;
525 backshift
[hash(0)] = 1;
529 * Building of the pattern collation element list and the boyer moore strsrch
531 * The canonical match will only be performed after the default match fails.
532 * For both cases we need to remember the size of the composed and decomposed
533 * versions of the string. Since the Boyer-Moore shift calculations shifts by
534 * a number of characters in the text and tries to match the pattern from that
535 * offset, the shift value can not be too large in case we miss some
536 * characters. To choose a right shift size, we estimate the NFC form of the
537 * and use its size as a shift guide. The NFC form should be the small
538 * possible representation of the pattern. Anyways, we'll err on the smaller
539 * shift size. Hence the calculation for minlength.
540 * Canonical match will be performed slightly differently. We'll split the
541 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
542 * the first and last base character (MS), the ending accents (EA). Matches
543 * will be done on MS first, and only when we match MS then some processing
544 * will be required for the prefix and end accents in order to determine if
545 * they match PA and EA. Hence the default shift values
546 * for the canonical match will take the size of either end's accent into
547 * consideration. Forwards search will take the end accents into consideration
548 * for the default shift values and the backwards search will take the prefix
549 * accents into consideration.
550 * If pattern has no non-ignorable ce, we return a illegal argument error.
551 * Internal method, status assumed to be success.
552 * @param strsrch UStringSearch data storage
553 * @param status for output errors if it occurs, status is assumed to be a
554 * success when it is passed in.
557 inline void initialize(UStringSearch
*strsrch
, UErrorCode
*status
)
559 int16_t expandlength
= initializePattern(strsrch
, status
);
560 if (U_SUCCESS(*status
) && strsrch
->pattern
.CELength
> 0) {
561 UPattern
*pattern
= &strsrch
->pattern
;
562 int32_t cesize
= pattern
->CELength
;
564 int16_t minlength
= cesize
> expandlength
565 ? (int16_t)cesize
- expandlength
: 1;
566 pattern
->defaultShiftSize
= minlength
;
567 setShiftTable(pattern
->shift
, pattern
->backShift
, pattern
->CE
,
568 cesize
, expandlength
, minlength
, minlength
);
571 strsrch
->pattern
.defaultShiftSize
= 0;
576 * Check to make sure that the match length is at the end of the character by
577 * using the breakiterator.
578 * @param strsrch string search data
579 * @param start target text start offset
580 * @param end target text end offset
583 void checkBreakBoundary(const UStringSearch
*strsrch
, int32_t * /*start*/,
586 #if !UCONFIG_NO_BREAK_ITERATION
587 UBreakIterator
*breakiterator
= strsrch
->search
->internalBreakIter
;
589 int32_t matchend
= *end
;
590 //int32_t matchstart = *start;
592 if (!ubrk_isBoundary(breakiterator
, matchend
)) {
593 *end
= ubrk_following(breakiterator
, matchend
);
596 /* Check the start of the matched text to make sure it doesn't have any accents
597 * before it. This code may not be necessary and so it is commented out */
598 /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
599 *start = ubrk_preceding(breakiterator, matchstart);
606 * Determine whether the target text in UStringSearch bounded by the offset
607 * start and end is one or more whole units of text as
608 * determined by the breakiterator in UStringSearch.
609 * @param strsrch string search data
610 * @param start target text start offset
611 * @param end target text end offset
614 UBool
isBreakUnit(const UStringSearch
*strsrch
, int32_t start
,
617 #if !UCONFIG_NO_BREAK_ITERATION
618 UBreakIterator
*breakiterator
= strsrch
->search
->breakIter
;
621 int32_t startindex
= ubrk_first(breakiterator
);
622 int32_t endindex
= ubrk_last(breakiterator
);
624 // out-of-range indexes are never boundary positions
625 if (start
< startindex
|| start
> endindex
||
626 end
< startindex
|| end
> endindex
) {
629 // otherwise, we can use following() on the position before the
630 // specified one and return true of the position we get back is the
631 // one the user specified
632 UBool result
= (start
== startindex
||
633 ubrk_following(breakiterator
, start
- 1) == start
) &&
635 ubrk_following(breakiterator
, end
- 1) == end
);
637 // iterates the individual ces
638 UCollationElements
*coleiter
= strsrch
->utilIter
;
639 const UChar
*text
= strsrch
->search
->text
+
641 UErrorCode status
= U_ZERO_ERROR
;
642 ucol_setText(coleiter
, text
, end
- start
, &status
);
643 for (int32_t count
= 0; count
< strsrch
->pattern
.CELength
;
645 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
646 if (ce
== UCOL_IGNORABLE
) {
650 if (U_FAILURE(status
) || ce
!= strsrch
->pattern
.CE
[count
]) {
654 int32_t nextce
= ucol_next(coleiter
, &status
);
655 while (ucol_getOffset(coleiter
) == (end
- start
)
656 && getCE(strsrch
, nextce
) == UCOL_IGNORABLE
) {
657 nextce
= ucol_next(coleiter
, &status
);
659 if (ucol_getOffset(coleiter
) == (end
- start
)
660 && nextce
!= UCOL_NULLORDER
) {
661 // extra collation elements at the end of the match
672 * Getting the next base character offset if current offset is an accent,
673 * or the current offset if the current character contains a base character.
674 * accents the following base character will be returned
676 * @param textoffset current offset
677 * @param textlength length of text string
678 * @return the next base character or the current offset
679 * if the current character is contains a base character.
682 inline int32_t getNextBaseOffset(const UChar
*text
,
686 if (textoffset
< textlength
) {
687 int32_t temp
= textoffset
;
688 if (getFCD(text
, &temp
, textlength
) >> SECOND_LAST_BYTE_SHIFT_
) {
689 while (temp
< textlength
) {
690 int32_t result
= temp
;
691 if ((getFCD(text
, &temp
, textlength
) >>
692 SECOND_LAST_BYTE_SHIFT_
) == 0) {
703 * Gets the next base character offset depending on the string search pattern
705 * @param strsrch string search data
706 * @param textoffset current offset, one offset away from the last character
708 * @return start index of the next base character or the current offset
709 * if the current character is contains a base character.
712 inline int32_t getNextUStringSearchBaseOffset(UStringSearch
*strsrch
,
715 int32_t textlength
= strsrch
->search
->textLength
;
716 if (strsrch
->pattern
.hasSuffixAccents
&&
717 textoffset
< textlength
) {
718 int32_t temp
= textoffset
;
719 const UChar
*text
= strsrch
->search
->text
;
720 U16_BACK_1(text
, 0, temp
);
721 if (getFCD(text
, &temp
, textlength
) & LAST_BYTE_MASK_
) {
722 return getNextBaseOffset(text
, textoffset
, textlength
);
729 * Shifting the collation element iterator position forward to prepare for
730 * a following match. If the last character is a unsafe character, we'll only
731 * shift by 1 to capture contractions, normalization etc.
732 * Internal method, status assumed to be success.
733 * @param text strsrch string search data
734 * @param textoffset start text position to do search
735 * @param ce the text ce which failed the match.
736 * @param patternceindex index of the ce within the pattern ce buffer which
738 * @return final offset
741 inline int32_t shiftForward(UStringSearch
*strsrch
,
744 int32_t patternceindex
)
746 UPattern
*pattern
= &(strsrch
->pattern
);
747 if (ce
!= UCOL_NULLORDER
) {
748 int32_t shift
= pattern
->shift
[hash(ce
)];
749 // this is to adjust for characters in the middle of the
750 // substring for matching that failed.
751 int32_t adjust
= pattern
->CELength
- patternceindex
;
752 if (adjust
> 1 && shift
>= adjust
) {
758 textoffset
+= pattern
->defaultShiftSize
;
761 textoffset
= getNextUStringSearchBaseOffset(strsrch
, textoffset
);
762 // check for unsafe characters
763 // * if it is the start or middle of a contraction: to be done after
764 // a initial match is found
765 // * thai or lao base consonant character: similar to contraction
766 // * high surrogate character: similar to contraction
767 // * next character is a accent: shift to the next base character
770 #endif // #if BOYER_MOORE
773 * sets match not found
774 * @param strsrch string search data
777 inline void setMatchNotFound(UStringSearch
*strsrch
)
779 // this method resets the match result regardless of the error status.
780 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
781 strsrch
->search
->matchedLength
= 0;
782 if (strsrch
->search
->isForwardSearching
) {
783 setColEIterOffset(strsrch
->textIter
, strsrch
->search
->textLength
);
786 setColEIterOffset(strsrch
->textIter
, 0);
792 * Gets the offset to the next safe point in text.
793 * ie. not the middle of a contraction, swappable characters or supplementary
795 * @param collator collation sata
796 * @param text string to work with
797 * @param textoffset offset in string
798 * @param textlength length of text string
799 * @return offset to the next safe character
802 inline int32_t getNextSafeOffset(const UCollator
*collator
,
807 int32_t result
= textoffset
; // first contraction character
808 while (result
!= textlength
&& ucol_unsafeCP(text
[result
], collator
)) {
815 * This checks for accents in the potential match started with a .
816 * composite character.
817 * This is really painful... we have to check that composite character do not
818 * have any extra accents. We have to normalize the potential match and find
819 * the immediate decomposed character before the match.
820 * The first composite character would have been taken care of by the fcd
821 * checks in checkForwardExactMatch.
822 * This is the slow path after the fcd of the first character and
823 * the last character has been checked by checkForwardExactMatch and we
824 * determine that the potential match has extra non-ignorable preceding
826 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
827 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
828 * Note here that accents checking are slow and cautioned in the API docs.
829 * Internal method, status assumed to be a success, caller should check status
830 * before calling this method
831 * @param strsrch string search data
832 * @param start index of the potential unfriendly composite character
833 * @param end index of the potential unfriendly composite character
834 * @param status output error status if any.
835 * @return TRUE if there is non-ignorable accents before at the beginning
836 * of the match, FALSE otherwise.
840 UBool
checkExtraMatchAccents(const UStringSearch
*strsrch
, int32_t start
,
844 UBool result
= FALSE
;
845 if (strsrch
->pattern
.hasPrefixAccents
) {
846 int32_t length
= end
- start
;
848 const UChar
*text
= strsrch
->search
->text
+ start
;
850 U16_FWD_1(text
, offset
, length
);
851 // we are only concerned with the first composite character
852 if (unorm_quickCheck(text
, offset
, UNORM_NFD
, status
) == UNORM_NO
) {
853 int32_t safeoffset
= getNextSafeOffset(strsrch
->collator
,
855 if (safeoffset
!= length
) {
859 UChar buffer
[INITIAL_ARRAY_SIZE_
];
860 int32_t size
= unorm_normalize(text
, safeoffset
, UNORM_NFD
, 0,
861 buffer
, INITIAL_ARRAY_SIZE_
,
863 if (U_FAILURE(*status
)) {
866 if (size
>= INITIAL_ARRAY_SIZE_
) {
867 norm
= (UChar
*)allocateMemory((size
+ 1) * sizeof(UChar
),
869 // if allocation failed, status will be set to
870 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
872 size
= unorm_normalize(text
, safeoffset
, UNORM_NFD
, 0, norm
,
874 if (U_FAILURE(*status
) && norm
!= NULL
) {
883 UCollationElements
*coleiter
= strsrch
->utilIter
;
884 ucol_setText(coleiter
, norm
, size
, status
);
885 uint32_t firstce
= strsrch
->pattern
.CE
[0];
886 UBool ignorable
= TRUE
;
887 uint32_t ce
= UCOL_IGNORABLE
;
888 while (U_SUCCESS(*status
) && ce
!= firstce
&& ce
!= (uint32_t)UCOL_NULLORDER
) {
889 offset
= ucol_getOffset(coleiter
);
890 if (ce
!= firstce
&& ce
!= UCOL_IGNORABLE
) {
893 ce
= ucol_next(coleiter
, status
);
896 U16_PREV(norm
, 0, offset
, codepoint
);
897 result
= !ignorable
&& (u_getCombiningClass(codepoint
) != 0);
899 if (norm
!= buffer
) {
909 * Used by exact matches, checks if there are accents before the match.
910 * This is really painful... we have to check that composite characters at
911 * the start of the matches have to not have any extra accents.
912 * We check the FCD of the character first, if it starts with an accent and
913 * the first pattern ce does not match the first ce of the character, we bail.
914 * Otherwise we try normalizing the first composite
915 * character and find the immediate decomposed character before the match to
916 * see if it is an non-ignorable accent.
917 * Now normalizing the first composite character is enough because we ensure
918 * that when the match is passed in here with extra beginning ces, the
919 * first or last ce that match has to occur within the first character.
920 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
921 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
922 * Note here that accents checking are slow and cautioned in the API docs.
923 * @param strsrch string search data
924 * @param start offset
926 * @return TRUE if there are accents on either side of the match,
930 UBool
hasAccentsBeforeMatch(const UStringSearch
*strsrch
, int32_t start
,
933 if (strsrch
->pattern
.hasPrefixAccents
) {
934 UCollationElements
*coleiter
= strsrch
->textIter
;
935 UErrorCode status
= U_ZERO_ERROR
;
936 // we have been iterating forwards previously
937 uint32_t ignorable
= TRUE
;
938 int32_t firstce
= strsrch
->pattern
.CE
[0];
940 setColEIterOffset(coleiter
, start
);
941 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
942 if (U_FAILURE(status
)) {
945 while (ce
!= firstce
) {
946 if (ce
!= UCOL_IGNORABLE
) {
949 ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
950 if (U_FAILURE(status
) || ce
== UCOL_NULLORDER
) {
954 if (!ignorable
&& inNormBuf(coleiter
)) {
955 // within normalization buffer, discontiguous handled here
960 int32_t temp
= start
;
962 // accent = (getFCD(strsrch->search->text, &temp,
963 // strsrch->search->textLength)
964 // >> SECOND_LAST_BYTE_SHIFT_);
965 // however this code does not work well with VC7 .net in release mode.
966 // maybe the inlines for getFCD combined with shifting has bugs in
967 // VC7. anyways this is a work around.
968 UBool accent
= getFCD(strsrch
->search
->text
, &temp
,
969 strsrch
->search
->textLength
) > 0xFF;
971 return checkExtraMatchAccents(strsrch
, start
, end
, &status
);
978 U16_BACK_1(strsrch
->search
->text
, 0, temp
);
979 if (getFCD(strsrch
->search
->text
, &temp
,
980 strsrch
->search
->textLength
) & LAST_BYTE_MASK_
) {
981 setColEIterOffset(coleiter
, start
);
982 ce
= ucol_previous(coleiter
, &status
);
983 if (U_FAILURE(status
) ||
984 (ce
!= UCOL_NULLORDER
&& ce
!= UCOL_IGNORABLE
)) {
995 * Used by exact matches, checks if there are accents bounding the match.
996 * Note this is the initial boundary check. If the potential match
997 * starts or ends with composite characters, the accents in those
998 * characters will be determined later.
999 * Not doing backwards iteration here, since discontiguos contraction for
1000 * backwards collation element iterator, use up too many characters.
1001 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
1002 * should fail since there is a acute at the end of \u01FA
1003 * Note here that accents checking are slow and cautioned in the API docs.
1004 * @param strsrch string search data
1005 * @param start offset of match
1006 * @param end end offset of the match
1007 * @return TRUE if there are accents on either side of the match,
1011 UBool
hasAccentsAfterMatch(const UStringSearch
*strsrch
, int32_t start
,
1014 if (strsrch
->pattern
.hasSuffixAccents
) {
1015 const UChar
*text
= strsrch
->search
->text
;
1017 int32_t textlength
= strsrch
->search
->textLength
;
1018 U16_BACK_1(text
, 0, temp
);
1019 if (getFCD(text
, &temp
, textlength
) & LAST_BYTE_MASK_
) {
1020 int32_t firstce
= strsrch
->pattern
.CE
[0];
1021 UCollationElements
*coleiter
= strsrch
->textIter
;
1022 UErrorCode status
= U_ZERO_ERROR
;
1024 setColEIterOffset(coleiter
, start
);
1025 while ((ce
= getCE(strsrch
, ucol_next(coleiter
, &status
))) != firstce
) {
1026 if (U_FAILURE(status
) || ce
== UCOL_NULLORDER
) {
1031 while (count
< strsrch
->pattern
.CELength
) {
1032 if (getCE(strsrch
, ucol_next(coleiter
, &status
))
1033 == UCOL_IGNORABLE
) {
1034 // Thai can give an ignorable here.
1037 if (U_FAILURE(status
)) {
1043 ce
= ucol_next(coleiter
, &status
);
1044 if (U_FAILURE(status
)) {
1047 if (ce
!= UCOL_NULLORDER
&& ce
!= UCOL_IGNORABLE
) {
1048 ce
= getCE(strsrch
, ce
);
1050 if (ce
!= UCOL_NULLORDER
&& ce
!= UCOL_IGNORABLE
) {
1051 if (ucol_getOffset(coleiter
) <= end
) {
1054 if (getFCD(text
, &end
, textlength
) >> SECOND_LAST_BYTE_SHIFT_
) {
1062 #endif // #if BOYER_MOORE
1065 * Checks if the offset runs out of the text string
1067 * @param textlength of the text string
1068 * @return TRUE if offset is out of bounds, FALSE otherwise
1071 inline UBool
isOutOfBounds(int32_t textlength
, int32_t offset
)
1073 return offset
< 0 || offset
> textlength
;
1077 * Checks for identical match
1078 * @param strsrch string search data
1079 * @param start offset of possible match
1080 * @param end offset of possible match
1081 * @return TRUE if identical match is found
1084 inline UBool
checkIdentical(const UStringSearch
*strsrch
, int32_t start
,
1087 if (strsrch
->strength
!= UCOL_IDENTICAL
) {
1091 // Note: We could use Normalizer::compare() or similar, but for short strings
1092 // which may not be in FCD it might be faster to just NFD them.
1093 UErrorCode status
= U_ZERO_ERROR
;
1094 UnicodeString t2
, p2
;
1095 strsrch
->nfd
->normalize(
1096 UnicodeString(FALSE
, strsrch
->search
->text
+ start
, end
- start
), t2
, status
);
1097 strsrch
->nfd
->normalize(
1098 UnicodeString(FALSE
, strsrch
->pattern
.text
, strsrch
->pattern
.textLength
), p2
, status
);
1099 // return FALSE if NFD failed
1100 return U_SUCCESS(status
) && t2
== p2
;
1105 * Checks to see if the match is repeated
1106 * @param strsrch string search data
1107 * @param start new match start index
1108 * @param end new match end index
1109 * @return TRUE if the the match is repeated, FALSE otherwise
1112 inline UBool
checkRepeatedMatch(UStringSearch
*strsrch
,
1116 int32_t lastmatchindex
= strsrch
->search
->matchedIndex
;
1118 if (lastmatchindex
== USEARCH_DONE
) {
1121 if (strsrch
->search
->isForwardSearching
) {
1122 result
= start
<= lastmatchindex
;
1125 result
= start
>= lastmatchindex
;
1127 if (!result
&& !strsrch
->search
->isOverlap
) {
1128 if (strsrch
->search
->isForwardSearching
) {
1129 result
= start
< lastmatchindex
+ strsrch
->search
->matchedLength
;
1132 result
= end
> lastmatchindex
;
1139 * Gets the collation element iterator's current offset.
1140 * @param coleiter collation element iterator
1141 * @param forwards flag TRUE if we are moving in th forwards direction
1142 * @return current offset
1145 inline int32_t getColElemIterOffset(const UCollationElements
*coleiter
,
1148 int32_t result
= ucol_getOffset(coleiter
);
1149 // intricacies of the the backwards collation element iterator
1150 if (FALSE
&& !forwards
&& inNormBuf(coleiter
) && !isFCDPointerNull(coleiter
)) {
1157 * Checks match for contraction.
1158 * If the match ends with a partial contraction we fail.
1159 * If the match starts too far off (because of backwards iteration) we try to
1160 * chip off the extra characters depending on whether a breakiterator has
1162 * Internal method, error assumed to be success, caller has to check status
1163 * before calling this method.
1164 * @param strsrch string search data
1165 * @param start offset of potential match, to be modified if necessary
1166 * @param end offset of potential match, to be modified if necessary
1167 * @param status output error status if any
1168 * @return TRUE if match passes the contraction test, FALSE otherwise
1172 UBool
checkNextExactContractionMatch(UStringSearch
*strsrch
,
1174 int32_t *end
, UErrorCode
*status
)
1176 UCollationElements
*coleiter
= strsrch
->textIter
;
1177 int32_t textlength
= strsrch
->search
->textLength
;
1178 int32_t temp
= *start
;
1179 const UCollator
*collator
= strsrch
->collator
;
1180 const UChar
*text
= strsrch
->search
->text
;
1181 // This part checks if either ends of the match contains potential
1182 // contraction. If so we'll have to iterate through them
1183 // The start contraction needs to be checked since ucol_previous dumps
1184 // all characters till the first safe character into the buffer.
1185 // *start + 1 is used to test for the unsafe characters instead of *start
1186 // because ucol_prev takes all unsafe characters till the first safe
1187 // character ie *start. so by testing *start + 1, we can estimate if
1188 // excess prefix characters has been included in the potential search
1190 if ((*end
< textlength
&& ucol_unsafeCP(text
[*end
], collator
)) ||
1191 (*start
+ 1 < textlength
1192 && ucol_unsafeCP(text
[*start
+ 1], collator
))) {
1193 int32_t expansion
= getExpansionPrefix(coleiter
);
1194 UBool expandflag
= expansion
> 0;
1195 setColEIterOffset(coleiter
, *start
);
1196 while (expansion
> 0) {
1197 // getting rid of the redundant ce, caused by setOffset.
1198 // since backward contraction/expansion may have extra ces if we
1199 // are in the normalization buffer, hasAccentsBeforeMatch would
1200 // have taken care of it.
1201 // E.g. the character \u01FA will have an expansion of 3, but if
1202 // we are only looking for acute and ring \u030A and \u0301, we'll
1203 // have to skip the first ce in the expansion buffer.
1204 ucol_next(coleiter
, status
);
1205 if (U_FAILURE(*status
)) {
1208 if (ucol_getOffset(coleiter
) != temp
) {
1210 temp
= ucol_getOffset(coleiter
);
1215 int32_t *patternce
= strsrch
->pattern
.CE
;
1216 int32_t patterncelength
= strsrch
->pattern
.CELength
;
1218 while (count
< patterncelength
) {
1219 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1220 if (ce
== UCOL_IGNORABLE
) {
1223 if (expandflag
&& count
== 0 && ucol_getOffset(coleiter
) != temp
) {
1225 temp
= ucol_getOffset(coleiter
);
1227 if (U_FAILURE(*status
) || ce
!= patternce
[count
]) {
1229 *end
= getNextUStringSearchBaseOffset(strsrch
, *end
);
1239 * Checks and sets the match information if found.
1242 * <li> the potential match does not repeat the previous match
1243 * <li> boundaries are correct
1244 * <li> exact matches has no extra accents
1245 * <li> identical matchesb
1246 * <li> potential match does not end in the middle of a contraction
1248 * Otherwise the offset will be shifted to the next character.
1249 * Internal method, status assumed to be success, caller has to check status
1250 * before calling this method.
1251 * @param strsrch string search data
1252 * @param textoffset offset in the collation element text. the returned value
1253 * will be the truncated end offset of the match or the new start
1255 * @param status output error status if any
1256 * @return TRUE if the match is valid, FALSE otherwise
1259 inline UBool
checkNextExactMatch(UStringSearch
*strsrch
,
1260 int32_t *textoffset
, UErrorCode
*status
)
1262 UCollationElements
*coleiter
= strsrch
->textIter
;
1263 int32_t start
= getColElemIterOffset(coleiter
, FALSE
);
1265 if (!checkNextExactContractionMatch(strsrch
, &start
, textoffset
, status
)) {
1269 // this totally matches, however we need to check if it is repeating
1270 if (!isBreakUnit(strsrch
, start
, *textoffset
) ||
1271 checkRepeatedMatch(strsrch
, start
, *textoffset
) ||
1272 hasAccentsBeforeMatch(strsrch
, start
, *textoffset
) ||
1273 !checkIdentical(strsrch
, start
, *textoffset
) ||
1274 hasAccentsAfterMatch(strsrch
, start
, *textoffset
)) {
1277 *textoffset
= getNextUStringSearchBaseOffset(strsrch
, *textoffset
);
1281 //Add breakiterator boundary check for primary strength search.
1282 if (!strsrch
->search
->breakIter
&& strsrch
->strength
== UCOL_PRIMARY
) {
1283 checkBreakBoundary(strsrch
, &start
, textoffset
);
1286 // totally match, we will get rid of the ending ignorables.
1287 strsrch
->search
->matchedIndex
= start
;
1288 strsrch
->search
->matchedLength
= *textoffset
- start
;
1293 * Getting the previous base character offset, or the current offset if the
1294 * current character is a base character
1295 * @param text string
1296 * @param textoffset one offset after the current character
1297 * @return the offset of the next character after the base character or the first
1298 * composed character with accents
1301 inline int32_t getPreviousBaseOffset(const UChar
*text
,
1304 if (textoffset
> 0) {
1306 int32_t result
= textoffset
;
1307 U16_BACK_1(text
, 0, textoffset
);
1308 int32_t temp
= textoffset
;
1309 uint16_t fcd
= getFCD(text
, &temp
, result
);
1310 if ((fcd
>> SECOND_LAST_BYTE_SHIFT_
) == 0) {
1311 if (fcd
& LAST_BYTE_MASK_
) {
1316 if (textoffset
== 0) {
1325 * Getting the indexes of the accents that are not blocked in the argument
1327 * @param accents array of accents in nfd terminated by a 0.
1328 * @param accentsindex array of indexes of the accents that are not blocked
1331 inline int getUnblockedAccentIndex(UChar
*accents
, int32_t *accentsindex
)
1334 int32_t length
= u_strlen(accents
);
1335 UChar32 codepoint
= 0;
1339 while (index
< length
) {
1341 U16_NEXT(accents
, index
, length
, codepoint
);
1342 if (u_getCombiningClass(codepoint
) != cclass
) {
1343 cclass
= u_getCombiningClass(codepoint
);
1344 accentsindex
[result
] = temp
;
1348 accentsindex
[result
] = length
;
1353 * Appends 3 UChar arrays to a destination array.
1354 * Creates a new array if we run out of space. The caller will have to
1355 * manually deallocate the newly allocated array.
1356 * Internal method, status assumed to be success, caller has to check status
1357 * before calling this method. destination not to be NULL and has at least
1358 * size destinationlength.
1359 * @param destination target array
1360 * @param destinationlength target array size, returning the appended length
1361 * @param source1 null-terminated first array
1362 * @param source2 second array
1363 * @param source2length length of seond array
1364 * @param source3 null-terminated third array
1365 * @param status error status if any
1366 * @return new destination array, destination if there was no new allocation
1369 inline UChar
* addToUCharArray( UChar
*destination
,
1370 int32_t *destinationlength
,
1371 const UChar
*source1
,
1372 const UChar
*source2
,
1373 int32_t source2length
,
1374 const UChar
*source3
,
1377 int32_t source1length
= source1
? u_strlen(source1
) : 0;
1378 int32_t source3length
= source3
? u_strlen(source3
) : 0;
1379 if (*destinationlength
< source1length
+ source2length
+ source3length
+
1382 destination
= (UChar
*)allocateMemory(
1383 (source1length
+ source2length
+ source3length
+ 1) * sizeof(UChar
),
1385 // if error allocating memory, status will be
1386 // U_MEMORY_ALLOCATION_ERROR
1387 if (U_FAILURE(*status
)) {
1388 *destinationlength
= 0;
1392 if (source1length
!= 0) {
1393 uprv_memcpy(destination
, source1
, sizeof(UChar
) * source1length
);
1395 if (source2length
!= 0) {
1396 uprv_memcpy(destination
+ source1length
, source2
,
1397 sizeof(UChar
) * source2length
);
1399 if (source3length
!= 0) {
1400 uprv_memcpy(destination
+ source1length
+ source2length
, source3
,
1401 sizeof(UChar
) * source3length
);
1403 *destinationlength
= source1length
+ source2length
+ source3length
;
1408 * Running through a collation element iterator to see if the contents matches
1409 * pattern in string search data
1410 * @param strsrch string search data
1411 * @param coleiter collation element iterator
1412 * @return TRUE if a match if found, FALSE otherwise
1415 inline UBool
checkCollationMatch(const UStringSearch
*strsrch
,
1416 UCollationElements
*coleiter
)
1418 int patternceindex
= strsrch
->pattern
.CELength
;
1419 int32_t *patternce
= strsrch
->pattern
.CE
;
1420 UErrorCode status
= U_ZERO_ERROR
;
1421 while (patternceindex
> 0) {
1422 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
1423 if (ce
== UCOL_IGNORABLE
) {
1426 if (U_FAILURE(status
) || ce
!= *patternce
) {
1436 * Rearranges the front accents to try matching.
1437 * Prefix accents in the text will be grouped according to their combining
1438 * class and the groups will be mixed and matched to try find the perfect
1439 * match with the pattern.
1440 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1441 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1442 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1444 * step 2: check if any of the generated substrings matches the pattern.
1445 * Internal method, status is assumed to be success, caller has to check status
1446 * before calling this method.
1447 * @param strsrch string search match
1448 * @param start first offset of the accents to start searching
1449 * @param end start of the last accent set
1450 * @param status output error status if any
1451 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1452 * offset of the match. Note this start includes all preceding accents.
1455 int32_t doNextCanonicalPrefixMatch(UStringSearch
*strsrch
,
1460 const UChar
*text
= strsrch
->search
->text
;
1461 int32_t textlength
= strsrch
->search
->textLength
;
1462 int32_t tempstart
= start
;
1464 if ((getFCD(text
, &tempstart
, textlength
) & LAST_BYTE_MASK_
) == 0) {
1465 // die... failed at a base character
1466 return USEARCH_DONE
;
1469 int32_t offset
= getNextBaseOffset(text
, tempstart
, textlength
);
1470 start
= getPreviousBaseOffset(text
, tempstart
);
1472 UChar accents
[INITIAL_ARRAY_SIZE_
];
1473 // normalizing the offensive string
1474 unorm_normalize(text
+ start
, offset
- start
, UNORM_NFD
, 0, accents
,
1475 INITIAL_ARRAY_SIZE_
, status
);
1476 if (U_FAILURE(*status
)) {
1477 return USEARCH_DONE
;
1480 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
1481 int32_t accentsize
= getUnblockedAccentIndex(accents
,
1483 int32_t count
= (2 << (accentsize
- 1)) - 1;
1484 UChar buffer
[INITIAL_ARRAY_SIZE_
];
1485 UCollationElements
*coleiter
= strsrch
->utilIter
;
1486 while (U_SUCCESS(*status
) && count
> 0) {
1487 UChar
*rearrange
= strsrch
->canonicalPrefixAccents
;
1488 // copy the base characters
1489 for (int k
= 0; k
< accentsindex
[0]; k
++) {
1490 *rearrange
++ = accents
[k
];
1492 // forming all possible canonical rearrangement by dropping
1494 for (int i
= 0; i
<= accentsize
- 1; i
++) {
1495 int32_t mask
= 1 << (accentsize
- i
- 1);
1497 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
1498 *rearrange
++ = accents
[j
];
1503 int32_t matchsize
= INITIAL_ARRAY_SIZE_
;
1504 UChar
*match
= addToUCharArray(buffer
, &matchsize
,
1505 strsrch
->canonicalPrefixAccents
,
1506 strsrch
->search
->text
+ offset
,
1508 strsrch
->canonicalSuffixAccents
,
1511 // if status is a failure, ucol_setText does nothing.
1512 // run the collator iterator through this match
1513 ucol_setText(coleiter
, match
, matchsize
, status
);
1514 if (U_SUCCESS(*status
)) {
1515 if (checkCollationMatch(strsrch
, coleiter
)) {
1516 if (match
!= buffer
) {
1524 return USEARCH_DONE
;
1528 * Gets the offset to the safe point in text before textoffset.
1529 * ie. not the middle of a contraction, swappable characters or supplementary
1531 * @param collator collation sata
1532 * @param text string to work with
1533 * @param textoffset offset in string
1534 * @param textlength length of text string
1535 * @return offset to the previous safe character
1538 inline uint32_t getPreviousSafeOffset(const UCollator
*collator
,
1542 int32_t result
= textoffset
; // first contraction character
1543 while (result
!= 0 && ucol_unsafeCP(text
[result
- 1], collator
)) {
1547 // the first contraction character is consider unsafe here
1554 * Cleaning up after we passed the safe zone
1555 * @param strsrch string search data
1556 * @param safetext safe text array
1557 * @param safebuffer safe text buffer
1558 * @param coleiter collation element iterator for safe text
1561 inline void cleanUpSafeText(const UStringSearch
*strsrch
, UChar
*safetext
,
1564 if (safetext
!= safebuffer
&& safetext
!= strsrch
->canonicalSuffixAccents
)
1566 uprv_free(safetext
);
1571 * Take the rearranged end accents and tries matching. If match failed at
1572 * a seperate preceding set of accents (seperated from the rearranged on by
1573 * at least a base character) then we rearrange the preceding accents and
1574 * tries matching again.
1575 * We allow skipping of the ends of the accent set if the ces do not match.
1576 * However if the failure is found before the accent set, it fails.
1577 * Internal method, status assumed to be success, caller has to check status
1578 * before calling this method.
1579 * @param strsrch string search data
1580 * @param textoffset of the start of the rearranged accent
1581 * @param status output error status if any
1582 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1583 * offset of the match. Note this start includes all preceding accents.
1586 int32_t doNextCanonicalSuffixMatch(UStringSearch
*strsrch
,
1590 const UChar
*text
= strsrch
->search
->text
;
1591 const UCollator
*collator
= strsrch
->collator
;
1592 int32_t safelength
= 0;
1594 int32_t safetextlength
;
1595 UChar safebuffer
[INITIAL_ARRAY_SIZE_
];
1596 UCollationElements
*coleiter
= strsrch
->utilIter
;
1597 int32_t safeoffset
= textoffset
;
1599 if (textoffset
!= 0 && ucol_unsafeCP(strsrch
->canonicalSuffixAccents
[0],
1601 safeoffset
= getPreviousSafeOffset(collator
, text
, textoffset
);
1602 safelength
= textoffset
- safeoffset
;
1603 safetextlength
= INITIAL_ARRAY_SIZE_
;
1604 safetext
= addToUCharArray(safebuffer
, &safetextlength
, NULL
,
1605 text
+ safeoffset
, safelength
,
1606 strsrch
->canonicalSuffixAccents
,
1610 safetextlength
= u_strlen(strsrch
->canonicalSuffixAccents
);
1611 safetext
= strsrch
->canonicalSuffixAccents
;
1614 // if status is a failure, ucol_setText does nothing
1615 ucol_setText(coleiter
, safetext
, safetextlength
, status
);
1616 // status checked in loop below
1618 int32_t *ce
= strsrch
->pattern
.CE
;
1619 int32_t celength
= strsrch
->pattern
.CELength
;
1620 int ceindex
= celength
- 1;
1621 UBool isSafe
= TRUE
; // indication flag for position in safe zone
1623 while (ceindex
>= 0) {
1624 int32_t textce
= ucol_previous(coleiter
, status
);
1625 if (U_FAILURE(*status
)) {
1627 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1629 return USEARCH_DONE
;
1631 if (textce
== UCOL_NULLORDER
) {
1632 // check if we have passed the safe buffer
1633 if (coleiter
== strsrch
->textIter
) {
1634 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1635 return USEARCH_DONE
;
1637 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1638 safetext
= safebuffer
;
1639 coleiter
= strsrch
->textIter
;
1640 setColEIterOffset(coleiter
, safeoffset
);
1641 // status checked at the start of the loop
1645 textce
= getCE(strsrch
, textce
);
1646 if (textce
!= UCOL_IGNORABLE
&& textce
!= ce
[ceindex
]) {
1647 // do the beginning stuff
1648 int32_t failedoffset
= getColElemIterOffset(coleiter
, FALSE
);
1649 if (isSafe
&& failedoffset
>= safelength
) {
1650 // alas... no hope. failed at rearranged accent set
1651 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1652 return USEARCH_DONE
;
1656 failedoffset
+= safeoffset
;
1657 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1660 // try rearranging the front accents
1661 int32_t result
= doNextCanonicalPrefixMatch(strsrch
,
1662 failedoffset
, textoffset
, status
);
1663 if (result
!= USEARCH_DONE
) {
1664 // if status is a failure, ucol_setOffset does nothing
1665 setColEIterOffset(strsrch
->textIter
, result
);
1667 if (U_FAILURE(*status
)) {
1668 return USEARCH_DONE
;
1673 if (textce
== ce
[ceindex
]) {
1679 int32_t result
= getColElemIterOffset(coleiter
, FALSE
);
1680 // sets the text iterator here with the correct expansion and offset
1681 int32_t leftoverces
= getExpansionPrefix(coleiter
);
1682 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1683 if (result
>= safelength
) {
1684 result
= textoffset
;
1687 result
+= safeoffset
;
1689 setColEIterOffset(strsrch
->textIter
, result
);
1690 strsrch
->textIter
->iteratordata_
.toReturn
=
1691 setExpansionPrefix(strsrch
->textIter
, leftoverces
);
1695 return ucol_getOffset(coleiter
);
1699 * Trying out the substring and sees if it can be a canonical match.
1700 * This will try normalizing the end accents and arranging them into canonical
1701 * equivalents and check their corresponding ces with the pattern ce.
1702 * Suffix accents in the text will be grouped according to their combining
1703 * class and the groups will be mixed and matched to try find the perfect
1704 * match with the pattern.
1705 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1706 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1707 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1709 * step 2: check if any of the generated substrings matches the pattern.
1710 * Internal method, status assumed to be success, caller has to check status
1711 * before calling this method.
1712 * @param strsrch string search data
1713 * @param textoffset end offset in the collation element text that ends with
1714 * the accents to be rearranged
1715 * @param status error status if any
1716 * @return TRUE if the match is valid, FALSE otherwise
1719 UBool
doNextCanonicalMatch(UStringSearch
*strsrch
,
1723 const UChar
*text
= strsrch
->search
->text
;
1724 int32_t temp
= textoffset
;
1725 U16_BACK_1(text
, 0, temp
);
1726 if ((getFCD(text
, &temp
, textoffset
) & LAST_BYTE_MASK_
) == 0) {
1727 UCollationElements
*coleiter
= strsrch
->textIter
;
1728 int32_t offset
= getColElemIterOffset(coleiter
, FALSE
);
1729 if (strsrch
->pattern
.hasPrefixAccents
) {
1730 offset
= doNextCanonicalPrefixMatch(strsrch
, offset
, textoffset
,
1732 if (U_SUCCESS(*status
) && offset
!= USEARCH_DONE
) {
1733 setColEIterOffset(coleiter
, offset
);
1740 if (!strsrch
->pattern
.hasSuffixAccents
) {
1744 UChar accents
[INITIAL_ARRAY_SIZE_
];
1745 // offset to the last base character in substring to search
1746 int32_t baseoffset
= getPreviousBaseOffset(text
, textoffset
);
1747 // normalizing the offensive string
1748 unorm_normalize(text
+ baseoffset
, textoffset
- baseoffset
, UNORM_NFD
,
1749 0, accents
, INITIAL_ARRAY_SIZE_
, status
);
1750 // status checked in loop below
1752 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
1753 int32_t size
= getUnblockedAccentIndex(accents
, accentsindex
);
1755 // 2 power n - 1 plus the full set of accents
1756 int32_t count
= (2 << (size
- 1)) - 1;
1757 while (U_SUCCESS(*status
) && count
> 0) {
1758 UChar
*rearrange
= strsrch
->canonicalSuffixAccents
;
1759 // copy the base characters
1760 for (int k
= 0; k
< accentsindex
[0]; k
++) {
1761 *rearrange
++ = accents
[k
];
1763 // forming all possible canonical rearrangement by dropping
1765 for (int i
= 0; i
<= size
- 1; i
++) {
1766 int32_t mask
= 1 << (size
- i
- 1);
1768 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
1769 *rearrange
++ = accents
[j
];
1774 int32_t offset
= doNextCanonicalSuffixMatch(strsrch
, baseoffset
,
1776 if (offset
!= USEARCH_DONE
) {
1777 return TRUE
; // match found
1785 * Gets the previous base character offset depending on the string search
1787 * @param strsrch string search data
1788 * @param textoffset current offset, current character
1789 * @return the offset of the next character after this base character or itself
1790 * if it is a composed character with accents
1793 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch
*strsrch
,
1796 if (strsrch
->pattern
.hasPrefixAccents
&& textoffset
> 0) {
1797 const UChar
*text
= strsrch
->search
->text
;
1798 int32_t offset
= textoffset
;
1799 if (getFCD(text
, &offset
, strsrch
->search
->textLength
) >>
1800 SECOND_LAST_BYTE_SHIFT_
) {
1801 return getPreviousBaseOffset(text
, textoffset
);
1808 * Checks match for contraction.
1809 * If the match ends with a partial contraction we fail.
1810 * If the match starts too far off (because of backwards iteration) we try to
1811 * chip off the extra characters
1812 * Internal method, status assumed to be success, caller has to check status
1813 * before calling this method.
1814 * @param strsrch string search data
1815 * @param start offset of potential match, to be modified if necessary
1816 * @param end offset of potential match, to be modified if necessary
1817 * @param status output error status if any
1818 * @return TRUE if match passes the contraction test, FALSE otherwise
1821 UBool
checkNextCanonicalContractionMatch(UStringSearch
*strsrch
,
1826 UCollationElements
*coleiter
= strsrch
->textIter
;
1827 int32_t textlength
= strsrch
->search
->textLength
;
1828 int32_t temp
= *start
;
1829 const UCollator
*collator
= strsrch
->collator
;
1830 const UChar
*text
= strsrch
->search
->text
;
1831 // This part checks if either ends of the match contains potential
1832 // contraction. If so we'll have to iterate through them
1833 if ((*end
< textlength
&& ucol_unsafeCP(text
[*end
], collator
)) ||
1834 (*start
+ 1 < textlength
1835 && ucol_unsafeCP(text
[*start
+ 1], collator
))) {
1836 int32_t expansion
= getExpansionPrefix(coleiter
);
1837 UBool expandflag
= expansion
> 0;
1838 setColEIterOffset(coleiter
, *start
);
1839 while (expansion
> 0) {
1840 // getting rid of the redundant ce, caused by setOffset.
1841 // since backward contraction/expansion may have extra ces if we
1842 // are in the normalization buffer, hasAccentsBeforeMatch would
1843 // have taken care of it.
1844 // E.g. the character \u01FA will have an expansion of 3, but if
1845 // we are only looking for acute and ring \u030A and \u0301, we'll
1846 // have to skip the first ce in the expansion buffer.
1847 ucol_next(coleiter
, status
);
1848 if (U_FAILURE(*status
)) {
1851 if (ucol_getOffset(coleiter
) != temp
) {
1853 temp
= ucol_getOffset(coleiter
);
1858 int32_t *patternce
= strsrch
->pattern
.CE
;
1859 int32_t patterncelength
= strsrch
->pattern
.CELength
;
1861 int32_t textlength
= strsrch
->search
->textLength
;
1862 while (count
< patterncelength
) {
1863 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1864 // status checked below, note that if status is a failure
1865 // ucol_next returns UCOL_NULLORDER
1866 if (ce
== UCOL_IGNORABLE
) {
1869 if (expandflag
&& count
== 0 && ucol_getOffset(coleiter
) != temp
) {
1871 temp
= ucol_getOffset(coleiter
);
1874 if (count
== 0 && ce
!= patternce
[0]) {
1875 // accents may have extra starting ces, this occurs when a
1876 // pure accent pattern is matched without rearrangement
1877 // text \u0325\u0300 and looking for \u0300
1878 int32_t expected
= patternce
[0];
1879 if (getFCD(text
, start
, textlength
) & LAST_BYTE_MASK_
) {
1880 ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1881 while (U_SUCCESS(*status
) && ce
!= expected
&&
1882 ce
!= UCOL_NULLORDER
&&
1883 ucol_getOffset(coleiter
) <= *end
) {
1884 ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1888 if (U_FAILURE(*status
) || ce
!= patternce
[count
]) {
1890 *end
= getNextUStringSearchBaseOffset(strsrch
, *end
);
1900 * Checks and sets the match information if found.
1903 * <li> the potential match does not repeat the previous match
1904 * <li> boundaries are correct
1905 * <li> potential match does not end in the middle of a contraction
1906 * <li> identical matches
1908 * Otherwise the offset will be shifted to the next character.
1909 * Internal method, status assumed to be success, caller has to check the
1910 * status before calling this method.
1911 * @param strsrch string search data
1912 * @param textoffset offset in the collation element text. the returned value
1913 * will be the truncated end offset of the match or the new start
1915 * @param status output error status if any
1916 * @return TRUE if the match is valid, FALSE otherwise
1919 inline UBool
checkNextCanonicalMatch(UStringSearch
*strsrch
,
1920 int32_t *textoffset
,
1923 // to ensure that the start and ends are not composite characters
1924 UCollationElements
*coleiter
= strsrch
->textIter
;
1925 // if we have a canonical accent match
1926 if ((strsrch
->pattern
.hasSuffixAccents
&&
1927 strsrch
->canonicalSuffixAccents
[0]) ||
1928 (strsrch
->pattern
.hasPrefixAccents
&&
1929 strsrch
->canonicalPrefixAccents
[0])) {
1930 strsrch
->search
->matchedIndex
= getPreviousUStringSearchBaseOffset(
1932 ucol_getOffset(coleiter
));
1933 strsrch
->search
->matchedLength
= *textoffset
-
1934 strsrch
->search
->matchedIndex
;
1938 int32_t start
= getColElemIterOffset(coleiter
, FALSE
);
1939 if (!checkNextCanonicalContractionMatch(strsrch
, &start
, textoffset
,
1940 status
) || U_FAILURE(*status
)) {
1944 start
= getPreviousUStringSearchBaseOffset(strsrch
, start
);
1945 // this totally matches, however we need to check if it is repeating
1946 if (checkRepeatedMatch(strsrch
, start
, *textoffset
) ||
1947 !isBreakUnit(strsrch
, start
, *textoffset
) ||
1948 !checkIdentical(strsrch
, start
, *textoffset
)) {
1950 *textoffset
= getNextBaseOffset(strsrch
->search
->text
, *textoffset
,
1951 strsrch
->search
->textLength
);
1955 strsrch
->search
->matchedIndex
= start
;
1956 strsrch
->search
->matchedLength
= *textoffset
- start
;
1961 * Shifting the collation element iterator position forward to prepare for
1962 * a preceding match. If the first character is a unsafe character, we'll only
1963 * shift by 1 to capture contractions, normalization etc.
1964 * Internal method, status assumed to be success, caller has to check status
1965 * before calling this method.
1966 * @param text strsrch string search data
1967 * @param textoffset start text position to do search
1968 * @param ce the text ce which failed the match.
1969 * @param patternceindex index of the ce within the pattern ce buffer which
1971 * @return final offset
1974 inline int32_t reverseShift(UStringSearch
*strsrch
,
1977 int32_t patternceindex
)
1979 if (strsrch
->search
->isOverlap
) {
1980 if (textoffset
!= strsrch
->search
->textLength
) {
1984 textoffset
-= strsrch
->pattern
.defaultShiftSize
;
1988 if (ce
!= UCOL_NULLORDER
) {
1989 int32_t shift
= strsrch
->pattern
.backShift
[hash(ce
)];
1991 // this is to adjust for characters in the middle of the substring
1992 // for matching that failed.
1993 int32_t adjust
= patternceindex
;
1994 if (adjust
> 1 && shift
> adjust
) {
1995 shift
-= adjust
- 1;
1997 textoffset
-= shift
;
2000 textoffset
-= strsrch
->pattern
.defaultShiftSize
;
2003 textoffset
= getPreviousUStringSearchBaseOffset(strsrch
, textoffset
);
2008 * Checks match for contraction.
2009 * If the match starts with a partial contraction we fail.
2010 * Internal method, status assumed to be success, caller has to check status
2011 * before calling this method.
2012 * @param strsrch string search data
2013 * @param start offset of potential match, to be modified if necessary
2014 * @param end offset of potential match, to be modified if necessary
2015 * @param status output error status if any
2016 * @return TRUE if match passes the contraction test, FALSE otherwise
2019 UBool
checkPreviousExactContractionMatch(UStringSearch
*strsrch
,
2021 int32_t *end
, UErrorCode
*status
)
2023 UCollationElements
*coleiter
= strsrch
->textIter
;
2024 int32_t textlength
= strsrch
->search
->textLength
;
2025 int32_t temp
= *end
;
2026 const UCollator
*collator
= strsrch
->collator
;
2027 const UChar
*text
= strsrch
->search
->text
;
2028 // This part checks if either if the start of the match contains potential
2029 // contraction. If so we'll have to iterate through them
2030 // Since we used ucol_next while previously looking for the potential
2031 // match, this guarantees that our end will not be a partial contraction,
2032 // or a partial supplementary character.
2033 if (*start
< textlength
&& ucol_unsafeCP(text
[*start
], collator
)) {
2034 int32_t expansion
= getExpansionSuffix(coleiter
);
2035 UBool expandflag
= expansion
> 0;
2036 setColEIterOffset(coleiter
, *end
);
2037 while (U_SUCCESS(*status
) && expansion
> 0) {
2038 // getting rid of the redundant ce
2039 // since forward contraction/expansion may have extra ces
2040 // if we are in the normalization buffer, hasAccentsBeforeMatch
2041 // would have taken care of it.
2042 // E.g. the character \u01FA will have an expansion of 3, but if
2043 // we are only looking for A ring A\u030A, we'll have to skip the
2044 // last ce in the expansion buffer
2045 ucol_previous(coleiter
, status
);
2046 if (U_FAILURE(*status
)) {
2049 if (ucol_getOffset(coleiter
) != temp
) {
2051 temp
= ucol_getOffset(coleiter
);
2056 int32_t *patternce
= strsrch
->pattern
.CE
;
2057 int32_t patterncelength
= strsrch
->pattern
.CELength
;
2058 int32_t count
= patterncelength
;
2060 int32_t ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2061 // status checked below, note that if status is a failure
2062 // ucol_previous returns UCOL_NULLORDER
2063 if (ce
== UCOL_IGNORABLE
) {
2066 if (expandflag
&& count
== 0 &&
2067 getColElemIterOffset(coleiter
, FALSE
) != temp
) {
2069 temp
= ucol_getOffset(coleiter
);
2071 if (U_FAILURE(*status
) || ce
!= patternce
[count
- 1]) {
2073 *start
= getPreviousBaseOffset(text
, *start
);
2083 * Checks and sets the match information if found.
2086 * <li> the current match does not repeat the last match
2087 * <li> boundaries are correct
2088 * <li> exact matches has no extra accents
2089 * <li> identical matches
2091 * Otherwise the offset will be shifted to the preceding character.
2092 * Internal method, status assumed to be success, caller has to check status
2093 * before calling this method.
2094 * @param strsrch string search data
2096 * @param coleiter collation element iterator
2097 * @param text string
2098 * @param textoffset offset in the collation element text. the returned value
2099 * will be the truncated start offset of the match or the new start
2101 * @param status output error status if any
2102 * @return TRUE if the match is valid, FALSE otherwise
2105 inline UBool
checkPreviousExactMatch(UStringSearch
*strsrch
,
2106 int32_t *textoffset
,
2109 // to ensure that the start and ends are not composite characters
2110 int32_t end
= ucol_getOffset(strsrch
->textIter
);
2111 if (!checkPreviousExactContractionMatch(strsrch
, textoffset
, &end
, status
)
2112 || U_FAILURE(*status
)) {
2116 // this totally matches, however we need to check if it is repeating
2118 if (checkRepeatedMatch(strsrch
, *textoffset
, end
) ||
2119 !isBreakUnit(strsrch
, *textoffset
, end
) ||
2120 hasAccentsBeforeMatch(strsrch
, *textoffset
, end
) ||
2121 !checkIdentical(strsrch
, *textoffset
, end
) ||
2122 hasAccentsAfterMatch(strsrch
, *textoffset
, end
)) {
2124 *textoffset
= getPreviousBaseOffset(strsrch
->search
->text
,
2129 //Add breakiterator boundary check for primary strength search.
2130 if (!strsrch
->search
->breakIter
&& strsrch
->strength
== UCOL_PRIMARY
) {
2131 checkBreakBoundary(strsrch
, textoffset
, &end
);
2134 strsrch
->search
->matchedIndex
= *textoffset
;
2135 strsrch
->search
->matchedLength
= end
- *textoffset
;
2140 * Rearranges the end accents to try matching.
2141 * Suffix accents in the text will be grouped according to their combining
2142 * class and the groups will be mixed and matched to try find the perfect
2143 * match with the pattern.
2144 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2145 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2146 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2148 * step 2: check if any of the generated substrings matches the pattern.
2149 * Internal method, status assumed to be success, user has to check status
2150 * before calling this method.
2151 * @param strsrch string search match
2152 * @param start offset of the first base character
2153 * @param end start of the last accent set
2154 * @param status only error status if any
2155 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2156 * offset of the match. Note this start includes all following accents.
2159 int32_t doPreviousCanonicalSuffixMatch(UStringSearch
*strsrch
,
2164 const UChar
*text
= strsrch
->search
->text
;
2165 int32_t tempend
= end
;
2167 U16_BACK_1(text
, 0, tempend
);
2168 if (!(getFCD(text
, &tempend
, strsrch
->search
->textLength
) &
2170 // die... failed at a base character
2171 return USEARCH_DONE
;
2173 end
= getNextBaseOffset(text
, end
, strsrch
->search
->textLength
);
2175 if (U_SUCCESS(*status
)) {
2176 UChar accents
[INITIAL_ARRAY_SIZE_
];
2177 int32_t offset
= getPreviousBaseOffset(text
, end
);
2178 // normalizing the offensive string
2179 unorm_normalize(text
+ offset
, end
- offset
, UNORM_NFD
, 0, accents
,
2180 INITIAL_ARRAY_SIZE_
, status
);
2182 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
2183 int32_t accentsize
= getUnblockedAccentIndex(accents
,
2185 int32_t count
= (2 << (accentsize
- 1)) - 1;
2186 UChar buffer
[INITIAL_ARRAY_SIZE_
];
2187 UCollationElements
*coleiter
= strsrch
->utilIter
;
2188 while (U_SUCCESS(*status
) && count
> 0) {
2189 UChar
*rearrange
= strsrch
->canonicalSuffixAccents
;
2190 // copy the base characters
2191 for (int k
= 0; k
< accentsindex
[0]; k
++) {
2192 *rearrange
++ = accents
[k
];
2194 // forming all possible canonical rearrangement by dropping
2196 for (int i
= 0; i
<= accentsize
- 1; i
++) {
2197 int32_t mask
= 1 << (accentsize
- i
- 1);
2199 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
2200 *rearrange
++ = accents
[j
];
2205 int32_t matchsize
= INITIAL_ARRAY_SIZE_
;
2206 UChar
*match
= addToUCharArray(buffer
, &matchsize
,
2207 strsrch
->canonicalPrefixAccents
,
2208 strsrch
->search
->text
+ start
,
2210 strsrch
->canonicalSuffixAccents
,
2213 // run the collator iterator through this match
2214 // if status is a failure ucol_setText does nothing
2215 ucol_setText(coleiter
, match
, matchsize
, status
);
2216 if (U_SUCCESS(*status
)) {
2217 if (checkCollationMatch(strsrch
, coleiter
)) {
2218 if (match
!= buffer
) {
2227 return USEARCH_DONE
;
2231 * Take the rearranged start accents and tries matching. If match failed at
2232 * a seperate following set of accents (seperated from the rearranged on by
2233 * at least a base character) then we rearrange the preceding accents and
2234 * tries matching again.
2235 * We allow skipping of the ends of the accent set if the ces do not match.
2236 * However if the failure is found before the accent set, it fails.
2237 * Internal method, status assumed to be success, caller has to check status
2238 * before calling this method.
2239 * @param strsrch string search data
2240 * @param textoffset of the ends of the rearranged accent
2241 * @param status output error status if any
2242 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2243 * offset of the match. Note this start includes all following accents.
2246 int32_t doPreviousCanonicalPrefixMatch(UStringSearch
*strsrch
,
2250 const UChar
*text
= strsrch
->search
->text
;
2251 const UCollator
*collator
= strsrch
->collator
;
2252 int32_t safelength
= 0;
2254 int32_t safetextlength
;
2255 UChar safebuffer
[INITIAL_ARRAY_SIZE_
];
2256 int32_t safeoffset
= textoffset
;
2259 ucol_unsafeCP(strsrch
->canonicalPrefixAccents
[
2260 u_strlen(strsrch
->canonicalPrefixAccents
) - 1
2262 safeoffset
= getNextSafeOffset(collator
, text
, textoffset
,
2263 strsrch
->search
->textLength
);
2264 safelength
= safeoffset
- textoffset
;
2265 safetextlength
= INITIAL_ARRAY_SIZE_
;
2266 safetext
= addToUCharArray(safebuffer
, &safetextlength
,
2267 strsrch
->canonicalPrefixAccents
,
2268 text
+ textoffset
, safelength
,
2272 safetextlength
= u_strlen(strsrch
->canonicalPrefixAccents
);
2273 safetext
= strsrch
->canonicalPrefixAccents
;
2276 UCollationElements
*coleiter
= strsrch
->utilIter
;
2277 // if status is a failure, ucol_setText does nothing
2278 ucol_setText(coleiter
, safetext
, safetextlength
, status
);
2279 // status checked in loop below
2281 int32_t *ce
= strsrch
->pattern
.CE
;
2282 int32_t celength
= strsrch
->pattern
.CELength
;
2284 UBool isSafe
= TRUE
; // safe zone indication flag for position
2285 int32_t prefixlength
= u_strlen(strsrch
->canonicalPrefixAccents
);
2287 while (ceindex
< celength
) {
2288 int32_t textce
= ucol_next(coleiter
, status
);
2289 if (U_FAILURE(*status
)) {
2291 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2293 return USEARCH_DONE
;
2295 if (textce
== UCOL_NULLORDER
) {
2296 // check if we have passed the safe buffer
2297 if (coleiter
== strsrch
->textIter
) {
2298 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2299 return USEARCH_DONE
;
2301 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2302 safetext
= safebuffer
;
2303 coleiter
= strsrch
->textIter
;
2304 setColEIterOffset(coleiter
, safeoffset
);
2305 // status checked at the start of the loop
2309 textce
= getCE(strsrch
, textce
);
2310 if (textce
!= UCOL_IGNORABLE
&& textce
!= ce
[ceindex
]) {
2311 // do the beginning stuff
2312 int32_t failedoffset
= ucol_getOffset(coleiter
);
2313 if (isSafe
&& failedoffset
<= prefixlength
) {
2314 // alas... no hope. failed at rearranged accent set
2315 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2316 return USEARCH_DONE
;
2320 failedoffset
= safeoffset
- failedoffset
;
2321 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2324 // try rearranging the end accents
2325 int32_t result
= doPreviousCanonicalSuffixMatch(strsrch
,
2326 textoffset
, failedoffset
, status
);
2327 if (result
!= USEARCH_DONE
) {
2328 // if status is a failure, ucol_setOffset does nothing
2329 setColEIterOffset(strsrch
->textIter
, result
);
2331 if (U_FAILURE(*status
)) {
2332 return USEARCH_DONE
;
2337 if (textce
== ce
[ceindex
]) {
2343 int32_t result
= ucol_getOffset(coleiter
);
2344 // sets the text iterator here with the correct expansion and offset
2345 int32_t leftoverces
= getExpansionSuffix(coleiter
);
2346 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2347 if (result
<= prefixlength
) {
2348 result
= textoffset
;
2351 result
= textoffset
+ (safeoffset
- result
);
2353 setColEIterOffset(strsrch
->textIter
, result
);
2354 setExpansionSuffix(strsrch
->textIter
, leftoverces
);
2358 return ucol_getOffset(coleiter
);
2362 * Trying out the substring and sees if it can be a canonical match.
2363 * This will try normalizing the starting accents and arranging them into
2364 * canonical equivalents and check their corresponding ces with the pattern ce.
2365 * Prefix accents in the text will be grouped according to their combining
2366 * class and the groups will be mixed and matched to try find the perfect
2367 * match with the pattern.
2368 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2369 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2370 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2372 * step 2: check if any of the generated substrings matches the pattern.
2373 * Internal method, status assumed to be success, caller has to check status
2374 * before calling this method.
2375 * @param strsrch string search data
2376 * @param textoffset start offset in the collation element text that starts
2377 * with the accents to be rearranged
2378 * @param status output error status if any
2379 * @return TRUE if the match is valid, FALSE otherwise
2382 UBool
doPreviousCanonicalMatch(UStringSearch
*strsrch
,
2386 const UChar
*text
= strsrch
->search
->text
;
2387 int32_t temp
= textoffset
;
2388 int32_t textlength
= strsrch
->search
->textLength
;
2389 if ((getFCD(text
, &temp
, textlength
) >> SECOND_LAST_BYTE_SHIFT_
) == 0) {
2390 UCollationElements
*coleiter
= strsrch
->textIter
;
2391 int32_t offset
= ucol_getOffset(coleiter
);
2392 if (strsrch
->pattern
.hasSuffixAccents
) {
2393 offset
= doPreviousCanonicalSuffixMatch(strsrch
, textoffset
,
2395 if (U_SUCCESS(*status
) && offset
!= USEARCH_DONE
) {
2396 setColEIterOffset(coleiter
, offset
);
2403 if (!strsrch
->pattern
.hasPrefixAccents
) {
2407 UChar accents
[INITIAL_ARRAY_SIZE_
];
2408 // offset to the last base character in substring to search
2409 int32_t baseoffset
= getNextBaseOffset(text
, textoffset
, textlength
);
2410 // normalizing the offensive string
2411 unorm_normalize(text
+ textoffset
, baseoffset
- textoffset
, UNORM_NFD
,
2412 0, accents
, INITIAL_ARRAY_SIZE_
, status
);
2413 // status checked in loop
2415 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
2416 int32_t size
= getUnblockedAccentIndex(accents
, accentsindex
);
2418 // 2 power n - 1 plus the full set of accents
2419 int32_t count
= (2 << (size
- 1)) - 1;
2420 while (U_SUCCESS(*status
) && count
> 0) {
2421 UChar
*rearrange
= strsrch
->canonicalPrefixAccents
;
2422 // copy the base characters
2423 for (int k
= 0; k
< accentsindex
[0]; k
++) {
2424 *rearrange
++ = accents
[k
];
2426 // forming all possible canonical rearrangement by dropping
2428 for (int i
= 0; i
<= size
- 1; i
++) {
2429 int32_t mask
= 1 << (size
- i
- 1);
2431 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
2432 *rearrange
++ = accents
[j
];
2437 int32_t offset
= doPreviousCanonicalPrefixMatch(strsrch
,
2438 baseoffset
, status
);
2439 if (offset
!= USEARCH_DONE
) {
2440 return TRUE
; // match found
2448 * Checks match for contraction.
2449 * If the match starts with a partial contraction we fail.
2450 * Internal method, status assumed to be success, caller has to check status
2451 * before calling this method.
2452 * @param strsrch string search data
2453 * @param start offset of potential match, to be modified if necessary
2454 * @param end offset of potential match, to be modified if necessary
2455 * @param status only error status if any
2456 * @return TRUE if match passes the contraction test, FALSE otherwise
2459 UBool
checkPreviousCanonicalContractionMatch(UStringSearch
*strsrch
,
2461 int32_t *end
, UErrorCode
*status
)
2463 UCollationElements
*coleiter
= strsrch
->textIter
;
2464 int32_t textlength
= strsrch
->search
->textLength
;
2465 int32_t temp
= *end
;
2466 const UCollator
*collator
= strsrch
->collator
;
2467 const UChar
*text
= strsrch
->search
->text
;
2468 // This part checks if either if the start of the match contains potential
2469 // contraction. If so we'll have to iterate through them
2470 // Since we used ucol_next while previously looking for the potential
2471 // match, this guarantees that our end will not be a partial contraction,
2472 // or a partial supplementary character.
2473 if (*start
< textlength
&& ucol_unsafeCP(text
[*start
], collator
)) {
2474 int32_t expansion
= getExpansionSuffix(coleiter
);
2475 UBool expandflag
= expansion
> 0;
2476 setColEIterOffset(coleiter
, *end
);
2477 while (expansion
> 0) {
2478 // getting rid of the redundant ce
2479 // since forward contraction/expansion may have extra ces
2480 // if we are in the normalization buffer, hasAccentsBeforeMatch
2481 // would have taken care of it.
2482 // E.g. the character \u01FA will have an expansion of 3, but if
2483 // we are only looking for A ring A\u030A, we'll have to skip the
2484 // last ce in the expansion buffer
2485 ucol_previous(coleiter
, status
);
2486 if (U_FAILURE(*status
)) {
2489 if (ucol_getOffset(coleiter
) != temp
) {
2491 temp
= ucol_getOffset(coleiter
);
2496 int32_t *patternce
= strsrch
->pattern
.CE
;
2497 int32_t patterncelength
= strsrch
->pattern
.CELength
;
2498 int32_t count
= patterncelength
;
2500 int32_t ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2501 // status checked below, note that if status is a failure
2502 // ucol_previous returns UCOL_NULLORDER
2503 if (ce
== UCOL_IGNORABLE
) {
2506 if (expandflag
&& count
== 0 &&
2507 getColElemIterOffset(coleiter
, FALSE
) != temp
) {
2509 temp
= ucol_getOffset(coleiter
);
2511 if (count
== patterncelength
&&
2512 ce
!= patternce
[patterncelength
- 1]) {
2513 // accents may have extra starting ces, this occurs when a
2514 // pure accent pattern is matched without rearrangement
2515 int32_t expected
= patternce
[patterncelength
- 1];
2516 U16_BACK_1(text
, 0, *end
);
2517 if (getFCD(text
, end
, textlength
) & LAST_BYTE_MASK_
) {
2518 ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2519 while (U_SUCCESS(*status
) && ce
!= expected
&&
2520 ce
!= UCOL_NULLORDER
&&
2521 ucol_getOffset(coleiter
) <= *start
) {
2522 ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2526 if (U_FAILURE(*status
) || ce
!= patternce
[count
- 1]) {
2528 *start
= getPreviousBaseOffset(text
, *start
);
2538 * Checks and sets the match information if found.
2541 * <li> the potential match does not repeat the previous match
2542 * <li> boundaries are correct
2543 * <li> potential match does not end in the middle of a contraction
2544 * <li> identical matches
2546 * Otherwise the offset will be shifted to the next character.
2547 * Internal method, status assumed to be success, caller has to check status
2548 * before calling this method.
2549 * @param strsrch string search data
2550 * @param textoffset offset in the collation element text. the returned value
2551 * will be the truncated start offset of the match or the new start
2553 * @param status only error status if any
2554 * @return TRUE if the match is valid, FALSE otherwise
2557 inline UBool
checkPreviousCanonicalMatch(UStringSearch
*strsrch
,
2558 int32_t *textoffset
,
2561 // to ensure that the start and ends are not composite characters
2562 UCollationElements
*coleiter
= strsrch
->textIter
;
2563 // if we have a canonical accent match
2564 if ((strsrch
->pattern
.hasSuffixAccents
&&
2565 strsrch
->canonicalSuffixAccents
[0]) ||
2566 (strsrch
->pattern
.hasPrefixAccents
&&
2567 strsrch
->canonicalPrefixAccents
[0])) {
2568 strsrch
->search
->matchedIndex
= *textoffset
;
2569 strsrch
->search
->matchedLength
=
2570 getNextUStringSearchBaseOffset(strsrch
,
2571 getColElemIterOffset(coleiter
, FALSE
))
2576 int32_t end
= ucol_getOffset(coleiter
);
2577 if (!checkPreviousCanonicalContractionMatch(strsrch
, textoffset
, &end
,
2579 U_FAILURE(*status
)) {
2583 end
= getNextUStringSearchBaseOffset(strsrch
, end
);
2584 // this totally matches, however we need to check if it is repeating
2585 if (checkRepeatedMatch(strsrch
, *textoffset
, end
) ||
2586 !isBreakUnit(strsrch
, *textoffset
, end
) ||
2587 !checkIdentical(strsrch
, *textoffset
, end
)) {
2589 *textoffset
= getPreviousBaseOffset(strsrch
->search
->text
,
2594 strsrch
->search
->matchedIndex
= *textoffset
;
2595 strsrch
->search
->matchedLength
= end
- *textoffset
;
2598 #endif // #if BOYER_MOORE
2600 // constructors and destructor -------------------------------------------
2602 U_CAPI UStringSearch
* U_EXPORT2
usearch_open(const UChar
*pattern
,
2603 int32_t patternlength
,
2607 UBreakIterator
*breakiter
,
2610 if (U_FAILURE(*status
)) {
2613 #if UCONFIG_NO_BREAK_ITERATION
2614 if (breakiter
!= NULL
) {
2615 *status
= U_UNSUPPORTED_ERROR
;
2620 // ucol_open internally checks for status
2621 UCollator
*collator
= ucol_open(locale
, status
);
2622 // pattern, text checks are done in usearch_openFromCollator
2623 UStringSearch
*result
= usearch_openFromCollator(pattern
,
2624 patternlength
, text
, textlength
,
2625 collator
, breakiter
, status
);
2627 if (result
== NULL
|| U_FAILURE(*status
)) {
2629 ucol_close(collator
);
2634 result
->ownCollator
= TRUE
;
2638 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2642 U_CAPI UStringSearch
* U_EXPORT2
usearch_openFromCollator(
2643 const UChar
*pattern
,
2644 int32_t patternlength
,
2647 const UCollator
*collator
,
2648 UBreakIterator
*breakiter
,
2651 if (U_FAILURE(*status
)) {
2654 #if UCONFIG_NO_BREAK_ITERATION
2655 if (breakiter
!= NULL
) {
2656 *status
= U_UNSUPPORTED_ERROR
;
2660 if (pattern
== NULL
|| text
== NULL
|| collator
== NULL
) {
2661 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2665 // string search does not really work when numeric collation is turned on
2666 if(ucol_getAttribute(collator
, UCOL_NUMERIC_COLLATION
, status
) == UCOL_ON
) {
2667 *status
= U_UNSUPPORTED_ERROR
;
2671 if (U_SUCCESS(*status
)) {
2672 initializeFCD(status
);
2673 if (U_FAILURE(*status
)) {
2677 UStringSearch
*result
;
2678 if (textlength
== -1) {
2679 textlength
= u_strlen(text
);
2681 if (patternlength
== -1) {
2682 patternlength
= u_strlen(pattern
);
2684 if (textlength
<= 0 || patternlength
<= 0) {
2685 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2689 result
= (UStringSearch
*)uprv_malloc(sizeof(UStringSearch
));
2690 if (result
== NULL
) {
2691 *status
= U_MEMORY_ALLOCATION_ERROR
;
2695 result
->collator
= collator
;
2696 result
->strength
= ucol_getStrength(collator
);
2697 result
->ceMask
= getMask(result
->strength
);
2699 ucol_getAttribute(collator
, UCOL_ALTERNATE_HANDLING
, status
) ==
2701 result
->variableTop
= ucol_getVariableTop(collator
, status
);
2703 result
->nfd
= Normalizer2Factory::getNFDInstance(*status
);
2705 if (U_FAILURE(*status
)) {
2710 result
->search
= (USearch
*)uprv_malloc(sizeof(USearch
));
2711 if (result
->search
== NULL
) {
2712 *status
= U_MEMORY_ALLOCATION_ERROR
;
2717 result
->search
->text
= text
;
2718 result
->search
->textLength
= textlength
;
2720 result
->pattern
.text
= pattern
;
2721 result
->pattern
.textLength
= patternlength
;
2722 result
->pattern
.CE
= NULL
;
2723 result
->pattern
.PCE
= NULL
;
2725 result
->search
->breakIter
= breakiter
;
2726 #if !UCONFIG_NO_BREAK_ITERATION
2727 result
->search
->internalBreakIter
= ubrk_open(UBRK_CHARACTER
, ucol_getLocaleByType(result
->collator
, ULOC_VALID_LOCALE
, status
), text
, textlength
, status
);
2729 ubrk_setText(breakiter
, text
, textlength
, status
);
2733 result
->ownCollator
= FALSE
;
2734 result
->search
->matchedLength
= 0;
2735 result
->search
->matchedIndex
= USEARCH_DONE
;
2736 result
->utilIter
= NULL
;
2737 result
->textIter
= ucol_openElements(collator
, text
,
2738 textlength
, status
);
2739 if (U_FAILURE(*status
)) {
2740 usearch_close(result
);
2744 result
->search
->isOverlap
= FALSE
;
2745 result
->search
->isCanonicalMatch
= FALSE
;
2746 result
->search
->elementComparisonType
= 0;
2747 result
->search
->isForwardSearching
= TRUE
;
2748 result
->search
->reset
= TRUE
;
2750 initialize(result
, status
);
2752 if (U_FAILURE(*status
)) {
2753 usearch_close(result
);
2762 U_CAPI
void U_EXPORT2
usearch_close(UStringSearch
*strsrch
)
2765 if (strsrch
->pattern
.CE
!= strsrch
->pattern
.CEBuffer
&&
2766 strsrch
->pattern
.CE
) {
2767 uprv_free(strsrch
->pattern
.CE
);
2770 if (strsrch
->pattern
.PCE
!= NULL
&&
2771 strsrch
->pattern
.PCE
!= strsrch
->pattern
.PCEBuffer
) {
2772 uprv_free(strsrch
->pattern
.PCE
);
2775 ucol_closeElements(strsrch
->textIter
);
2776 ucol_closeElements(strsrch
->utilIter
);
2778 if (strsrch
->ownCollator
&& strsrch
->collator
) {
2779 ucol_close((UCollator
*)strsrch
->collator
);
2782 #if !UCONFIG_NO_BREAK_ITERATION
2783 if (strsrch
->search
->internalBreakIter
) {
2784 ubrk_close(strsrch
->search
->internalBreakIter
);
2788 uprv_free(strsrch
->search
);
2793 // set and get methods --------------------------------------------------
2795 U_CAPI
void U_EXPORT2
usearch_setOffset(UStringSearch
*strsrch
,
2799 if (U_SUCCESS(*status
) && strsrch
) {
2800 if (isOutOfBounds(strsrch
->search
->textLength
, position
)) {
2801 *status
= U_INDEX_OUTOFBOUNDS_ERROR
;
2804 setColEIterOffset(strsrch
->textIter
, position
);
2806 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
2807 strsrch
->search
->matchedLength
= 0;
2808 strsrch
->search
->reset
= FALSE
;
2812 U_CAPI
int32_t U_EXPORT2
usearch_getOffset(const UStringSearch
*strsrch
)
2815 int32_t result
= ucol_getOffset(strsrch
->textIter
);
2816 if (isOutOfBounds(strsrch
->search
->textLength
, result
)) {
2817 return USEARCH_DONE
;
2821 return USEARCH_DONE
;
2824 U_CAPI
void U_EXPORT2
usearch_setAttribute(UStringSearch
*strsrch
,
2825 USearchAttribute attribute
,
2826 USearchAttributeValue value
,
2829 if (U_SUCCESS(*status
) && strsrch
) {
2832 case USEARCH_OVERLAP
:
2833 strsrch
->search
->isOverlap
= (value
== USEARCH_ON
? TRUE
: FALSE
);
2835 case USEARCH_CANONICAL_MATCH
:
2836 strsrch
->search
->isCanonicalMatch
= (value
== USEARCH_ON
? TRUE
:
2839 case USEARCH_ELEMENT_COMPARISON
:
2840 if (value
== USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD
|| value
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
) {
2841 strsrch
->search
->elementComparisonType
= (int16_t)value
;
2843 strsrch
->search
->elementComparisonType
= 0;
2846 case USEARCH_ATTRIBUTE_COUNT
:
2848 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2851 if (value
== USEARCH_ATTRIBUTE_VALUE_COUNT
) {
2852 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2856 U_CAPI USearchAttributeValue U_EXPORT2
usearch_getAttribute(
2857 const UStringSearch
*strsrch
,
2858 USearchAttribute attribute
)
2861 switch (attribute
) {
2862 case USEARCH_OVERLAP
:
2863 return (strsrch
->search
->isOverlap
== TRUE
? USEARCH_ON
:
2865 case USEARCH_CANONICAL_MATCH
:
2866 return (strsrch
->search
->isCanonicalMatch
== TRUE
? USEARCH_ON
:
2868 case USEARCH_ELEMENT_COMPARISON
:
2870 int16_t value
= strsrch
->search
->elementComparisonType
;
2871 if (value
== USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD
|| value
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
) {
2872 return (USearchAttributeValue
)value
;
2874 return USEARCH_STANDARD_ELEMENT_COMPARISON
;
2877 case USEARCH_ATTRIBUTE_COUNT
:
2878 return USEARCH_DEFAULT
;
2881 return USEARCH_DEFAULT
;
2884 U_CAPI
int32_t U_EXPORT2
usearch_getMatchedStart(
2885 const UStringSearch
*strsrch
)
2887 if (strsrch
== NULL
) {
2888 return USEARCH_DONE
;
2890 return strsrch
->search
->matchedIndex
;
2894 U_CAPI
int32_t U_EXPORT2
usearch_getMatchedText(const UStringSearch
*strsrch
,
2896 int32_t resultCapacity
,
2899 if (U_FAILURE(*status
)) {
2900 return USEARCH_DONE
;
2902 if (strsrch
== NULL
|| resultCapacity
< 0 || (resultCapacity
> 0 &&
2904 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2905 return USEARCH_DONE
;
2908 int32_t copylength
= strsrch
->search
->matchedLength
;
2909 int32_t copyindex
= strsrch
->search
->matchedIndex
;
2910 if (copyindex
== USEARCH_DONE
) {
2911 u_terminateUChars(result
, resultCapacity
, 0, status
);
2912 return USEARCH_DONE
;
2915 if (resultCapacity
< copylength
) {
2916 copylength
= resultCapacity
;
2918 if (copylength
> 0) {
2919 uprv_memcpy(result
, strsrch
->search
->text
+ copyindex
,
2920 copylength
* sizeof(UChar
));
2922 return u_terminateUChars(result
, resultCapacity
,
2923 strsrch
->search
->matchedLength
, status
);
2926 U_CAPI
int32_t U_EXPORT2
usearch_getMatchedLength(
2927 const UStringSearch
*strsrch
)
2930 return strsrch
->search
->matchedLength
;
2932 return USEARCH_DONE
;
2935 #if !UCONFIG_NO_BREAK_ITERATION
2937 U_CAPI
void U_EXPORT2
usearch_setBreakIterator(UStringSearch
*strsrch
,
2938 UBreakIterator
*breakiter
,
2941 if (U_SUCCESS(*status
) && strsrch
) {
2942 strsrch
->search
->breakIter
= breakiter
;
2944 ubrk_setText(breakiter
, strsrch
->search
->text
,
2945 strsrch
->search
->textLength
, status
);
2950 U_CAPI
const UBreakIterator
* U_EXPORT2
2951 usearch_getBreakIterator(const UStringSearch
*strsrch
)
2954 return strsrch
->search
->breakIter
;
2961 U_CAPI
void U_EXPORT2
usearch_setText( UStringSearch
*strsrch
,
2966 if (U_SUCCESS(*status
)) {
2967 if (strsrch
== NULL
|| text
== NULL
|| textlength
< -1 ||
2969 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2972 if (textlength
== -1) {
2973 textlength
= u_strlen(text
);
2975 strsrch
->search
->text
= text
;
2976 strsrch
->search
->textLength
= textlength
;
2977 ucol_setText(strsrch
->textIter
, text
, textlength
, status
);
2978 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
2979 strsrch
->search
->matchedLength
= 0;
2980 strsrch
->search
->reset
= TRUE
;
2981 #if !UCONFIG_NO_BREAK_ITERATION
2982 if (strsrch
->search
->breakIter
!= NULL
) {
2983 ubrk_setText(strsrch
->search
->breakIter
, text
,
2984 textlength
, status
);
2986 ubrk_setText(strsrch
->search
->internalBreakIter
, text
, textlength
, status
);
2992 U_CAPI
const UChar
* U_EXPORT2
usearch_getText(const UStringSearch
*strsrch
,
2996 *length
= strsrch
->search
->textLength
;
2997 return strsrch
->search
->text
;
3002 U_CAPI
void U_EXPORT2
usearch_setCollator( UStringSearch
*strsrch
,
3003 const UCollator
*collator
,
3006 if (U_SUCCESS(*status
)) {
3007 if (collator
== NULL
) {
3008 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
3013 if (strsrch
->ownCollator
&& (strsrch
->collator
!= collator
)) {
3014 ucol_close((UCollator
*)strsrch
->collator
);
3015 strsrch
->ownCollator
= FALSE
;
3017 strsrch
->collator
= collator
;
3018 strsrch
->strength
= ucol_getStrength(collator
);
3019 strsrch
->ceMask
= getMask(strsrch
->strength
);
3020 #if !UCONFIG_NO_BREAK_ITERATION
3021 ubrk_close(strsrch
->search
->internalBreakIter
);
3022 strsrch
->search
->internalBreakIter
= ubrk_open(UBRK_CHARACTER
, ucol_getLocaleByType(collator
, ULOC_VALID_LOCALE
, status
),
3023 strsrch
->search
->text
, strsrch
->search
->textLength
, status
);
3025 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3027 ucol_getAttribute(collator
, UCOL_ALTERNATE_HANDLING
, status
) ==
3029 // if status is a failure, ucol_getVariableTop returns 0
3030 strsrch
->variableTop
= ucol_getVariableTop(collator
, status
);
3031 if (U_SUCCESS(*status
)) {
3032 initialize(strsrch
, status
);
3033 if (U_SUCCESS(*status
)) {
3034 /* free offset buffer to avoid memory leak before initializing. */
3035 ucol_freeOffsetBuffer(&(strsrch
->textIter
->iteratordata_
));
3036 uprv_init_collIterate(collator
, strsrch
->search
->text
,
3037 strsrch
->search
->textLength
,
3038 &(strsrch
->textIter
->iteratordata_
),
3040 strsrch
->utilIter
->iteratordata_
.coll
= collator
;
3045 // **** are these calls needed?
3046 // **** we call uprv_init_pce in initializePatternPCETable
3047 // **** and the CEBuffer constructor...
3049 uprv_init_pce(strsrch
->textIter
);
3050 uprv_init_pce(strsrch
->utilIter
);
3055 U_CAPI UCollator
* U_EXPORT2
usearch_getCollator(const UStringSearch
*strsrch
)
3058 return (UCollator
*)strsrch
->collator
;
3063 U_CAPI
void U_EXPORT2
usearch_setPattern( UStringSearch
*strsrch
,
3064 const UChar
*pattern
,
3065 int32_t patternlength
,
3068 if (U_SUCCESS(*status
)) {
3069 if (strsrch
== NULL
|| pattern
== NULL
) {
3070 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
3073 if (patternlength
== -1) {
3074 patternlength
= u_strlen(pattern
);
3076 if (patternlength
== 0) {
3077 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
3080 strsrch
->pattern
.text
= pattern
;
3081 strsrch
->pattern
.textLength
= patternlength
;
3082 initialize(strsrch
, status
);
3087 U_CAPI
const UChar
* U_EXPORT2
3088 usearch_getPattern(const UStringSearch
*strsrch
,
3092 *length
= strsrch
->pattern
.textLength
;
3093 return strsrch
->pattern
.text
;
3098 // miscellanous methods --------------------------------------------------
3100 U_CAPI
int32_t U_EXPORT2
usearch_first(UStringSearch
*strsrch
,
3103 if (strsrch
&& U_SUCCESS(*status
)) {
3104 strsrch
->search
->isForwardSearching
= TRUE
;
3105 usearch_setOffset(strsrch
, 0, status
);
3106 if (U_SUCCESS(*status
)) {
3107 return usearch_next(strsrch
, status
);
3110 return USEARCH_DONE
;
3113 U_CAPI
int32_t U_EXPORT2
usearch_following(UStringSearch
*strsrch
,
3117 if (strsrch
&& U_SUCCESS(*status
)) {
3118 strsrch
->search
->isForwardSearching
= TRUE
;
3119 // position checked in usearch_setOffset
3120 usearch_setOffset(strsrch
, position
, status
);
3121 if (U_SUCCESS(*status
)) {
3122 return usearch_next(strsrch
, status
);
3125 return USEARCH_DONE
;
3128 U_CAPI
int32_t U_EXPORT2
usearch_last(UStringSearch
*strsrch
,
3131 if (strsrch
&& U_SUCCESS(*status
)) {
3132 strsrch
->search
->isForwardSearching
= FALSE
;
3133 usearch_setOffset(strsrch
, strsrch
->search
->textLength
, status
);
3134 if (U_SUCCESS(*status
)) {
3135 return usearch_previous(strsrch
, status
);
3138 return USEARCH_DONE
;
3141 U_CAPI
int32_t U_EXPORT2
usearch_preceding(UStringSearch
*strsrch
,
3145 if (strsrch
&& U_SUCCESS(*status
)) {
3146 strsrch
->search
->isForwardSearching
= FALSE
;
3147 // position checked in usearch_setOffset
3148 usearch_setOffset(strsrch
, position
, status
);
3149 if (U_SUCCESS(*status
)) {
3150 return usearch_previous(strsrch
, status
);
3153 return USEARCH_DONE
;
3157 * If a direction switch is required, we'll count the number of ces till the
3158 * beginning of the collation element iterator and iterate forwards that
3159 * number of times. This is so that we get to the correct point within the
3160 * string to continue the search in. Imagine when we are in the middle of the
3161 * normalization buffer when the change in direction is request. arrrgghh....
3162 * After searching the offset within the collation element iterator will be
3163 * shifted to the start of the match. If a match is not found, the offset would
3164 * have been set to the end of the text string in the collation element
3166 * Okay, here's my take on normalization buffer. The only time when there can
3167 * be 2 matches within the same normalization is when the pattern is consists
3168 * of all accents. But since the offset returned is from the text string, we
3169 * should not confuse the caller by returning the second match within the
3170 * same normalization buffer. If we do, the 2 results will have the same match
3171 * offsets, and that'll be confusing. I'll return the next match that doesn't
3172 * fall within the same normalization buffer. Note this does not affect the
3173 * results of matches spanning the text and the normalization buffer.
3174 * The position to start searching is taken from the collation element
3175 * iterator. Callers of this API would have to set the offset in the collation
3176 * element iterator before using this method.
3178 U_CAPI
int32_t U_EXPORT2
usearch_next(UStringSearch
*strsrch
,
3181 if (U_SUCCESS(*status
) && strsrch
) {
3182 // note offset is either equivalent to the start of the previous match
3183 // or is set by the user
3184 int32_t offset
= usearch_getOffset(strsrch
);
3185 USearch
*search
= strsrch
->search
;
3186 search
->reset
= FALSE
;
3187 int32_t textlength
= search
->textLength
;
3188 if (search
->isForwardSearching
) {
3190 if (offset
== textlength
3191 || (!search
->isOverlap
&&
3192 (offset
+ strsrch
->pattern
.defaultShiftSize
> textlength
||
3193 (search
->matchedIndex
!= USEARCH_DONE
&&
3194 offset
+ search
->matchedLength
>= textlength
)))) {
3195 // not enough characters to match
3196 setMatchNotFound(strsrch
);
3197 return USEARCH_DONE
;
3200 if (offset
== textlength
||
3201 (! search
->isOverlap
&&
3202 (search
->matchedIndex
!= USEARCH_DONE
&&
3203 offset
+ search
->matchedLength
> textlength
))) {
3204 // not enough characters to match
3205 setMatchNotFound(strsrch
);
3206 return USEARCH_DONE
;
3211 // switching direction.
3212 // if matchedIndex == USEARCH_DONE, it means that either a
3213 // setOffset has been called or that previous ran off the text
3214 // string. the iterator would have been set to offset 0 if a
3215 // match is not found.
3216 search
->isForwardSearching
= TRUE
;
3217 if (search
->matchedIndex
!= USEARCH_DONE
) {
3218 // there's no need to set the collation element iterator
3219 // the next call to next will set the offset.
3220 return search
->matchedIndex
;
3224 if (U_SUCCESS(*status
)) {
3225 if (strsrch
->pattern
.CELength
== 0) {
3226 if (search
->matchedIndex
== USEARCH_DONE
) {
3227 search
->matchedIndex
= offset
;
3229 else { // moves by codepoints
3230 U16_FWD_1(search
->text
, search
->matchedIndex
, textlength
);
3233 search
->matchedLength
= 0;
3234 setColEIterOffset(strsrch
->textIter
, search
->matchedIndex
);
3235 // status checked below
3236 if (search
->matchedIndex
== textlength
) {
3237 search
->matchedIndex
= USEARCH_DONE
;
3241 if (search
->matchedLength
> 0) {
3242 // if matchlength is 0 we are at the start of the iteration
3243 if (search
->isOverlap
) {
3244 ucol_setOffset(strsrch
->textIter
, offset
+ 1, status
);
3247 ucol_setOffset(strsrch
->textIter
,
3248 offset
+ search
->matchedLength
, status
);
3252 // for boundary check purposes. this will ensure that the
3253 // next match will not preceed the current offset
3254 // note search->matchedIndex will always be set to something
3256 search
->matchedIndex
= offset
- 1;
3259 if (search
->isCanonicalMatch
) {
3260 // can't use exact here since extra accents are allowed.
3261 usearch_handleNextCanonical(strsrch
, status
);
3264 usearch_handleNextExact(strsrch
, status
);
3268 if (U_FAILURE(*status
)) {
3269 return USEARCH_DONE
;
3273 if (search
->matchedIndex
== USEARCH_DONE
) {
3274 ucol_setOffset(strsrch
->textIter
, search
->textLength
, status
);
3276 ucol_setOffset(strsrch
->textIter
, search
->matchedIndex
, status
);
3280 return search
->matchedIndex
;
3283 return USEARCH_DONE
;
3286 U_CAPI
int32_t U_EXPORT2
usearch_previous(UStringSearch
*strsrch
,
3289 if (U_SUCCESS(*status
) && strsrch
) {
3291 USearch
*search
= strsrch
->search
;
3292 if (search
->reset
) {
3293 offset
= search
->textLength
;
3294 search
->isForwardSearching
= FALSE
;
3295 search
->reset
= FALSE
;
3296 setColEIterOffset(strsrch
->textIter
, offset
);
3299 offset
= usearch_getOffset(strsrch
);
3302 int32_t matchedindex
= search
->matchedIndex
;
3303 if (search
->isForwardSearching
== TRUE
) {
3304 // switching direction.
3305 // if matchedIndex == USEARCH_DONE, it means that either a
3306 // setOffset has been called or that next ran off the text
3307 // string. the iterator would have been set to offset textLength if
3308 // a match is not found.
3309 search
->isForwardSearching
= FALSE
;
3310 if (matchedindex
!= USEARCH_DONE
) {
3311 return matchedindex
;
3316 if (offset
== 0 || matchedindex
== 0 ||
3317 (!search
->isOverlap
&&
3318 (offset
< strsrch
->pattern
.defaultShiftSize
||
3319 (matchedindex
!= USEARCH_DONE
&&
3320 matchedindex
< strsrch
->pattern
.defaultShiftSize
)))) {
3321 // not enough characters to match
3322 setMatchNotFound(strsrch
);
3323 return USEARCH_DONE
;
3326 // Could check pattern length, but the
3327 // linear search will do the right thing
3328 if (offset
== 0 || matchedindex
== 0) {
3329 setMatchNotFound(strsrch
);
3330 return USEARCH_DONE
;
3335 if (U_SUCCESS(*status
)) {
3336 if (strsrch
->pattern
.CELength
== 0) {
3337 search
->matchedIndex
=
3338 (matchedindex
== USEARCH_DONE
? offset
: matchedindex
);
3339 if (search
->matchedIndex
== 0) {
3340 setMatchNotFound(strsrch
);
3341 // status checked below
3343 else { // move by codepoints
3344 U16_BACK_1(search
->text
, 0, search
->matchedIndex
);
3345 setColEIterOffset(strsrch
->textIter
, search
->matchedIndex
);
3346 // status checked below
3347 search
->matchedLength
= 0;
3351 if (strsrch
->search
->isCanonicalMatch
) {
3352 // can't use exact here since extra accents are allowed.
3353 usearch_handlePreviousCanonical(strsrch
, status
);
3354 // status checked below
3357 usearch_handlePreviousExact(strsrch
, status
);
3358 // status checked below
3362 if (U_FAILURE(*status
)) {
3363 return USEARCH_DONE
;
3366 return search
->matchedIndex
;
3369 return USEARCH_DONE
;
3374 U_CAPI
void U_EXPORT2
usearch_reset(UStringSearch
*strsrch
)
3377 reset is setting the attributes that are already in
3378 string search, hence all attributes in the collator should
3379 be retrieved without any problems
3382 UErrorCode status
= U_ZERO_ERROR
;
3383 UBool sameCollAttribute
= TRUE
;
3388 // **** hack to deal w/ how processed CEs encode quaternary ****
3389 UCollationStrength newStrength
= ucol_getStrength(strsrch
->collator
);
3390 if ((strsrch
->strength
< UCOL_QUATERNARY
&& newStrength
>= UCOL_QUATERNARY
) ||
3391 (strsrch
->strength
>= UCOL_QUATERNARY
&& newStrength
< UCOL_QUATERNARY
)) {
3392 sameCollAttribute
= FALSE
;
3395 strsrch
->strength
= ucol_getStrength(strsrch
->collator
);
3396 ceMask
= getMask(strsrch
->strength
);
3397 if (strsrch
->ceMask
!= ceMask
) {
3398 strsrch
->ceMask
= ceMask
;
3399 sameCollAttribute
= FALSE
;
3402 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3403 shift
= ucol_getAttribute(strsrch
->collator
, UCOL_ALTERNATE_HANDLING
,
3404 &status
) == UCOL_SHIFTED
;
3405 if (strsrch
->toShift
!= shift
) {
3406 strsrch
->toShift
= shift
;
3407 sameCollAttribute
= FALSE
;
3410 // if status is a failure, ucol_getVariableTop returns 0
3411 varTop
= ucol_getVariableTop(strsrch
->collator
, &status
);
3412 if (strsrch
->variableTop
!= varTop
) {
3413 strsrch
->variableTop
= varTop
;
3414 sameCollAttribute
= FALSE
;
3416 if (!sameCollAttribute
) {
3417 initialize(strsrch
, &status
);
3419 /* free offset buffer to avoid memory leak before initializing. */
3420 ucol_freeOffsetBuffer(&(strsrch
->textIter
->iteratordata_
));
3421 uprv_init_collIterate(strsrch
->collator
, strsrch
->search
->text
,
3422 strsrch
->search
->textLength
,
3423 &(strsrch
->textIter
->iteratordata_
),
3425 strsrch
->search
->matchedLength
= 0;
3426 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
3427 strsrch
->search
->isOverlap
= FALSE
;
3428 strsrch
->search
->isCanonicalMatch
= FALSE
;
3429 strsrch
->search
->elementComparisonType
= 0;
3430 strsrch
->search
->isForwardSearching
= TRUE
;
3431 strsrch
->search
->reset
= TRUE
;
3436 // CEI Collation Element + source text index.
3437 // These structs are kept in the circular buffer.
3449 // CEBuffer A circular buffer of CEs from the text being searched.
3451 #define DEFAULT_CEBUFFER_SIZE 96
3452 #define CEBUFFER_EXTRA 32
3453 // Some typical max values to make buffer size more reasonable for asymmetric search.
3454 // #8694 is for a better long-term solution to allocation of this buffer.
3455 #define MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
3456 #define MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
3457 #define MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
3459 CEI defBuf
[DEFAULT_CEBUFFER_SIZE
];
3464 UCollationElements
*ceIter
;
3465 UStringSearch
*strSearch
;
3469 CEBuffer(UStringSearch
*ss
, UErrorCode
*status
);
3471 const CEI
*get(int32_t index
);
3472 const CEI
*getPrevious(int32_t index
);
3476 CEBuffer::CEBuffer(UStringSearch
*ss
, UErrorCode
*status
) {
3479 bufSize
= ss
->pattern
.PCELength
+ CEBUFFER_EXTRA
;
3480 if (ss
->search
->elementComparisonType
!= 0) {
3481 const UChar
* patText
= ss
->pattern
.text
;
3483 const UChar
* patTextLimit
= patText
+ ss
->pattern
.textLength
;
3484 while ( patText
< patTextLimit
) {
3485 UChar c
= *patText
++;
3486 if (MIGHT_BE_JAMO_L(c
)) {
3487 bufSize
+= MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L
;
3489 // No check for surrogates, we might allocate slightly more buffer than necessary.
3490 bufSize
+= MAX_TARGET_IGNORABLES_PER_PAT_OTHER
;
3495 ceIter
= ss
->textIter
;
3499 uprv_init_pce(ceIter
);
3501 if (bufSize
>DEFAULT_CEBUFFER_SIZE
) {
3502 buf
= (CEI
*)uprv_malloc(bufSize
* sizeof(CEI
));
3504 *status
= U_MEMORY_ALLOCATION_ERROR
;
3509 // TODO: add a reset or init function so that allocated
3510 // buffers can be retained & reused.
3512 CEBuffer::~CEBuffer() {
3513 if (buf
!= defBuf
) {
3519 // Get the CE with the specified index.
3520 // Index must be in the range
3521 // n-history_size < index < n+1
3522 // where n is the largest index to have been fetched by some previous call to this function.
3523 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3525 const CEI
*CEBuffer::get(int32_t index
) {
3526 int i
= index
% bufSize
;
3528 if (index
>=firstIx
&& index
<limitIx
) {
3529 // The request was for an entry already in our buffer.
3534 // Caller is requesting a new, never accessed before, CE.
3535 // Verify that it is the next one in sequence, which is all
3537 if (index
!= limitIx
) {
3543 // Manage the circular CE buffer indexing
3546 if (limitIx
- firstIx
>= bufSize
) {
3547 // The buffer is full, knock out the lowest-indexed entry.
3551 UErrorCode status
= U_ZERO_ERROR
;
3553 buf
[i
].ce
= ucol_nextProcessed(ceIter
, &buf
[i
].lowIndex
, &buf
[i
].highIndex
, &status
);
3558 // Get the CE with the specified index.
3559 // Index must be in the range
3560 // n-history_size < index < n+1
3561 // where n is the largest index to have been fetched by some previous call to this function.
3562 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3564 const CEI
*CEBuffer::getPrevious(int32_t index
) {
3565 int i
= index
% bufSize
;
3567 if (index
>=firstIx
&& index
<limitIx
) {
3568 // The request was for an entry already in our buffer.
3573 // Caller is requesting a new, never accessed before, CE.
3574 // Verify that it is the next one in sequence, which is all
3576 if (index
!= limitIx
) {
3582 // Manage the circular CE buffer indexing
3585 if (limitIx
- firstIx
>= bufSize
) {
3586 // The buffer is full, knock out the lowest-indexed entry.
3590 UErrorCode status
= U_ZERO_ERROR
;
3592 buf
[i
].ce
= ucol_previousProcessed(ceIter
, &buf
[i
].lowIndex
, &buf
[i
].highIndex
, &status
);
3600 // #define USEARCH_DEBUG
3602 #ifdef USEARCH_DEBUG
3608 * Find the next break boundary after startIndex. If the UStringSearch object
3609 * has an external break iterator, use that. Otherwise use the internal character
3612 static int32_t nextBoundaryAfter(UStringSearch
*strsrch
, int32_t startIndex
) {
3614 const UChar
*text
= strsrch
->search
->text
;
3615 int32_t textLen
= strsrch
->search
->textLength
;
3617 U_ASSERT(startIndex
>=0);
3618 U_ASSERT(startIndex
<=textLen
);
3620 if (startIndex
>= textLen
) {
3625 int32_t i
= startIndex
;
3626 U16_NEXT(text
, i
, textLen
, c
);
3628 // If we are on a control character, stop without looking for combining marks.
3629 // Control characters do not combine.
3630 int32_t gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
3631 if (gcProperty
==U_GCB_CONTROL
|| gcProperty
==U_GCB_LF
|| gcProperty
==U_GCB_CR
) {
3635 // The initial character was not a control, and can thus accept trailing
3636 // combining characters. Advance over however many of them there are.
3637 int32_t indexOfLastCharChecked
;
3639 indexOfLastCharChecked
= i
;
3643 U16_NEXT(text
, i
, textLen
, c
);
3644 gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
3645 if (gcProperty
!= U_GCB_EXTEND
&& gcProperty
!= U_GCB_SPACING_MARK
) {
3649 return indexOfLastCharChecked
;
3650 #elif !UCONFIG_NO_BREAK_ITERATION
3651 UBreakIterator
*breakiterator
= strsrch
->search
->breakIter
;
3653 if (breakiterator
== NULL
) {
3654 breakiterator
= strsrch
->search
->internalBreakIter
;
3657 if (breakiterator
!= NULL
) {
3658 return ubrk_following(breakiterator
, startIndex
);
3663 // **** or should we use the original code? ****
3670 * Returns TRUE if index is on a break boundary. If the UStringSearch
3671 * has an external break iterator, test using that, otherwise test
3672 * using the internal character break iterator.
3674 static UBool
isBreakBoundary(UStringSearch
*strsrch
, int32_t index
) {
3676 const UChar
*text
= strsrch
->search
->text
;
3677 int32_t textLen
= strsrch
->search
->textLength
;
3680 U_ASSERT(index
<=textLen
);
3682 if (index
>=textLen
|| index
<=0) {
3686 // If the character at the current index is not a GRAPHEME_EXTEND
3687 // then we can not be within a combining sequence.
3689 U16_GET(text
, 0, index
, textLen
, c
);
3690 int32_t gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
3691 if (gcProperty
!= U_GCB_EXTEND
&& gcProperty
!= U_GCB_SPACING_MARK
) {
3695 // We are at a combining mark. If the preceding character is anything
3696 // except a CONTROL, CR or LF, we are in a combining sequence.
3697 U16_PREV(text
, 0, index
, c
);
3698 gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
3699 UBool combining
= !(gcProperty
==U_GCB_CONTROL
|| gcProperty
==U_GCB_LF
|| gcProperty
==U_GCB_CR
);
3701 #elif !UCONFIG_NO_BREAK_ITERATION
3702 UBreakIterator
*breakiterator
= strsrch
->search
->breakIter
;
3704 if (breakiterator
== NULL
) {
3705 breakiterator
= strsrch
->search
->internalBreakIter
;
3708 return (breakiterator
!= NULL
&& ubrk_isBoundary(breakiterator
, index
));
3710 // **** or use the original code? ****
3716 static UBool
onBreakBoundaries(const UStringSearch
*strsrch
, int32_t start
, int32_t end
)
3718 #if !UCONFIG_NO_BREAK_ITERATION
3719 UBreakIterator
*breakiterator
= strsrch
->search
->breakIter
;
3721 if (breakiterator
!= NULL
) {
3722 int32_t startindex
= ubrk_first(breakiterator
);
3723 int32_t endindex
= ubrk_last(breakiterator
);
3725 // out-of-range indexes are never boundary positions
3726 if (start
< startindex
|| start
> endindex
||
3727 end
< startindex
|| end
> endindex
) {
3731 return ubrk_isBoundary(breakiterator
, start
) &&
3732 ubrk_isBoundary(breakiterator
, end
);
3745 } UCompareCEsResult
;
3746 #define U_CE_LEVEL2_BASE 0x00000005
3747 #define U_CE_LEVEL3_BASE 0x00050000
3749 static UCompareCEsResult
compareCE64s(int64_t targCE
, int64_t patCE
, int16_t compareType
) {
3750 if (targCE
== patCE
) {
3753 if (compareType
== 0) {
3754 return U_CE_NO_MATCH
;
3757 int64_t targCEshifted
= targCE
>> 32;
3758 int64_t patCEshifted
= patCE
>> 32;
3762 int32_t targLev1
= (int32_t)(targCEshifted
& mask
);
3763 int32_t patLev1
= (int32_t)(patCEshifted
& mask
);
3764 if ( targLev1
!= patLev1
) {
3765 if ( targLev1
== 0 ) {
3766 return U_CE_SKIP_TARG
;
3768 if ( patLev1
== 0 && compareType
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
) {
3769 return U_CE_SKIP_PATN
;
3771 return U_CE_NO_MATCH
;
3775 int32_t targLev2
= (int32_t)(targCEshifted
& mask
);
3776 int32_t patLev2
= (int32_t)(patCEshifted
& mask
);
3777 if ( targLev2
!= patLev2
) {
3778 if ( targLev2
== 0 ) {
3779 return U_CE_SKIP_TARG
;
3781 if ( patLev2
== 0 && compareType
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
) {
3782 return U_CE_SKIP_PATN
;
3784 return (patLev2
== U_CE_LEVEL2_BASE
|| (compareType
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
&& targLev2
== U_CE_LEVEL2_BASE
) )?
3785 U_CE_MATCH
: U_CE_NO_MATCH
;
3789 int32_t targLev3
= (int32_t)(targCE
& mask
);
3790 int32_t patLev3
= (int32_t)(patCE
& mask
);
3791 if ( targLev3
!= patLev3
) {
3792 return (patLev3
== U_CE_LEVEL3_BASE
|| (compareType
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
&& targLev3
== U_CE_LEVEL3_BASE
) )?
3793 U_CE_MATCH
: U_CE_NO_MATCH
;
3800 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
3803 U_CAPI UBool U_EXPORT2
usearch_search(UStringSearch
*strsrch
,
3805 int32_t *matchStart
,
3806 int32_t *matchLimit
,
3809 if (U_FAILURE(*status
)) {
3813 // TODO: reject search patterns beginning with a combining char.
3815 #ifdef USEARCH_DEBUG
3816 if (getenv("USEARCH_DEBUG") != NULL
) {
3817 printf("Pattern CEs\n");
3818 for (int ii
=0; ii
<strsrch
->pattern
.CELength
; ii
++) {
3819 printf(" %8x", strsrch
->pattern
.CE
[ii
]);
3825 // Input parameter sanity check.
3826 // TODO: should input indicies clip to the text length
3827 // in the same way that UText does.
3828 if(strsrch
->pattern
.CELength
== 0 ||
3830 startIdx
> strsrch
->search
->textLength
||
3831 strsrch
->pattern
.CE
== NULL
) {
3832 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
3836 if (strsrch
->pattern
.PCE
== NULL
) {
3837 initializePatternPCETable(strsrch
, status
);
3840 ucol_setOffset(strsrch
->textIter
, startIdx
, status
);
3841 CEBuffer
ceb(strsrch
, status
);
3844 int32_t targetIx
= 0;
3845 const CEI
*targetCEI
= NULL
;
3849 int32_t mStart
= -1;
3850 int32_t mLimit
= -1;
3856 // Outer loop moves over match starting positions in the
3858 // Here we see the target as a sequence of collation elements, resulting from the following:
3859 // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
3860 // (for example, digraphs such as IJ may be broken into two characters).
3861 // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
3862 // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
3863 // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
3864 // CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
3865 // then the CE is deleted, so the following code sees only CEs that are relevant.
3866 // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
3867 // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
3868 // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
3870 for(targetIx
=0; ; targetIx
++)
3873 // Inner loop checks for a match beginning at each
3874 // position from the outer loop.
3875 int32_t targetIxOffset
= 0;
3877 // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
3878 // (compared to the last CE fetched for the previous targetIx value) as we need to go
3879 // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
3880 const CEI
*firstCEI
= ceb
.get(targetIx
);
3881 if (firstCEI
== NULL
) {
3882 *status
= U_INTERNAL_PROGRAM_ERROR
;
3887 for (patIx
=0; patIx
<strsrch
->pattern
.PCELength
; patIx
++) {
3888 patCE
= strsrch
->pattern
.PCE
[patIx
];
3889 targetCEI
= ceb
.get(targetIx
+patIx
+targetIxOffset
);
3890 // Compare CE from target string with CE from the pattern.
3891 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
3892 // which will fail the compare, below.
3893 UCompareCEsResult ceMatch
= compareCE64s(targetCEI
->ce
, patCE
, strsrch
->search
->elementComparisonType
);
3894 if ( ceMatch
== U_CE_NO_MATCH
) {
3897 } else if ( ceMatch
> U_CE_NO_MATCH
) {
3898 if ( ceMatch
== U_CE_SKIP_TARG
) {
3899 // redo with same patCE, next targCE
3902 } else { // ceMatch == U_CE_SKIP_PATN
3903 // redo with same targCE, next patCE
3908 targetIxOffset
+= strsrch
->pattern
.PCELength
; // this is now the offset in target CE space to end of the match so far
3910 if (!found
&& ((targetCEI
== NULL
) || (targetCEI
->ce
!= UCOL_PROCESSED_NULLORDER
))) {
3911 // No match at this targetIx. Try again at the next.
3916 // No match at all, we have run off the end of the target text.
3921 // We have found a match in CE space.
3922 // Now determine the bounds in string index space.
3923 // There still is a chance of match failure if the CE range not correspond to
3924 // an acceptable character range.
3926 const CEI
*lastCEI
= ceb
.get(targetIx
+ targetIxOffset
- 1);
3928 mStart
= firstCEI
->lowIndex
;
3929 minLimit
= lastCEI
->lowIndex
;
3931 // Look at the CE following the match. If it is UCOL_NULLORDER the match
3932 // extended to the end of input, and the match is good.
3934 // Look at the high and low indices of the CE following the match. If
3935 // they are the same it means one of two things:
3936 // 1. The match extended to the last CE from the target text, which is OK, or
3937 // 2. The last CE that was part of the match is in an expansion that extends
3938 // to the first CE after the match. In this case, we reject the match.
3939 const CEI
*nextCEI
= 0;
3940 if (strsrch
->search
->elementComparisonType
== 0) {
3941 nextCEI
= ceb
.get(targetIx
+ targetIxOffset
);
3942 maxLimit
= nextCEI
->lowIndex
;
3943 if (nextCEI
->lowIndex
== nextCEI
->highIndex
&& nextCEI
->ce
!= UCOL_PROCESSED_NULLORDER
) {
3947 for ( ; ; ++targetIxOffset
) {
3948 nextCEI
= ceb
.get(targetIx
+ targetIxOffset
);
3949 maxLimit
= nextCEI
->lowIndex
;
3950 // If we are at the end of the target too, match succeeds
3951 if ( nextCEI
->ce
== UCOL_PROCESSED_NULLORDER
) {
3954 // As long as the next CE has primary weight of 0,
3955 // it is part of the last target element matched by the pattern;
3956 // make sure it can be part of a match with the last patCE
3957 if ( (((nextCEI
->ce
) >> 32) & 0xFFFF0000UL
) == 0 ) {
3958 UCompareCEsResult ceMatch
= compareCE64s(nextCEI
->ce
, patCE
, strsrch
->search
->elementComparisonType
);
3959 if ( ceMatch
== U_CE_NO_MATCH
|| ceMatch
== U_CE_SKIP_PATN
) {
3963 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
3964 // target element, but it has non-zero primary weight => match fails
3965 } else if ( nextCEI
->lowIndex
== nextCEI
->highIndex
) {
3968 // Else the target CE is not part of an expansion of the last matched element, match succeeds
3976 // Check for the start of the match being within a combining sequence.
3977 // This can happen if the pattern itself begins with a combining char, and
3978 // the match found combining marks in the target text that were attached
3979 // to something else.
3980 // This type of match should be rejected for not completely consuming a
3981 // combining sequence.
3982 if (!isBreakBoundary(strsrch
, mStart
)) {
3986 // Check for the start of the match being within an Collation Element Expansion,
3987 // meaning that the first char of the match is only partially matched.
3988 // With exapnsions, the first CE will report the index of the source
3989 // character, and all subsequent (expansions) CEs will report the source index of the
3990 // _following_ character.
3991 int32_t secondIx
= firstCEI
->highIndex
;
3992 if (mStart
== secondIx
) {
3996 // Advance the match end position to the first acceptable match boundary.
3997 // This advances the index over any combining charcters.
3999 if (minLimit
< maxLimit
) {
4000 // When the last CE's low index is same with its high index, the CE is likely
4001 // a part of expansion. In this case, the index is located just after the
4002 // character corresponding to the CEs compared above. If the index is right
4003 // at the break boundary, move the position to the next boundary will result
4004 // incorrect match length when there are ignorable characters exist between
4005 // the position and the next character produces CE(s). See ticket#8482.
4006 if (minLimit
== lastCEI
->highIndex
&& isBreakBoundary(strsrch
, minLimit
)) {
4009 int32_t nba
= nextBoundaryAfter(strsrch
, minLimit
);
4010 if (nba
>= lastCEI
->highIndex
) {
4016 #ifdef USEARCH_DEBUG
4017 if (getenv("USEARCH_DEBUG") != NULL
) {
4018 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit
, maxLimit
, mLimit
);
4022 // If advancing to the end of a combining sequence in character indexing space
4023 // advanced us beyond the end of the match in CE space, reject this match.
4024 if (mLimit
> maxLimit
) {
4028 if (!isBreakBoundary(strsrch
, mLimit
)) {
4032 if (! checkIdentical(strsrch
, mStart
, mLimit
)) {
4041 #ifdef USEARCH_DEBUG
4042 if (getenv("USEARCH_DEBUG") != NULL
) {
4043 printf("Target CEs [%d .. %d]\n", ceb
.firstIx
, ceb
.limitIx
);
4044 int32_t lastToPrint
= ceb
.limitIx
+2;
4045 for (int ii
=ceb
.firstIx
; ii
<lastToPrint
; ii
++) {
4046 printf("%8x@%d ", ceb
.get(ii
)->ce
, ceb
.get(ii
)->srcIndex
);
4048 printf("\n%s\n", found
? "match found" : "no match");
4052 // All Done. Store back the match bounds to the caller.
4059 if (matchStart
!= NULL
) {
4060 *matchStart
= mStart
;
4063 if (matchLimit
!= NULL
) {
4064 *matchLimit
= mLimit
;
4070 U_CAPI UBool U_EXPORT2
usearch_searchBackwards(UStringSearch
*strsrch
,
4072 int32_t *matchStart
,
4073 int32_t *matchLimit
,
4076 if (U_FAILURE(*status
)) {
4080 // TODO: reject search patterns beginning with a combining char.
4082 #ifdef USEARCH_DEBUG
4083 if (getenv("USEARCH_DEBUG") != NULL
) {
4084 printf("Pattern CEs\n");
4085 for (int ii
=0; ii
<strsrch
->pattern
.CELength
; ii
++) {
4086 printf(" %8x", strsrch
->pattern
.CE
[ii
]);
4092 // Input parameter sanity check.
4093 // TODO: should input indicies clip to the text length
4094 // in the same way that UText does.
4095 if(strsrch
->pattern
.CELength
== 0 ||
4097 startIdx
> strsrch
->search
->textLength
||
4098 strsrch
->pattern
.CE
== NULL
) {
4099 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
4103 if (strsrch
->pattern
.PCE
== NULL
) {
4104 initializePatternPCETable(strsrch
, status
);
4107 CEBuffer
ceb(strsrch
, status
);
4108 int32_t targetIx
= 0;
4111 * Pre-load the buffer with the CE's for the grapheme
4112 * after our starting position so that we're sure that
4113 * we can look at the CE following the match when we
4114 * check the match boundaries.
4116 * This will also pre-fetch the first CE that we'll
4117 * consider for the match.
4119 if (startIdx
< strsrch
->search
->textLength
) {
4120 UBreakIterator
*bi
= strsrch
->search
->internalBreakIter
;
4121 int32_t next
= ubrk_following(bi
, startIdx
);
4123 ucol_setOffset(strsrch
->textIter
, next
, status
);
4125 for (targetIx
= 0; ; targetIx
+= 1) {
4126 if (ceb
.getPrevious(targetIx
)->lowIndex
< startIdx
) {
4131 ucol_setOffset(strsrch
->textIter
, startIdx
, status
);
4135 const CEI
*targetCEI
= NULL
;
4139 int32_t limitIx
= targetIx
;
4140 int32_t mStart
= -1;
4141 int32_t mLimit
= -1;
4147 // Outer loop moves over match starting positions in the
4149 // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
4150 // But patIx is 0 at the beginning of the pattern and increases toward the end.
4151 // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
4152 // and the beginning of the base text.
4153 for(targetIx
= limitIx
; ; targetIx
+= 1)
4156 // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
4157 // (compared to the last CE fetched for the previous targetIx value) as we need to go
4158 // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
4159 const CEI
*lastCEI
= ceb
.getPrevious(targetIx
);
4160 if (lastCEI
== NULL
) {
4161 *status
= U_INTERNAL_PROGRAM_ERROR
;
4165 // Inner loop checks for a match beginning at each
4166 // position from the outer loop.
4167 int32_t targetIxOffset
= 0;
4168 for (patIx
= strsrch
->pattern
.PCELength
- 1; patIx
>= 0; patIx
-= 1) {
4169 int64_t patCE
= strsrch
->pattern
.PCE
[patIx
];
4171 targetCEI
= ceb
.getPrevious(targetIx
+ strsrch
->pattern
.PCELength
- 1 - patIx
+ targetIxOffset
);
4172 // Compare CE from target string with CE from the pattern.
4173 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
4174 // which will fail the compare, below.
4175 UCompareCEsResult ceMatch
= compareCE64s(targetCEI
->ce
, patCE
, strsrch
->search
->elementComparisonType
);
4176 if ( ceMatch
== U_CE_NO_MATCH
) {
4179 } else if ( ceMatch
> U_CE_NO_MATCH
) {
4180 if ( ceMatch
== U_CE_SKIP_TARG
) {
4181 // redo with same patCE, next targCE
4184 } else { // ceMatch == U_CE_SKIP_PATN
4185 // redo with same targCE, next patCE
4191 if (!found
&& ((targetCEI
== NULL
) || (targetCEI
->ce
!= UCOL_PROCESSED_NULLORDER
))) {
4192 // No match at this targetIx. Try again at the next.
4197 // No match at all, we have run off the end of the target text.
4202 // We have found a match in CE space.
4203 // Now determine the bounds in string index space.
4204 // There still is a chance of match failure if the CE range not correspond to
4205 // an acceptable character range.
4207 const CEI
*firstCEI
= ceb
.getPrevious(targetIx
+ strsrch
->pattern
.PCELength
- 1 + targetIxOffset
);
4208 mStart
= firstCEI
->lowIndex
;
4210 // Check for the start of the match being within a combining sequence.
4211 // This can happen if the pattern itself begins with a combining char, and
4212 // the match found combining marks in the target text that were attached
4213 // to something else.
4214 // This type of match should be rejected for not completely consuming a
4215 // combining sequence.
4216 if (!isBreakBoundary(strsrch
, mStart
)) {
4220 // Look at the high index of the first CE in the match. If it's the same as the
4221 // low index, the first CE in the match is in the middle of an expansion.
4222 if (mStart
== firstCEI
->highIndex
) {
4227 minLimit
= lastCEI
->lowIndex
;
4230 // Look at the CE following the match. If it is UCOL_NULLORDER the match
4231 // extended to the end of input, and the match is good.
4233 // Look at the high and low indices of the CE following the match. If
4234 // they are the same it means one of two things:
4235 // 1. The match extended to the last CE from the target text, which is OK, or
4236 // 2. The last CE that was part of the match is in an expansion that extends
4237 // to the first CE after the match. In this case, we reject the match.
4238 const CEI
*nextCEI
= ceb
.getPrevious(targetIx
- 1);
4240 if (nextCEI
->lowIndex
== nextCEI
->highIndex
&& nextCEI
->ce
!= UCOL_PROCESSED_NULLORDER
) {
4244 mLimit
= maxLimit
= nextCEI
->lowIndex
;
4246 // Advance the match end position to the first acceptable match boundary.
4247 // This advances the index over any combining charcters.
4248 if (minLimit
< maxLimit
) {
4249 int32_t nba
= nextBoundaryAfter(strsrch
, minLimit
);
4251 if (nba
>= lastCEI
->highIndex
) {
4256 // If advancing to the end of a combining sequence in character indexing space
4257 // advanced us beyond the end of the match in CE space, reject this match.
4258 if (mLimit
> maxLimit
) {
4262 // Make sure the end of the match is on a break boundary
4263 if (!isBreakBoundary(strsrch
, mLimit
)) {
4268 // No non-ignorable CEs after this point.
4269 // The maximum position is detected by boundary after
4270 // the last non-ignorable CE. Combining sequence
4271 // across the start index will be truncated.
4272 int32_t nba
= nextBoundaryAfter(strsrch
, minLimit
);
4273 mLimit
= maxLimit
= (nba
> 0) && (startIdx
> nba
) ? nba
: startIdx
;
4276 #ifdef USEARCH_DEBUG
4277 if (getenv("USEARCH_DEBUG") != NULL
) {
4278 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit
, maxLimit
, mLimit
);
4283 if (! checkIdentical(strsrch
, mStart
, mLimit
)) {
4292 #ifdef USEARCH_DEBUG
4293 if (getenv("USEARCH_DEBUG") != NULL
) {
4294 printf("Target CEs [%d .. %d]\n", ceb
.firstIx
, ceb
.limitIx
);
4295 int32_t lastToPrint
= ceb
.limitIx
+2;
4296 for (int ii
=ceb
.firstIx
; ii
<lastToPrint
; ii
++) {
4297 printf("%8x@%d ", ceb
.get(ii
)->ce
, ceb
.get(ii
)->srcIndex
);
4299 printf("\n%s\n", found
? "match found" : "no match");
4303 // All Done. Store back the match bounds to the caller.
4310 if (matchStart
!= NULL
) {
4311 *matchStart
= mStart
;
4314 if (matchLimit
!= NULL
) {
4315 *matchLimit
= mLimit
;
4321 // internal use methods declared in usrchimp.h -----------------------------
4323 UBool
usearch_handleNextExact(UStringSearch
*strsrch
, UErrorCode
*status
)
4325 if (U_FAILURE(*status
)) {
4326 setMatchNotFound(strsrch
);
4331 UCollationElements
*coleiter
= strsrch
->textIter
;
4332 int32_t textlength
= strsrch
->search
->textLength
;
4333 int32_t *patternce
= strsrch
->pattern
.CE
;
4334 int32_t patterncelength
= strsrch
->pattern
.CELength
;
4335 int32_t textoffset
= ucol_getOffset(coleiter
);
4337 // status used in setting coleiter offset, since offset is checked in
4338 // shiftForward before setting the coleiter offset, status never
4340 textoffset
= shiftForward(strsrch
, textoffset
, UCOL_NULLORDER
,
4342 while (textoffset
<= textlength
)
4344 uint32_t patternceindex
= patterncelength
- 1;
4346 UBool found
= FALSE
;
4347 int32_t lastce
= UCOL_NULLORDER
;
4349 setColEIterOffset(coleiter
, textoffset
);
4352 // finding the last pattern ce match, imagine composite characters
4353 // for example: search for pattern A in text \u00C0
4354 // we'll have to skip \u0300 the grave first before we get to A
4355 targetce
= ucol_previous(coleiter
, status
);
4356 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4360 targetce
= getCE(strsrch
, targetce
);
4361 if (targetce
== UCOL_IGNORABLE
&& inNormBuf(coleiter
)) {
4362 // this is for the text \u0315\u0300 that requires
4363 // normalization and pattern \u0300, where \u0315 is ignorable
4366 if (lastce
== UCOL_NULLORDER
|| lastce
== UCOL_IGNORABLE
) {
4369 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4370 if (targetce
== patternce
[patternceindex
]) {
4371 // the first ce can be a contraction
4375 if (!hasExpansion(coleiter
)) {
4381 //targetce = lastce;
4383 while (found
&& patternceindex
> 0) {
4385 targetce
= ucol_previous(coleiter
, status
);
4386 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4390 targetce
= getCE(strsrch
, targetce
);
4391 if (targetce
== UCOL_IGNORABLE
) {
4396 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4397 found
= found
&& targetce
== patternce
[patternceindex
];
4403 if (U_FAILURE(*status
)) {
4406 textoffset
= shiftForward(strsrch
, textoffset
, lastce
,
4408 // status checked at loop.
4409 patternceindex
= patterncelength
;
4413 if (checkNextExactMatch(strsrch
, &textoffset
, status
)) {
4414 // status checked in ucol_setOffset
4415 setColEIterOffset(coleiter
, strsrch
->search
->matchedIndex
);
4419 setMatchNotFound(strsrch
);
4422 int32_t textOffset
= ucol_getOffset(strsrch
->textIter
);
4426 if (usearch_search(strsrch
, textOffset
, &start
, &end
, status
)) {
4427 strsrch
->search
->matchedIndex
= start
;
4428 strsrch
->search
->matchedLength
= end
- start
;
4431 setMatchNotFound(strsrch
);
4437 UBool
usearch_handleNextCanonical(UStringSearch
*strsrch
, UErrorCode
*status
)
4439 if (U_FAILURE(*status
)) {
4440 setMatchNotFound(strsrch
);
4445 UCollationElements
*coleiter
= strsrch
->textIter
;
4446 int32_t textlength
= strsrch
->search
->textLength
;
4447 int32_t *patternce
= strsrch
->pattern
.CE
;
4448 int32_t patterncelength
= strsrch
->pattern
.CELength
;
4449 int32_t textoffset
= ucol_getOffset(coleiter
);
4450 UBool hasPatternAccents
=
4451 strsrch
->pattern
.hasSuffixAccents
|| strsrch
->pattern
.hasPrefixAccents
;
4453 textoffset
= shiftForward(strsrch
, textoffset
, UCOL_NULLORDER
,
4455 strsrch
->canonicalPrefixAccents
[0] = 0;
4456 strsrch
->canonicalSuffixAccents
[0] = 0;
4458 while (textoffset
<= textlength
)
4460 int32_t patternceindex
= patterncelength
- 1;
4462 UBool found
= FALSE
;
4463 int32_t lastce
= UCOL_NULLORDER
;
4465 setColEIterOffset(coleiter
, textoffset
);
4468 // finding the last pattern ce match, imagine composite characters
4469 // for example: search for pattern A in text \u00C0
4470 // we'll have to skip \u0300 the grave first before we get to A
4471 targetce
= ucol_previous(coleiter
, status
);
4472 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4476 targetce
= getCE(strsrch
, targetce
);
4477 if (lastce
== UCOL_NULLORDER
|| lastce
== UCOL_IGNORABLE
) {
4480 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4481 if (targetce
== patternce
[patternceindex
]) {
4482 // the first ce can be a contraction
4486 if (!hasExpansion(coleiter
)) {
4492 while (found
&& patternceindex
> 0) {
4493 targetce
= ucol_previous(coleiter
, status
);
4494 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4498 targetce
= getCE(strsrch
, targetce
);
4499 if (targetce
== UCOL_IGNORABLE
) {
4504 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4505 found
= found
&& targetce
== patternce
[patternceindex
];
4508 // initializing the rearranged accent array
4509 if (hasPatternAccents
&& !found
) {
4510 strsrch
->canonicalPrefixAccents
[0] = 0;
4511 strsrch
->canonicalSuffixAccents
[0] = 0;
4512 if (U_FAILURE(*status
)) {
4515 found
= doNextCanonicalMatch(strsrch
, textoffset
, status
);
4519 if (U_FAILURE(*status
)) {
4522 textoffset
= shiftForward(strsrch
, textoffset
, lastce
,
4524 // status checked at loop
4525 patternceindex
= patterncelength
;
4529 if (checkNextCanonicalMatch(strsrch
, &textoffset
, status
)) {
4530 setColEIterOffset(coleiter
, strsrch
->search
->matchedIndex
);
4534 setMatchNotFound(strsrch
);
4537 int32_t textOffset
= ucol_getOffset(strsrch
->textIter
);
4541 if (usearch_search(strsrch
, textOffset
, &start
, &end
, status
)) {
4542 strsrch
->search
->matchedIndex
= start
;
4543 strsrch
->search
->matchedLength
= end
- start
;
4546 setMatchNotFound(strsrch
);
4552 UBool
usearch_handlePreviousExact(UStringSearch
*strsrch
, UErrorCode
*status
)
4554 if (U_FAILURE(*status
)) {
4555 setMatchNotFound(strsrch
);
4560 UCollationElements
*coleiter
= strsrch
->textIter
;
4561 int32_t *patternce
= strsrch
->pattern
.CE
;
4562 int32_t patterncelength
= strsrch
->pattern
.CELength
;
4563 int32_t textoffset
= ucol_getOffset(coleiter
);
4565 // shifting it check for setting offset
4566 // if setOffset is called previously or there was no previous match, we
4567 // leave the offset as it is.
4568 if (strsrch
->search
->matchedIndex
!= USEARCH_DONE
) {
4569 textoffset
= strsrch
->search
->matchedIndex
;
4572 textoffset
= reverseShift(strsrch
, textoffset
, UCOL_NULLORDER
,
4575 while (textoffset
>= 0)
4577 int32_t patternceindex
= 1;
4579 UBool found
= FALSE
;
4580 int32_t firstce
= UCOL_NULLORDER
;
4582 // if status is a failure, ucol_setOffset does nothing
4583 setColEIterOffset(coleiter
, textoffset
);
4586 // finding the first pattern ce match, imagine composite
4587 // characters. for example: search for pattern \u0300 in text
4588 // \u00C0, we'll have to skip A first before we get to
4589 // \u0300 the grave accent
4590 targetce
= ucol_next(coleiter
, status
);
4591 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4595 targetce
= getCE(strsrch
, targetce
);
4596 if (firstce
== UCOL_NULLORDER
|| firstce
== UCOL_IGNORABLE
) {
4599 if (targetce
== UCOL_IGNORABLE
&& strsrch
->strength
!= UCOL_PRIMARY
) {
4602 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4603 if (targetce
== patternce
[0]) {
4607 if (!hasExpansion(coleiter
)) {
4608 // checking for accents in composite character
4614 //targetce = firstce;
4616 while (found
&& (patternceindex
< patterncelength
)) {
4618 targetce
= ucol_next(coleiter
, status
);
4619 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4623 targetce
= getCE(strsrch
, targetce
);
4624 if (targetce
== UCOL_IGNORABLE
) {
4628 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4629 found
= found
&& targetce
== patternce
[patternceindex
];
4636 if (U_FAILURE(*status
)) {
4640 textoffset
= reverseShift(strsrch
, textoffset
, targetce
,
4646 if (checkPreviousExactMatch(strsrch
, &textoffset
, status
)) {
4647 setColEIterOffset(coleiter
, textoffset
);
4651 setMatchNotFound(strsrch
);
4656 if (strsrch
->search
->isOverlap
) {
4657 if (strsrch
->search
->matchedIndex
!= USEARCH_DONE
) {
4658 textOffset
= strsrch
->search
->matchedIndex
+ strsrch
->search
->matchedLength
- 1;
4660 // move the start position at the end of possible match
4661 initializePatternPCETable(strsrch
, status
);
4662 for (int32_t nPCEs
= 0; nPCEs
< strsrch
->pattern
.PCELength
- 1; nPCEs
++) {
4663 int64_t pce
= ucol_nextProcessed(strsrch
->textIter
, NULL
, NULL
, status
);
4664 if (pce
== UCOL_PROCESSED_NULLORDER
) {
4665 // at the end of the text
4669 if (U_FAILURE(*status
)) {
4670 setMatchNotFound(strsrch
);
4673 textOffset
= ucol_getOffset(strsrch
->textIter
);
4676 textOffset
= ucol_getOffset(strsrch
->textIter
);
4682 if (usearch_searchBackwards(strsrch
, textOffset
, &start
, &end
, status
)) {
4683 strsrch
->search
->matchedIndex
= start
;
4684 strsrch
->search
->matchedLength
= end
- start
;
4687 setMatchNotFound(strsrch
);
4693 UBool
usearch_handlePreviousCanonical(UStringSearch
*strsrch
,
4696 if (U_FAILURE(*status
)) {
4697 setMatchNotFound(strsrch
);
4702 UCollationElements
*coleiter
= strsrch
->textIter
;
4703 int32_t *patternce
= strsrch
->pattern
.CE
;
4704 int32_t patterncelength
= strsrch
->pattern
.CELength
;
4705 int32_t textoffset
= ucol_getOffset(coleiter
);
4706 UBool hasPatternAccents
=
4707 strsrch
->pattern
.hasSuffixAccents
|| strsrch
->pattern
.hasPrefixAccents
;
4709 // shifting it check for setting offset
4710 // if setOffset is called previously or there was no previous match, we
4711 // leave the offset as it is.
4712 if (strsrch
->search
->matchedIndex
!= USEARCH_DONE
) {
4713 textoffset
= strsrch
->search
->matchedIndex
;
4716 textoffset
= reverseShift(strsrch
, textoffset
, UCOL_NULLORDER
,
4718 strsrch
->canonicalPrefixAccents
[0] = 0;
4719 strsrch
->canonicalSuffixAccents
[0] = 0;
4721 while (textoffset
>= 0)
4723 int32_t patternceindex
= 1;
4725 UBool found
= FALSE
;
4726 int32_t firstce
= UCOL_NULLORDER
;
4728 setColEIterOffset(coleiter
, textoffset
);
4730 // finding the first pattern ce match, imagine composite
4731 // characters. for example: search for pattern \u0300 in text
4732 // \u00C0, we'll have to skip A first before we get to
4733 // \u0300 the grave accent
4734 targetce
= ucol_next(coleiter
, status
);
4735 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4739 targetce
= getCE(strsrch
, targetce
);
4740 if (firstce
== UCOL_NULLORDER
|| firstce
== UCOL_IGNORABLE
) {
4744 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4745 if (targetce
== patternce
[0]) {
4746 // the first ce can be a contraction
4750 if (!hasExpansion(coleiter
)) {
4751 // checking for accents in composite character
4759 while (found
&& patternceindex
< patterncelength
) {
4760 targetce
= ucol_next(coleiter
, status
);
4761 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4765 targetce
= getCE(strsrch
, targetce
);
4766 if (targetce
== UCOL_IGNORABLE
) {
4770 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4771 found
= found
&& targetce
== patternce
[patternceindex
];
4775 // initializing the rearranged accent array
4776 if (hasPatternAccents
&& !found
) {
4777 strsrch
->canonicalPrefixAccents
[0] = 0;
4778 strsrch
->canonicalSuffixAccents
[0] = 0;
4779 if (U_FAILURE(*status
)) {
4782 found
= doPreviousCanonicalMatch(strsrch
, textoffset
, status
);
4786 if (U_FAILURE(*status
)) {
4789 textoffset
= reverseShift(strsrch
, textoffset
, targetce
,
4795 if (checkPreviousCanonicalMatch(strsrch
, &textoffset
, status
)) {
4796 setColEIterOffset(coleiter
, textoffset
);
4800 setMatchNotFound(strsrch
);
4805 if (strsrch
->search
->isOverlap
) {
4806 if (strsrch
->search
->matchedIndex
!= USEARCH_DONE
) {
4807 textOffset
= strsrch
->search
->matchedIndex
+ strsrch
->search
->matchedLength
- 1;
4809 // move the start position at the end of possible match
4810 initializePatternPCETable(strsrch
, status
);
4811 for (int32_t nPCEs
= 0; nPCEs
< strsrch
->pattern
.PCELength
- 1; nPCEs
++) {
4812 int64_t pce
= ucol_nextProcessed(strsrch
->textIter
, NULL
, NULL
, status
);
4813 if (pce
== UCOL_PROCESSED_NULLORDER
) {
4814 // at the end of the text
4818 if (U_FAILURE(*status
)) {
4819 setMatchNotFound(strsrch
);
4822 textOffset
= ucol_getOffset(strsrch
->textIter
);
4825 textOffset
= ucol_getOffset(strsrch
->textIter
);
4831 if (usearch_searchBackwards(strsrch
, textOffset
, &start
, &end
, status
)) {
4832 strsrch
->search
->matchedIndex
= start
;
4833 strsrch
->search
->matchedLength
= end
- start
;
4836 setMatchNotFound(strsrch
);
4842 #endif /* #if !UCONFIG_NO_COLLATION */